## Load and Verify the Data

In [1]:
import pandas as pd
# Read the passenger records from titanic.csv (path is relative to the working directory).
titanic = pd.read_csv('titanic.csv')

In [2]:
# Preview the first rows to confirm the file loaded with the expected columns.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Understand the Features

###### PassengerId: A unique index for each passenger.
###### Survived: Shows if the passenger survived or not (1 = survived, 0 = did not survive).
###### Pclass: Passenger Class (1 = First class ticket, 2 = Second class ticket, 3 = Third class ticket).
###### Name: The passenger's name, which also contains the title (Mr., Mrs., Doctor, et cetera).
###### Sex: The passenger's gender (male or female).
###### Age: The passenger's age (or NaN if the age was not recorded).
###### SibSp: The number of siblings and spouses traveling with the passenger.
###### Parch: The number of parents and children traveling with the passenger.
###### Ticket: The passenger's ticket number.
###### Fare: The price of the passenger's ticket.
###### Cabin: The passenger's cabin number (or NaN if the cabin number was not recorded).
###### Embarked: The port where the passenger boarded (C = Cherbourg, Q = Queenstown, S = Southampton).

## Perform a Few Basic Inspections

In [3]:
# (rows, columns) of the loaded table.
titanic.shape

(1309, 12)

In [4]:
# Column dtypes and non-null counts; columns with fewer non-null entries than rows have gaps.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   int64  
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


In [6]:
# Number of missing values in each column.
titanic.isnull().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

## Impute Missing Data

###### Fill in the missing Age and Fare values (both float64 data types) by imputing the median value.

In [7]:
# Replace missing entries in each numeric column with that column's median.
# NOTE(review): the medians are taken over the whole dataset, before the
# train/test split, and Fare is dropped again in the next cell — confirm both
# are intended.
for column in ('Age', 'Fare'):
    titanic[column] = titanic[column].fillna(titanic[column].median())

## Drop Unnecessary Columns

###### Given Pclass, Fare and Cabin are both redundant.
###### PassengerId, Name, and Ticket are not connected to survivability.

In [8]:
# Remove the columns judged redundant or unrelated to survival.
# `columns=` takes the labels directly, so there is no need to build a
# throwaway sub-DataFrame just to hand its column names to drop().
titanic = titanic.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])

## Drop Unnecessary Rows

###### Drop rows where Embarked contains no values (0.15% of the data)

In [9]:
# Discard the rows with no recorded port of embarkation.
# dropna(subset=...) states the intent directly and keeps the original row
# labels, just like dropping by the index of the null rows did.
titanic = titanic.dropna(subset=['Embarked'])

In [10]:
# Re-count missing values after imputing and dropping; every column should now be 0.
titanic.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [11]:
# Re-check the remaining columns and dtypes; the object columns still need encoding.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 0 to 1308
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  1307 non-null   int64  
 1   Pclass    1307 non-null   int64  
 2   Sex       1307 non-null   object 
 3   Age       1307 non-null   float64
 4   SibSp     1307 non-null   int64  
 5   Parch     1307 non-null   int64  
 6   Embarked  1307 non-null   object 
dtypes: float64(1), int64(4), object(2)
memory usage: 81.7+ KB


## Map the Object Types

###### First, obtain a list of unique values

In [12]:
# List the distinct Sex values so the mapping below can cover all of them.
titanic['Sex'].unique()

array(['male', 'female'], dtype=object)

In [13]:
# List the distinct Embarked values so the mapping below can cover all of them.
titanic['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

###### Second, perform the actual mapping

In [14]:
# Encode the two remaining text columns as integers so the model can use them.
# Any value missing from a lookup table would become NaN, which is why the
# unique values were listed first.
# NOTE(review): Embarked is a nominal category; the 0/1/2 codes imply an
# ordering between ports — consider one-hot encoding if that matters.
sex_codes = {'male': 0, 'female': 1}
port_codes = {'C': 0, 'Q': 1, 'S': 2}
titanic['Sex'] = titanic['Sex'].map(sex_codes)
titanic['Embarked'] = titanic['Embarked'].map(port_codes)

In [15]:
# Preview the first rows to confirm Sex and Embarked are now numeric codes.
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,0,22.0,1,0,2
1,1,1,1,38.0,1,0,0
2,1,3,1,26.0,0,0,2
3,1,1,1,35.0,1,0,2
4,0,3,0,35.0,0,0,2


## Separate the Features from the Targets

In [16]:
# The label is the Survived column; every other column is a model input.
targets = titanic['Survived']
features = titanic.drop(columns=['Survived'])

## Split into Training and Testing

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=17,
)

## Create a Logistic Regression Model

In [19]:
from sklearn.linear_model import LogisticRegression
# Instantiate the classifier with no arguments, i.e. the library's default settings.
model = LogisticRegression()

## Train the Model

In [20]:
# Fit on the training split only; the estimator itself is echoed as the cell output.
model.fit(x_train, y_train)

LogisticRegression()

In [21]:
# Predict Survived labels for the held-out test rows.
prediction = model.predict(x_test)

## Evaluate the Model Performance

In [22]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out test set.
report = classification_report(y_test, prediction)
print(report)

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       157
           1       0.85      0.80      0.82       105

    accuracy                           0.86       262
   macro avg       0.86      0.85      0.86       262
weighted avg       0.86      0.86      0.86       262

