# Tutorial 1 The Titanic Disaster

## Project Summary
Predict which passengers survived the Titanic shipwreck.

## Importing the Libraries

In [1]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt

## Importing the Dataset

In [2]:
dataset = pd.read_csv('../data/Titanic.csv')

## Showing the Dataset in a Table

In [3]:
pd.DataFrame(dataset)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.2500,S,No
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,Yes
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.9250,S,Yes
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1000,S,Yes
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,8.0500,S,No
...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,13.0000,S,No
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,30.0000,S,Yes
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,23.4500,S,No
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,30.0000,C,Yes


## A Quick Review of the Data

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   PassengerId                        891 non-null    int64  
 1   Pclass                             891 non-null    int64  
 2   Name                               891 non-null    object 
 3   Sex                                891 non-null    object 
 4   Age                                714 non-null    float64
 5   Number of Siblings/Spouses Aboard  891 non-null    int64  
 6   Number of Parents/Children Aboard  891 non-null    int64  
 7   Fare                               891 non-null    float64
 8   Embarked                           889 non-null    object 
 9   Survived                           891 non-null    object 
dtypes: float64(2), int64(4), object(4)
memory usage: 69.7+ KB


## Encoding Categorical Data

### Encoding Data Inputs

In [5]:
sexes = {'male': 0.0, 'female': 1.0}
dataset['Sex'] = dataset['Sex'].map(sexes)
pd.DataFrame(dataset['Sex'])

Unnamed: 0,Sex
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0
...,...
886,0.0
887,1.0
888,1.0
889,0.0


In [6]:
ports = {'S': 0.0, 'C': 1.0, 'Q': 2.0}
dataset['Embarked'] = dataset['Embarked'].map(ports)
pd.DataFrame(dataset['Embarked'])

Unnamed: 0,Embarked
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
886,0.0
887,0.0
888,0.0
889,1.0


### Encoding Data Outputs (Labels)

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dataset['Survived'] = le.fit_transform(dataset['Survived']) # Fit label encoder and return encoded labels.
pd.DataFrame(dataset['Survived'])

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


## Dropping Irrelevant Data Inputs

In [8]:
dataset = dataset.drop(['PassengerId'], axis=1)
dataset = dataset.drop(['Name'], axis=1)

## Checking the Current Dataset

In [9]:
pd.DataFrame(dataset)

Unnamed: 0,Pclass,Sex,Age,Number of Siblings/Spouses Aboard,Number of Parents/Children Aboard,Fare,Embarked,Survived
0,3,0.0,22.0,1,0,7.2500,0.0,0
1,1,1.0,38.0,1,0,71.2833,1.0,1
2,3,1.0,26.0,0,0,7.9250,0.0,1
3,1,1.0,35.0,1,0,53.1000,0.0,1
4,3,0.0,35.0,0,0,8.0500,0.0,0
...,...,...,...,...,...,...,...,...
886,2,0.0,27.0,0,0,13.0000,0.0,0
887,1,1.0,19.0,0,0,30.0000,0.0,1
888,3,1.0,,1,2,23.4500,0.0,0
889,1,0.0,26.0,0,0,30.0000,1.0,1


## Seperate The Input and Output

In [10]:
dataset.info()
X = dataset.iloc[:, 0:9].values
Y = dataset.iloc[:, -1].values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Pclass                             891 non-null    int64  
 1   Sex                                891 non-null    float64
 2   Age                                714 non-null    float64
 3   Number of Siblings/Spouses Aboard  891 non-null    int64  
 4   Number of Parents/Children Aboard  891 non-null    int64  
 5   Fare                               891 non-null    float64
 6   Embarked                           889 non-null    float64
 7   Survived                           891 non-null    int64  
dtypes: float64(4), int64(4)
memory usage: 55.8 KB


## Showing the Input Data in a Table format

In [11]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.0,0.0,22.0,1.0,0.0,7.2500,0.0,0.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,1.0
2,3.0,1.0,26.0,0.0,0.0,7.9250,0.0,1.0
3,1.0,1.0,35.0,1.0,0.0,53.1000,0.0,1.0
4,3.0,0.0,35.0,0.0,0.0,8.0500,0.0,0.0
...,...,...,...,...,...,...,...,...
886,2.0,0.0,27.0,0.0,0.0,13.0000,0.0,0.0
887,1.0,1.0,19.0,0.0,0.0,30.0000,0.0,1.0
888,3.0,1.0,,1.0,2.0,23.4500,0.0,0.0
889,1.0,0.0,26.0,0.0,0.0,30.0000,1.0,1.0


## A Quick Check of the Output Data

In [12]:
pd.DataFrame(Y)

Unnamed: 0,0
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


## Taking Care of Missing Data Inputs

In [13]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

The imputation strategy.
- If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.
- If “median”, then replace missing values using the median along each column. Can only be used with numeric data.
- If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.

In [14]:
imputer.fit(X) # Fit the imputer on X.
X = imputer.transform(X) # Impute all missing values in X.

## Feature Scaling

In [15]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.0,0.0,22.000000,1.0,0.0,7.2500,0.0,0.0
1,1.0,1.0,38.000000,1.0,0.0,71.2833,1.0,1.0
2,3.0,1.0,26.000000,0.0,0.0,7.9250,0.0,1.0
3,1.0,1.0,35.000000,1.0,0.0,53.1000,0.0,1.0
4,3.0,0.0,35.000000,0.0,0.0,8.0500,0.0,0.0
...,...,...,...,...,...,...,...,...
886,2.0,0.0,27.000000,0.0,0.0,13.0000,0.0,0.0
887,1.0,1.0,19.000000,0.0,0.0,30.0000,0.0,1.0
888,3.0,1.0,29.699118,1.0,2.0,23.4500,0.0,0.0
889,1.0,0.0,26.000000,0.0,0.0,30.0000,1.0,1.0


In [16]:
# Scale the age and fare
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[:, [2,5]] = sc.fit_transform(X[:, [2,5]])
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.0,0.0,-0.592481,1.0,0.0,-0.502445,0.0,0.0
1,1.0,1.0,0.638789,1.0,0.0,0.786845,1.0,1.0
2,3.0,1.0,-0.284663,0.0,0.0,-0.488854,0.0,1.0
3,1.0,1.0,0.407926,1.0,0.0,0.420730,0.0,1.0
4,3.0,0.0,0.407926,0.0,0.0,-0.486337,0.0,0.0
...,...,...,...,...,...,...,...,...
886,2.0,0.0,-0.207709,0.0,0.0,-0.386671,0.0,0.0
887,1.0,1.0,-0.823344,0.0,0.0,-0.044381,0.0,1.0
888,3.0,1.0,0.000000,1.0,2.0,-0.176263,0.0,0.0
889,1.0,0.0,-0.284663,0.0,0.0,-0.044381,1.0,1.0


## Splitting the Dataset into the Training set and Test set

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)

random_state: int, RandomState instance or None, default=None
Controls the shuffling applied to the data before applying the split. Pass an int for reproducible output across multiple function calls.

In [18]:
print(X_train)

[[ 3.          0.          0.         ... -0.18029018  2.
   1.        ]
 [ 1.          1.          0.02315421 ...  0.49782998  1.
   1.        ]
 [ 2.          1.          0.33097161 ... -0.43700744  0.
   1.        ]
 ...
 [ 2.          0.         -0.66943495 ...  0.83147785  0.
   0.        ]
 [ 3.          1.          0.         ... -0.49640477  0.
   0.        ]
 [ 3.          0.         -0.66943495 ... -0.48633742  0.
   0.        ]]


In [19]:
print(Y_train)

[1 1 1 0 1 1 0 0 1 0 1 0 0 1 1 0 1 0 1 0 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 0
 0 1 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 1 1 1 0 1 0 1 0
 1 1 0 0 1 0 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 1 1 1
 1 1 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1 1 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1
 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 1 1 1 1 0 0
 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 1 0
 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0
 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0
 1 1 1 0 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 1 1 1 0 0 1 1
 0 1 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 1 0
 1 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 1
 0 1 1 1 0 1 1 1 0 1 0 1 

In [20]:
print(X_test)

[[ 1.          1.          1.40833252 ... -0.12634532  0.
   1.        ]
 [ 3.          0.          0.         ... -0.48944219  0.
   0.        ]
 [ 2.          1.         -0.97725235 ... -0.43700744  0.
   1.        ]
 ...
 [ 3.          0.          0.         ... -0.49271408  2.
   0.        ]
 [ 3.          0.         -0.66943495 ... -0.49271408  2.
   0.        ]
 [ 3.          1.         -1.90070456 ... -0.26066181  1.
   1.        ]]


In [21]:
print(Y_test)

[1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 1
 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1
 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 1 0 0 0
 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 0 1 0 1 0 0 0 1]


## Train and Test Predictive Models

In [22]:
# Support vector machine
from sklearn.svm import SVC
sv_classfier = SVC(kernel = 'rbf')
sv_classfier.fit(X_train, Y_train)
Y_pred = sv_classfier.predict(X_test)
print('Error rate = ', np.abs(np.sum(Y_pred)-np.sum(Y_test))*100/np.sum(Y_test), '%')

Error rate =  0.0 %


In [23]:
from sklearn.linear_model import LogisticRegression
# Linear regression
lr = LogisticRegression()
lr.fit(X_train, Y_train)
Y_pred = lr.predict(X_test)
print(Y_pred)
print('Error rate = ', np.abs(np.sum(Y_pred)-np.sum(Y_test))*100/np.sum(Y_test), '%')

[1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0 0 0 1 1
 0 1 0 1 1 1 0 1 0 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1
 0 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 0 1 0 1 0 0 0
 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 0 1 0 1 0 0 0 1]
Error rate =  0.0 %
