# Data Preprocesing using Titanic Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
titanic = pd.read_csv('titanic.csv')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Understanding the variables
Independent variables: Pclass, Sex, Age, SibSp, Fare./n
Dependent variables: Survived./n
For the sake of simplicity, I will use pclass, sex, age and fare to try and see if they contributed to one surviving

In [33]:
X = titanic.loc[:, ["Sex", "Age", "Fare"]].values
y = titanic.loc[:, "Survived"].values

In [35]:
print(X)

[['male' 34.5 7.8292]
 ['female' 47.0 7.0]
 ['male' 62.0 9.6875]
 ...
 ['male' 38.5 7.25]
 ['male' nan 8.05]
 ['male' nan 22.3583]]


In [36]:
print(y)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [25]:
X.isna

<bound method DataFrame.isna of         Sex   Age      Fare
0      male  34.5    7.8292
1    female  47.0    7.0000
2      male  62.0    9.6875
3      male  27.0    8.6625
4    female  22.0   12.2875
..      ...   ...       ...
413    male   NaN    8.0500
414  female  39.0  108.9000
415    male  38.5    7.2500
416    male   NaN    8.0500
417    male   NaN   22.3583

[418 rows x 3 columns]>

In [26]:
print(y.isna)

<bound method Series.isna of 0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64>


In [27]:
X.describe

<bound method NDFrame.describe of         Sex   Age      Fare
0      male  34.5    7.8292
1    female  47.0    7.0000
2      male  62.0    9.6875
3      male  27.0    8.6625
4    female  22.0   12.2875
..      ...   ...       ...
413    male   NaN    8.0500
414  female  39.0  108.9000
415    male  38.5    7.2500
416    male   NaN    8.0500
417    male   NaN   22.3583

[418 rows x 3 columns]>

## Handling Missing Observations

In [38]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:2])
X[:, 1:2] = imputer.transform(X[:, 1:2])

In [39]:
X

array([['male', 34.5, 7.8292],
       ['female', 47.0, 7.0],
       ['male', 62.0, 9.6875],
       ...,
       ['male', 38.5, 7.25],
       ['male', 30.272590361445783, 8.05],
       ['male', 30.272590361445783, 22.3583]], dtype=object)

# Encoding categorical data

## Encoding inddependent variable Sex

In [41]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])

In [42]:
X

array([[1, 34.5, 7.8292],
       [0, 47.0, 7.0],
       [1, 62.0, 9.6875],
       ...,
       [1, 38.5, 7.25],
       [1, 30.272590361445783, 8.05],
       [1, 30.272590361445783, 22.3583]], dtype=object)

In [44]:
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=0)

In [45]:
print(X_train)

[[0 76.0 78.85]
 [1 26.0 7.8792]
 [1 2.0 23.0]
 [1 30.272590361445783 7.8792]
 [1 30.272590361445783 26.0]
 [0 0.17 20.575]
 [1 24.0 9.325]
 [0 27.0 52.0]
 [0 55.0 27.7208]
 [1 8.0 32.5]
 [0 30.272590361445783 14.5]
 [0 26.0 13.775]
 [0 35.0 7.75]
 [0 18.0 7.8792]
 [1 39.0 29.7]
 [0 15.0 39.0]
 [0 10.0 46.9]
 [1 27.0 26.0]
 [1 46.0 79.2]
 [1 30.272590361445783 25.7417]
 [0 58.0 512.3292]
 [1 21.0 7.225]
 [1 30.272590361445783 51.8625]
 [1 27.0 7.225]
 [0 50.0 211.5]
 [1 50.0 26.0]
 [0 63.0 221.7792]
 [1 49.0 65.0]
 [1 26.0 7.775]
 [1 6.0 134.5]
 [1 21.0 7.775]
 [0 12.0 39.0]
 [1 30.272590361445783 6.4375]
 [0 38.0 7.2292]
 [0 12.0 15.75]
 [0 37.0 90.0]
 [0 36.0 31.6792]
 [1 21.0 6.4958]
 [1 26.5 7.225]
 [0 30.272590361445783 69.55]
 [1 22.0 7.7958]
 [0 30.272590361445783 15.5]
 [0 30.0 164.8667]
 [0 1.0 41.5792]
 [1 30.272590361445783 6.4375]
 [0 30.272590361445783 23.45]
 [0 18.0 60.0]
 [0 26.0 13.5]
 [1 30.0 57.75]
 [1 36.0 13.0]
 [1 29.0 7.925]
 [1 30.272590361445783 14.4542]
 [1 32

In [46]:
print(X_test)

[[1 14.5 69.55]
 [1 30.272590361445783 7.55]
 [0 53.0 27.4458]
 [1 30.272590361445783 7.75]
 [0 45.0 14.1083]
 [1 55.0 93.5]
 [0 23.0 83.1583]
 [1 30.272590361445783 7.8958]
 [1 9.0 3.1708]
 [1 21.0 7.8542]
 [0 31.0 21.0]
 [0 36.0 262.375]
 [1 23.0 10.5]
 [1 23.0 10.5]
 [1 18.0 73.5]
 [1 30.272590361445783 8.05]
 [0 22.0 8.6625]
 [1 41.0 13.0]
 [0 23.0 82.2667]
 [0 30.272590361445783 69.55]
 [1 26.0 13.0]
 [0 21.0 8.6625]
 [1 30.272590361445783 12.875]
 [1 30.272590361445783 15.5792]
 [1 31.0 18.0]
 [1 24.0 82.2667]
 [0 30.272590361445783 8.1125]
 [1 32.5 9.5]
 [1 55.0 50.0]
 [1 18.0 8.6625]
 [0 45.0 262.375]
 [0 48.0 106.425]
 [0 22.0 13.9]
 [0 24.0 27.7208]
 [0 30.272590361445783 7.75]
 [1 30.272590361445783 7.2292]
 [1 43.0 21.0]
 [0 33.0 151.55]
 [0 35.0 211.5]
 [0 30.272590361445783 14.4542]
 [0 24.0 37.0042]
 [1 30.0 13.0]
 [0 30.272590361445783 7.75]
 [0 22.0 39.6875]
 [1 30.272590361445783 15.0458]
 [0 36.0 12.1833]
 [1 30.272590361445783 8.05]
 [1 21.0 7.75]
 [1 17.0 7.8958]
 

In [47]:
print(y_train)

[1 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 1
 0 0 1 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1 0 0
 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0
 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0
 0 0 1 0 0 1 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 1 0
 0 0 1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 1 1 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 1 0 0 0 0 1 0 0]


In [48]:
print(y_test)

[0 0 1 0 1 0 1 0 0 0 1 1 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 1 1 1 1 1 0 0
 1 1 1 1 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 0 1 1 1 1 0 0 1 1 1
 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 1 1 0 0
 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0]


## Feature Scaling

In [49]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [50]:
X_train

array([[-1.41785376e+00,  3.52339962e+00,  8.86678723e-01],
       [ 7.05291356e-01, -3.44056603e-01, -4.53604632e-01],
       [ 7.05291356e-01, -2.20043559e+00, -1.68048368e-01],
       [ 7.05291356e-01, -1.35754799e-02, -4.53604632e-01],
       [ 7.05291356e-01, -1.35754799e-02, -1.11393376e-01],
       [-1.41785376e+00, -2.34198449e+00, -2.13844486e-01],
       [ 7.05291356e-01, -4.98754852e-01, -4.26300704e-01],
       [-1.41785376e+00, -2.66707479e-01,  3.79616549e-01],
       [-1.41785376e+00,  1.89906801e+00, -7.88960732e-02],
       [ 7.05291356e-01, -1.73634084e+00,  1.13591051e-02],
       [-1.41785376e+00, -1.35754799e-02, -3.28570843e-01],
       [-1.41785376e+00, -3.44056603e-01, -3.42262466e-01],
       [-1.41785376e+00,  3.52085517e-01, -4.56044574e-01],
       [-1.41785376e+00, -9.62849599e-01, -4.53604632e-01],
       [ 7.05291356e-01,  6.61482014e-01, -4.15188869e-02],
       [-1.41785376e+00, -1.19489697e+00,  1.34111587e-01],
       [-1.41785376e+00, -1.58164259e+00

In [51]:
X_test

array([[ 7.05291356e-01, -1.23357153e+00,  7.11048249e-01],
       [ 7.05291356e-01, -1.35754799e-02, -4.59821574e-01],
       [-1.41785376e+00,  1.74436976e+00, -8.40894474e-02],
       [ 7.05291356e-01, -1.35754799e-02, -4.56044574e-01],
       [-1.41785376e+00,  1.12557676e+00, -3.35968097e-01],
       [ 7.05291356e-01,  1.89906801e+00,  1.16334393e+00],
       [-1.41785376e+00, -5.76103977e-01,  9.68040956e-01],
       [ 7.05291356e-01, -1.35754799e-02, -4.53291142e-01],
       [ 7.05291356e-01, -1.65899172e+00, -5.42522753e-01],
       [ 7.05291356e-01, -7.30802226e-01, -4.54076757e-01],
       [-1.41785376e+00,  4.26890188e-02, -2.05818362e-01],
       [-1.41785376e+00,  4.29434641e-01,  4.35254782e+00],
       [ 7.05291356e-01, -5.76103977e-01, -4.04110832e-01],
       [ 7.05291356e-01, -5.76103977e-01, -4.04110832e-01],
       [ 7.05291356e-01, -9.62849599e-01,  7.85643988e-01],
       [ 7.05291356e-01, -1.35754799e-02, -4.50379075e-01],
       [-1.41785376e+00, -6.53453101e-01