# COMP4115: Exploratory Data Analysis and Visualization
# Lab 3: Data Preprocessing on Titanic Data

## 1. load and explore the data

In [142]:
import pandas as pd

# Load the Titanic passenger data into a DataFrame.
# "titanic.csv" is a relative path, so it is resolved against the notebook's
# current working directory.
df = pd.read_csv("titanic.csv")

In [143]:
# Preview the first 10 rows to get a feel for the columns and their values
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [144]:
# DataFrame summary: non-null count and dtype of every column, plus memory usage.
# Columns whose non-null count is below the number of rows contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## 2. About the Data
This dataset was used to apply machine learning algorithms to predict which passengers survived the tragedy. More details can be found in __[Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic#description)__
- `PassengerId, Ticket` are basically random numbers and thus we assume that they do not contain any valuable information.
- `Survived, Passenger Class, Age, Siblings/Spouses, Parents/Children` and `Fare` are numerical values.
- `Sex, Embarked` are categorical features that we need to map to integer values.
- `Name, Cabin` might also contain valuable information.

In [145]:
# Remove the columns that are not used in the rest of the lab:
# 'PassengerId', 'Ticket' and 'Cabin'
df = df.drop(columns=['PassengerId', 'Ticket', 'Cabin'])
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S


## 3. Handle missing values


### 3.1 Handling missing values by mean/median

In [146]:
# Keep a copy of the original data so that each imputation strategy shown below
# can start again from the missing values. This is usually not needed in a
# real project.
df_copy = df.copy()

# Replace missing ages with the mean age.
# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on the df['Age'] selection is chained assignment, which recent pandas
# versions warn about and which leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].mean())
print(df.tail())
print(df_copy.tail())

     Survived  Pclass                                      Name     Sex  \
886         0       2                     Montvila, Rev. Juozas    male   
887         1       1              Graham, Miss. Margaret Edith  female   
888         0       3  Johnston, Miss. Catherine Helen "Carrie"  female   
889         1       1                     Behr, Mr. Karl Howell    male   
890         0       3                       Dooley, Mr. Patrick    male   

           Age  SibSp  Parch   Fare Embarked  
886  27.000000      0      0  13.00        S  
887  19.000000      0      0  30.00        S  
888  29.699118      1      2  23.45        S  
889  26.000000      0      0  30.00        C  
890  32.000000      0      0   7.75        Q  
     Survived  Pclass                                      Name     Sex   Age  \
886         0       2                     Montvila, Rev. Juozas    male  27.0   
887         1       1              Graham, Miss. Margaret Edith  female  19.0   
888         0       3  J

In [147]:
# Replace missing ages with the median age.
# For illustration purposes, start again from the copy that still has the
# missing values.
df = df_copy.copy()
# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on the df['Age'] selection is chained assignment, which recent pandas
# versions warn about and which leaves df unchanged under Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
print(df.tail())
print(df_copy.tail())

     Survived  Pclass                                      Name     Sex   Age  \
886         0       2                     Montvila, Rev. Juozas    male  27.0   
887         1       1              Graham, Miss. Margaret Edith  female  19.0   
888         0       3  Johnston, Miss. Catherine Helen "Carrie"  female  28.0   
889         1       1                     Behr, Mr. Karl Howell    male  26.0   
890         0       3                       Dooley, Mr. Patrick    male  32.0   

     SibSp  Parch   Fare Embarked  
886      0      0  13.00        S  
887      0      0  30.00        S  
888      1      2  23.45        S  
889      0      0  30.00        C  
890      0      0   7.75        Q  
     Survived  Pclass                                      Name     Sex   Age  \
886         0       2                     Montvila, Rev. Juozas    male  27.0   
887         1       1              Graham, Miss. Margaret Edith  female  19.0   
888         0       3  Johnston, Miss. Catherine Helen

### 3.2 Guess the values of age based on different groups
We will guess values of age based on sex (male / female) and Pclass (1st,2nd,3rd) of the passenger.

In [148]:
# Start again from the copy that still has the missing ages.
df = df_copy.copy()
import numpy as np
# Pivot table holding the mean age of every (Pclass, Sex) group.
# The aggregation is given by name ("mean") rather than as the np.mean
# callable, which recent pandas versions emit a FutureWarning for.
impute_grps = df.pivot_table(index=["Pclass", "Sex"], values=["Age"], aggfunc="mean")
print(impute_grps)

                     Age
Pclass Sex              
1      female  34.611765
       male    41.281386
2      female  28.722973
       male    30.740707
3      female  21.750000
       male    26.507589


In [149]:
# Fill each missing age with the mean age of the passenger's (Pclass, Sex) group.
# Joining on the two key columns looks up the group mean stored in impute_grps
# for every row at once, giving one scalar per passenger without a Python-level
# loop over the rows.
group_mean_age = df[['Pclass', 'Sex']].join(impute_grps, on=['Pclass', 'Sex'])['Age']
# fillna aligns on the row index, so only the rows whose age is missing change.
df['Age'] = df['Age'].fillna(group_mean_age)

In [150]:
# Show the last rows of the imputed data, then of the original copy,
# so the filled-in ages can be compared side by side.
for frame in (df, df_copy):
    print(frame.tail())

     Survived  Pclass                                      Name     Sex  \
886         0       2                     Montvila, Rev. Juozas    male   
887         1       1              Graham, Miss. Margaret Edith  female   
888         0       3  Johnston, Miss. Catherine Helen "Carrie"  female   
889         1       1                     Behr, Mr. Karl Howell    male   
890         0       3                       Dooley, Mr. Patrick    male   

       Age  SibSp  Parch   Fare Embarked  
886  27.00      0      0  13.00        S  
887  19.00      0      0  30.00        S  
888  21.75      1      2  23.45        S  
889  26.00      0      0  30.00        C  
890  32.00      0      0   7.75        Q  
     Survived  Pclass                                      Name     Sex   Age  \
886         0       2                     Montvila, Rev. Juozas    male  27.0   
887         1       1              Graham, Miss. Margaret Edith  female  19.0   
888         0       3  Johnston, Miss. Catherine

### 3.3 Handling missing values by mode

In [151]:
# To replace the NaN values in 'Embarked' we use the mode of the column,
# i.e. the port that most passengers embarked from.

freq_port = df['Embarked'].dropna().mode()[0]
print(freq_port)

# The filled column is assigned back to df: calling fillna(..., inplace=True)
# on the df['Embarked'] selection is chained assignment, which recent pandas
# versions warn about and which leaves df unchanged under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(freq_port)
df.info()

S
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Fare        891 non-null float64
Embarked    891 non-null object
dtypes: float64(2), int64(4), object(3)
memory usage: 62.7+ KB


In [152]:
# Number of passengers per embarkation port
df['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

## 4. Feature Type Conversion

### 4.1 one-hot encoding for 'Embarked' column

In [153]:
# One-hot encode 'Embarked': one indicator column per port, each named
# 'Embarked_<port>', appended to the right of the existing columns.
one_hot_encoded = pd.get_dummies(data=df['Embarked'], prefix='Embarked')
df = df.join(other=one_hot_encoded)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,0,0,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,0,0,1


In [154]:
# The original 'Embarked' column is now covered by its indicator columns
df = df.drop(columns=['Embarked'])
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,0,0,1
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,1,0,0
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,0,0,1
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,0,0,1
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,0,0,1


### 4.2 Feature Type Conversion: 'Sex' column

In [155]:
# Encode 'Sex' as an integer column: female -> 1, male -> 0.
# NOTE: this cell is meant to be run once; after the first run the column no
# longer contains the strings 'female' / 'male' that the mapping expects.
df['Sex'] = df['Sex'].map({'female': 1, 'male': 0}).astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Name          891 non-null object
Sex           891 non-null int32
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Fare          891 non-null float64
Embarked_C    891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(2), int32(1), int64(4), object(1), uint8(3)
memory usage: 54.9+ KB


## 5. Feature Construction: Family Size

### 5.1 Create feature 'Family Size'
How did the number of people the person traveled with impact the chance of survival?

In [156]:
# SibSp = number of siblings / spouses
# Parch = number of parents / children
# FamilySize adds both counts plus one for the passenger.
df['FamilySize'] = df['SibSp'].add(df['Parch']).add(1)

# Survival rate for each family size, highest rate first
(
    df.groupby('FamilySize', as_index=False)[['Survived']]
    .mean()
    .sort_values(by='Survived', ascending=False)
)

Unnamed: 0,FamilySize,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


In [157]:
# Flag passengers whose family size is 1: IsAlone is 1 for them, 0 otherwise
df['IsAlone'] = (df['FamilySize'] == 1).astype('int64')

# Survival rate for each value of IsAlone
df.groupby('IsAlone', as_index=False)[['Survived']].mean()

# We can also create new features based on intuitive combinations

Unnamed: 0,IsAlone,Survived
0,0,0.50565
1,1,0.303538


### 5.2 Create feature 'Title' 
Hypothesis: The Title of the person is a feature that can predict survival


In [158]:
# Extract the title of each passenger from the 'Name' column and store it in
# a new column called 'Title'.
# The pattern captures the run of letters that follows a space and ends with a
# period, e.g. "Mr" in "Braund, Mr. Owen Harris".
# To match titles or names with any other expression, refer to this tutorial
# on regex in Python:
# https://www.tutorialspoint.com/python/python_reg_expressions.htm

# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot; in a normal string literal '\.' is an invalid escape sequence
# that newer Python versions warn about.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize,IsAlone,Title
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,0,1,2,0,Mr
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,0,0,2,0,Mrs
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,0,1,1,1,Miss
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,0,1,2,0,Mrs
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,0,1,1,1,Mr


In [159]:
# Sanity check of the extracted titles: count passengers per title and sex
pd.crosstab(index=df['Title'], columns=df['Sex'])

Sex,0,1
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,1,0
Col,2,0
Countess,0,1
Don,1,0
Dr,6,1
Jonkheer,1,0
Lady,0,1
Major,2,0
Master,40,0
Miss,0,182


In [160]:
# Common titles such as Miss, Mrs, Mr and Master dominate. The rarest titles
# are collapsed into the single label 'Rare', and spelling variants are mapped
# to their standard form (Mlle = Mademoiselle, Mme = Madame).
df['Title'] = (
    df['Title']
    .replace(
        ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
         'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
        'Rare',
    )
    .replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
)

In [161]:
# With the titles reduced to a few groups, show the survival rate per title
df.groupby('Title')[['Survived']].mean()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Master,0.575
Miss,0.702703
Mr,0.156673
Mrs,0.793651
Rare,0.347826


In [162]:
# One-hot encode 'Title': one indicator column per title, each named
# 'Title_<title>', appended to the right of the existing columns.
one_hot_encoded = pd.get_dummies(data=df['Title'], prefix='Title')
df = df.join(other=one_hot_encoded)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize,IsAlone,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0,0,1,2,0,Mr,0,0,1,0,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1,0,0,2,0,Mrs,0,0,0,1,0
2,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0,0,1,1,1,Miss,0,1,0,0,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0,0,1,2,0,Mrs,0,0,0,1,0
4,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0,0,1,1,1,Mr,0,0,1,0,0


In [163]:
# Drop the two remaining text columns now that 'Title' has been one-hot encoded
df = df.drop(columns=['Name', 'Title'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
Survived        891 non-null int64
Pclass          891 non-null int64
Sex             891 non-null int32
Age             891 non-null float64
SibSp           891 non-null int64
Parch           891 non-null int64
Fare            891 non-null float64
Embarked_C      891 non-null uint8
Embarked_Q      891 non-null uint8
Embarked_S      891 non-null uint8
FamilySize      891 non-null int64
IsAlone         891 non-null int64
Title_Master    891 non-null uint8
Title_Miss      891 non-null uint8
Title_Mr        891 non-null uint8
Title_Mrs       891 non-null uint8
Title_Rare      891 non-null uint8
dtypes: float64(2), int32(1), int64(6), uint8(8)
memory usage: 66.2 KB


## 6. Data Normalization

In [164]:
# Min-max normalization of every feature column.
# The target column 'Survived' is excluded from the scaling.
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

feature_columns = df.columns[df.columns != 'Survived']
df_scale_min_max = min_max_scaler.fit_transform(df.loc[:, feature_columns])
# fit_transform returns a plain array, so the DataFrame is rebuilt with the
# original column names and row index; keeping the index lets a later join
# with df line up row for row.
df_scale_min_max = pd.DataFrame(df_scale_min_max, columns=feature_columns, index=df.index)
df_scale_min_max.head(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize,IsAlone,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1.0,0.0,0.271174,0.125,0.0,0.014151,0.0,0.0,1.0,0.1,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.472229,0.125,0.0,0.139136,1.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.434531,0.125,0.0,0.103644,0.0,0.0,1.0,0.1,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,0.434531,0.0,0.0,0.015713,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [165]:
# Put the unscaled target column 'Survived' in front of the min-max scaled
# features; the join matches the two frames on their row index.
df_final = df[['Survived']].join(df_scale_min_max)
df_final.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize,IsAlone,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,1.0,0.0,0.271174,0.125,0.0,0.014151,0.0,0.0,1.0,0.1,0.0,0.0,0.0,1.0,0.0,0.0
1,1,0.0,1.0,0.472229,0.125,0.0,0.139136,1.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,1.0,0.0
2,1,1.0,1.0,0.321438,0.0,0.0,0.015469,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,1,0.0,1.0,0.434531,0.125,0.0,0.103644,0.0,0.0,1.0,0.1,0.0,0.0,0.0,0.0,1.0,0.0
4,0,1.0,0.0,0.434531,0.0,0.0,0.015713,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [166]:
# Z-score normalization of every feature column.
# The target column 'Survived' is excluded from the scaling.
z_score_scaler = preprocessing.StandardScaler()
df_scale_z_score = z_score_scaler.fit_transform(df.loc[:, df.columns != 'Survived'])
# fit_transform returns a plain array, so the DataFrame is rebuilt with the
# original column names and row index; keeping the index makes the join below
# line up row for row.
df_scale_z_score = pd.DataFrame(
    df_scale_z_score,
    columns=df.columns[df.columns != 'Survived'],
    index=df.index,
)
# NOTE: this replaces the min-max scaled df_final built in the earlier cell.
df_final = df[['Survived']].join(df_scale_z_score)
df_final.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,FamilySize,IsAlone,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,0,0.827377,-0.737695,-0.551366,0.432793,-0.473674,-0.502445,-0.482043,-0.307562,0.615838,0.05916,-1.231645,-0.216803,-0.511898,0.850532,-0.40584,-0.162781
1,1,-1.566107,1.355574,0.65403,0.432793,-0.473674,0.786845,2.074505,-0.307562,-1.623803,0.05916,-1.231645,-0.216803,-0.511898,-1.175735,2.464027,-0.162781
2,1,0.827377,1.355574,-0.250017,-0.474545,-0.473674,-0.488854,-0.482043,-0.307562,0.615838,-0.560975,0.811922,-0.216803,1.953514,-1.175735,-0.40584,-0.162781
3,1,-1.566107,1.355574,0.428018,0.432793,-0.473674,0.42073,-0.482043,-0.307562,0.615838,0.05916,-1.231645,-0.216803,-0.511898,-1.175735,2.464027,-0.162781
4,0,0.827377,-0.737695,0.428018,-0.474545,-0.473674,-0.486337,-0.482043,-0.307562,0.615838,-0.560975,0.811922,-0.216803,-0.511898,0.850532,-0.40584,-0.162781
