In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Read the training CSV and lower-case every column label in one chain,
# then print the dtype / non-null summary.
train = pd.read_csv('./train.csv').rename(columns=str.lower)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
# Missing-value count per column, keeping only columns that have gaps, largest first.
train.isna().sum().loc[lambda na_counts: na_counts > 0].sort_values(ascending=False)

cabin       687
age         177
embarked      2
dtype: int64

In [4]:
# Peek at the first five rows.
train.head(n=5)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Pull the honorific out of each name: the single word sitting between a
# whitespace character and a literal ". " (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# The pattern is a raw string so that `\s` and `\w` reach the regex engine as
# written instead of relying on Python tolerating unknown string escapes, and
# expand=False makes extract return a Series rather than a one-column DataFrame.
train['title'] = train['name'].str.extract(r'\s(\w+)[.]\s', expand=False)
train['title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'Countess',
       'Jonkheer'], dtype=object)

In [6]:
# How often each extracted title occurs.
train.loc[:, 'title'].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Countess      1
Capt          1
Ms            1
Sir           1
Lady          1
Mme           1
Don           1
Jonkheer      1
Name: title, dtype: int64

In [7]:
# Fold the rare honorifics into the three common ones; any title not listed
# below is left as it is.
to_Mr = ['Don', 'Jonkheer', 'Major', 'Col', 'Capt', 'Sir']
to_Mrs = ['Countess', 'Mme']
to_Miss = ['Mlle', 'Ms', 'Lady']
title_merge_map = {
    **dict.fromkeys(to_Mr, 'Mr'),
    **dict.fromkeys(to_Mrs, 'Mrs'),
    **dict.fromkeys(to_Miss, 'Miss'),
}
train['title'] = train['title'].replace(title_merge_map)
train['title'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Rev', 'Dr'], dtype=object)

In [8]:
# For every row, the mean age of all passengers sharing that row's title
# (same index as train, so it can be used directly as a fill value).
title_age_mean = train['age'].groupby(train['title']).transform('mean')
title_age_mean

0      32.740148
1      35.763636
2      22.020000
3      35.763636
4      32.740148
         ...    
886    43.166667
887    22.020000
888    22.020000
889    32.740148
890    32.740148
Name: age, Length: 891, dtype: float64

In [9]:
# Keep known ages; where age is missing, substitute the per-title mean.
age_is_known = train['age'].notna()
train['age'] = train['age'].where(age_is_known, title_age_mean)
train['age'].isna().sum()

0

In [10]:
# The 'top' entry of describe() on an object column is its most frequent value.
embarked_summary = train['embarked'].describe()
embarked_top = embarked_summary.loc['top']
embarked_top

'S'

In [11]:
# Keep known ports; where embarked is missing, use the most frequent port.
embarked_is_known = train['embarked'].notna()
train['embarked'] = train['embarked'].where(embarked_is_known, embarked_top)
train['embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [12]:
# Distinct first characters of the cabin strings (missing cabins show up as NaN).
train['cabin'].str.get(0).unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [23]:
# Reduce each cabin to its first character; missing cabins get the placeholder 'O'.
cabin_filled = train['cabin'].fillna('O')
train['cabin'] = cabin_filled.str.get(0)
train['cabin'].unique()

array(['O', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [46]:
# Summary statistics for the passenger class column.
train.loc[:, 'pclass'].describe()

count    891.000000
mean       2.308642
std        0.836071
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        3.000000
Name: pclass, dtype: float64

In [45]:
# Mean of the 0/1 survived flag within each passenger class, i.e. the survival rate.
train['survived'].groupby(train['pclass']).mean()

pclass
1    0.629630
2    0.472826
3    0.242363
Name: survived, dtype: float64

In [38]:
# Same per-class survival rate, laid out as a pivot table (mean is the default aggregation).
pd.pivot_table(train, index=['pclass'], values=['survived'], aggfunc='mean')

Unnamed: 0_level_0,survived
pclass,Unnamed: 1_level_1
1,0.62963
2,0.472826
3,0.242363


In [39]:
# Passenger counts for every (pclass, survived) combination.
pd.crosstab(train['pclass'], train['survived'])

survived,0,1
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,80,136
2,97,87
3,372,119


In [29]:
# Preview the indicator-column encoding of the categorical features.
# pclass is numeric, so get_dummies passes it through as a single column.
train[['pclass', 'sex', 'cabin', 'embarked', 'title']].pipe(pd.get_dummies).head()

Unnamed: 0,pclass,sex_female,sex_male,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,...,cabin_T,embarked_C,embarked_Q,embarked_S,title_Dr,title_Master,title_Miss,title_Mr,title_Mrs,title_Rev
0,3,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,1,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,3,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,1,1,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
4,3,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0


In [64]:
# Bin age into six intervals labelled 0-5 and preview the frame with those
# labels attached as an extra `age_` column; train itself is not modified.
age_bin_count = 6
age_cut_series = pd.cut(
    train['age'],
    bins=age_bin_count,
    labels=list(range(age_bin_count)),
).rename('age_')
pd.concat((train, age_cut_series), axis='columns')

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,title,age_
0,1,0,3,"Braund, Mr. Owen Harris",male,22.00,1,0,A/5 21171,7.2500,O,S,Mr,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.00,1,0,PC 17599,71.2833,C,C,Mrs,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.00,0,0,STON/O2. 3101282,7.9250,O,S,Miss,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.00,1,0,113803,53.1000,C,S,Mrs,2
4,5,0,3,"Allen, Mr. William Henry",male,35.00,0,0,373450,8.0500,O,S,Mr,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.00,0,0,211536,13.0000,O,S,Rev,2
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.00,0,0,112053,30.0000,B,S,Miss,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,22.02,1,2,W./C. 6607,23.4500,O,S,Miss,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.00,0,0,111369,30.0000,C,C,Mr,1
