In [1]:
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np

# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Stats
import scipy.stats as ss
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

In [2]:
train = pd.read_csv('../dataset/train.csv')
test = pd.read_csv('../dataset/test.csv')

In [3]:
# Adding a column in each dataset before merging
train['Type'] = 'train'
test['Type'] = 'test'

# Merging train and test
data = train.append(test) # The entire data: train + test.

# How many rows and columns in dataset
data.shape

(1309, 13)

# FE

In [4]:
# Creating variable Title
data['Title'] = data['Name']
# Cleaning name and extracting Title
for name_string in data['Name']:
    data['Title'] = data['Name'].str.extract('([A-Za-z]+)\.', expand=True)

In [5]:
# Replacing rare titles 
mapping = {'Mlle': 'Miss', 
           'Ms': 'Miss', 
           'Mme': 'Mrs',
           'Major': 'Other', 
           'Col': 'Other', 
           'Dr' : 'Other', 
           'Rev' : 'Other',
           'Capt': 'Other', 
           'Jonkheer': 'Royal',
           'Sir': 'Royal', 
           'Lady': 'Royal', 
           'Don': 'Royal',
           'Countess': 'Royal', 
           'Dona': 'Royal'}
data.replace({'Title': mapping}, inplace=True)
titles = ['Miss', 'Mr', 'Mrs', 'Royal', 'Other', 'Master']

In [6]:
# Replacing missing age by median/title 
for title in titles:
    age_to_impute = data.groupby('Title')['Age'].median()[titles.index(title)]
    data.loc[(data['Age'].isnull()) & (data['Title'] == title), 'Age'] = age_to_impute

In [7]:
# Creating new feature : family size
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
data.loc[:,'FsizeD']='Alone'
data.loc[(data['Family_Size']>1),'FsizeD']='Small'
data.loc[(data['Family_Size']>4),'FsizeD']='Big'

In [8]:
fa = data[data["Pclass"]==3]
data['Fare'].fillna(fa['Fare'].median(), inplace = True)

In [9]:
data.loc[:,'Child']=1
data.loc[(data['Age']>=18),'Child']=0

In [10]:
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)

In [11]:
data.drop('Cabin', axis=1, inplace=True)

In [12]:
target_col = ["Survived"]
cat_cols = ['Pclass', 'Sex', 'Embarked', 'Title', 'FsizeD', 'Child']
num_cols = ['Age', 'SibSp', 'Parch', 'Fare', 'Family_Size']

In [13]:
#Label encoding Binary columns
le = LabelEncoder()
for i in cat_cols :
    data[i] = le.fit_transform(data[i])

In [14]:
#Scaling Numerical columns
std = StandardScaler()
scaled = std.fit_transform(data[num_cols])
scaled = pd.DataFrame(scaled,columns=num_cols)

In [15]:
data.reset_index(drop=True, inplace=True)

In [16]:
#dropping original values merging scaled values for numerical columns
df_data_og = data.copy()
data = data.drop(columns = num_cols,axis = 1)
data = data.merge(scaled,left_index=True,right_index=True,how = "left")

In [17]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket,Embarked,Type,Title,FsizeD,Child,Age,SibSp,Parch,Fare,Family_Size
0,1,0.0,2,"Braund, Mr. Owen Harris",1,A/5 21171,2,train,2,2,0,-0.369154,0.481288,-0.445000,-0.503176,0.073352
1,2,1.0,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,PC 17599,0,train,3,2,0,0.749065,0.481288,-0.445000,0.734809,0.073352
2,3,1.0,2,"Heikkinen, Miss. Laina",0,STON/O2. 3101282,2,train,1,0,0,-0.089599,-0.479087,-0.445000,-0.490126,-0.558346
3,4,1.0,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,113803,2,train,3,2,0,0.539399,0.481288,-0.445000,0.383263,0.073352
4,5,0.0,2,"Allen, Mr. William Henry",1,373450,2,train,2,0,0,0.539399,-0.479087,-0.445000,-0.487709,-0.558346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,2,"Spector, Mr. Woolf",1,A.5. 3236,2,test,2,0,0,-0.631236,-0.479087,-0.445000,-0.487709,-0.558346
1305,1306,,0,"Oliva y Ocana, Dona. Fermina",0,PC 17758,0,test,5,0,0,0.818954,-0.479087,-0.445000,1.462069,-0.558346
1306,1307,,2,"Saether, Mr. Simon Sivertsen",1,SOTON/O.Q. 3101262,2,test,2,0,0,0.784010,-0.479087,-0.445000,-0.503176,-0.558346
1307,1308,,2,"Ware, Mr. Frederick",1,359309,2,test,2,0,0,-0.631236,-0.479087,-0.445000,-0.487709,-0.558346


In [18]:
train = data[data['Type']=='train']
test = data[data['Type']=='test']

In [19]:
test.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket,Embarked,Type,Title,FsizeD,Child,Age,SibSp,Parch,Fare,Family_Size
891,892,,2,"Kelly, Mr. James",1,330911,1,test,2,0,0,0.504455,-0.479087,-0.445,-0.491978,-0.558346
892,893,,2,"Wilkes, Mrs. James (Ellen Needs)",0,363272,2,test,3,2,0,1.378064,0.481288,-0.445,-0.50801,0.073352
893,894,,1,"Myles, Mr. Thomas Francis",1,240276,1,test,2,0,0,2.426394,-0.479087,-0.445,-0.456051,-0.558346
894,895,,2,"Wirz, Mr. Albert",1,315154,2,test,2,0,0,-0.01971,-0.479087,-0.445,-0.475868,-0.558346
895,896,,2,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,3101298,2,test,3,2,0,-0.369154,0.481288,0.710763,-0.405784,0.705051


In [20]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket,Embarked,Type,Title,FsizeD,Child,Age,SibSp,Parch,Fare,Family_Size
0,1,0.0,2,"Braund, Mr. Owen Harris",1,A/5 21171,2,train,2,2,0,-0.369154,0.481288,-0.445,-0.503176,0.073352
1,2,1.0,0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,PC 17599,0,train,3,2,0,0.749065,0.481288,-0.445,0.734809,0.073352
2,3,1.0,2,"Heikkinen, Miss. Laina",0,STON/O2. 3101282,2,train,1,0,0,-0.089599,-0.479087,-0.445,-0.490126,-0.558346
3,4,1.0,0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,113803,2,train,3,2,0,0.539399,0.481288,-0.445,0.383263,0.073352
4,5,0.0,2,"Allen, Mr. William Henry",1,373450,2,train,2,0,0,0.539399,-0.479087,-0.445,-0.487709,-0.558346


In [21]:
train_tmp, val_tmp = train_test_split(train, test_size=0.3, random_state=42)

In [22]:
train_tmp.shape

(623, 16)

In [23]:
val_tmp.shape

(268, 16)

In [24]:
val_tmp.Survived.value_counts(normalize=True)

0.0    0.585821
1.0    0.414179
Name: Survived, dtype: float64

In [25]:
train_tmp.Survived.value_counts(normalize=True)

0.0    0.629213
1.0    0.370787
Name: Survived, dtype: float64

In [26]:
train_tmp.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Ticket,Embarked,Type,Title,FsizeD,Child,Age,SibSp,Parch,Fare,Family_Size
445,446,1.0,0,"Dodge, Master. Washington",1,33638,2,train,0,2,1,-1.62715,-0.479087,1.866526,0.93926,0.705051
650,651,0.0,2,"Mitkoff, Mr. Mito",1,349221,2,train,2,0,0,-0.631236,-0.479087,-0.445,-0.490691,-0.558346
172,173,1.0,2,"Johnson, Miss. Eleanor Ileen",0,347742,2,train,1,2,1,-1.836816,0.481288,0.710763,-0.428099,0.705051
450,451,0.0,1,"West, Mr. Edwy Arthur",1,C.A. 34651,2,train,2,2,0,0.609288,0.481288,1.866526,-0.10684,1.336749
314,315,0.0,1,"Hart, Mr. Benjamin",1,F.C.C. 13529,2,train,2,2,0,1.098509,0.481288,0.710763,-0.135841,0.705051


In [27]:
use_features = ['Pclass', 'Sex', 'Embarked', 'Title', 'FsizeD', 'Child', 'Age', 'SibSp', 'Parch', 'Fare', 'Family_Size']

In [28]:
for col in use_features:
    print(col, train_tmp[col].nunique())

Pclass 3
Sex 2
Embarked 3
Title 6
FsizeD 3
Child 2
Age 80
SibSp 7
Parch 7
Fare 207
Family_Size 9


In [30]:
train_tmp.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex              int32
Ticket          object
Embarked         int32
Type            object
Title            int32
FsizeD           int32
Child            int64
Age            float64
SibSp          float64
Parch          float64
Fare           float64
Family_Size    float64
dtype: object