# Part 0. Getting started

In [1]:
import numpy as np 
import pandas as pd 
pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import RidgeClassifier, LinearRegression
from sklearn import svm
import xgboost as xgb

Creating two pandas dataframes with our data.

In [2]:
test_data = pd.read_csv('../input/titanic/test.csv')
train_data = pd.read_csv('../input/titanic/train.csv')
comb = [train_data, test_data]

Taking a quick glance of the training dataset. So many columns to be converted into numerical ones and so many NaN values to be filled!

In [3]:
print("TRAIN DATA"), comb[0].info(), print("-"*39), print("-"*39), print("TEST DATA"), comb[1].info();

TRAIN DATA
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
---------------------------------------
---------------------------------------
TEST DATA
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtyp

In [4]:
comb[0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
comb[1].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Part 1. Managing Data
## Filling empty values
#### The first thing we must do is to find a right way to fill empty values. As we can see, there are many NaN values in Embarked (in the training set only), in Cabin and in Age columns (in both training set and test set).

Filling empty values in Age with the list that consists of random values in range of the mean age +- standart deviation:

In [6]:
for ds in comb:
    age_avg = ds['Age'].mean()
    age_std = ds['Age'].std()
    age_nan = ds['Age'].isnull().sum()
    age_rand_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_nan)
    ds['Age'][np.isnan(ds['Age'])] = age_rand_list
    ds['Age'] = ds['Age'].astype(int)

Filling empty values in Embarked with the mode value:

In [7]:
emb_mode = comb[0]['Embarked'].mode().at[0]
comb[0]['Embarked'] = comb[0]['Embarked'].fillna(emb_mode)

And filling empty data in Fare with the mode:

In [8]:
fare = float(comb[1]['Fare'].mode())
comb[1]['Fare'] = comb[1]['Fare'].fillna(fare)

## EDA. Data Transforming
#### We have many columns that are candidates to be split so as to gain a bigger success.


My guess is that the Cabin column will be really informative for our predictions

In [9]:
for ds in comb:
    ds['Cabin_Null'] = ds['Cabin'].isnull().astype(int)

pd.crosstab(comb[0]['Cabin_Null'], comb[0]['Survived'])

Survived,0,1
Cabin_Null,Unnamed: 1_level_1,Unnamed: 2_level_1
0,68,136
1,481,206


As we can see, there's an unexpectable correlation between presence of the cabin information and the fact of survival. When the cabin number is known, the survival probability is significantly higher.

In [10]:
comb[0][['Cabin_Null', 'Survived']].groupby(['Cabin_Null'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Cabin_Null,Survived
0,0,0.666667
1,1,0.299854


Let's now get the type of ticket by extracting letters into a separate column.

In [11]:
for daset in comb:
    daset['TickType'] = 0
    tic_let = daset['Ticket'].str.extract(r'(^\S+) ')
    daset['TickType'] = tic_let
    daset['TickType'] = daset['TickType'].fillna('Other')

comb[0]['TickType'].unique()

array(['A/5', 'PC', 'STON/O2.', 'Other', 'PP', 'A/5.', 'C.A.', 'A./5.',
       'SC/Paris', 'S.C./A.4.', 'A/4.', 'CA', 'S.P.', 'S.O.C.', 'SO/C',
       'W./C.', 'SOTON/OQ', 'W.E.P.', 'STON/O', 'A4.', 'C', 'SOTON/O.Q.',
       'SC/PARIS', 'S.O.P.', 'A.5.', 'Fa', 'CA.', 'F.C.C.', 'W/C',
       'SW/PP', 'SCO/W', 'P/PP', 'SC', 'SC/AH', 'A/S', 'A/4', 'WE/P',
       'S.W./PP', 'S.O./P.P.', 'F.C.', 'SOTON/O2', 'S.C./PARIS',
       'C.A./SOTON'], dtype=object)

I suppose that there may be a correlation between tickets that start with the same letter. However, there are too many different values, so we will group ticket types depending on their first letter.

In [12]:
for dataset in comb:
    for letter in "ACFPSW":
        let_regex = r"^(" + letter + ".*)"
        l_tic = dataset.TickType.str.extract(let_regex).dropna(axis=0).reset_index(drop=True)
        tic_l_list = list(l_tic[0].unique())
        dataset['TickType'].replace(tic_l_list, letter, inplace=True)

pd.crosstab(comb[0].TickType, comb[0].Survived)

Survived,0,1
TickType,Unnamed: 1_level_1,Unnamed: 2_level_1
A,27,2
C,31,16
F,3,4
Other,410,255
P,23,42
S,44,21
W,11,2


Much better! However, we still see that F type is not quite representative. Let's merge it with Other.

In [13]:
for ds in comb:
    ds['TickType'] = ds['TickType'].replace('F', 'Other')

comb[0][['TickType', 'Survived']].groupby(['TickType'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,TickType,Survived
3,P,0.646154
2,Other,0.385417
1,C,0.340426
4,S,0.323077
5,W,0.153846
0,A,0.068966


Let's unite SibSp and Parch columns to see how many relatives of each passenger were aboard.

In [14]:
for ds in comb:
    ds['RelatAboard'] = ds['SibSp'] + ds['Parch']

comb[0][['RelatAboard', 'Survived']].groupby(['RelatAboard'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,RelatAboard,Survived
3,3,0.724138
2,2,0.578431
1,1,0.552795
6,6,0.333333
0,0,0.303538
4,4,0.2
5,5,0.136364
7,7,0.0
8,10,0.0


In [15]:
pd.crosstab(comb[0]['RelatAboard'], comb[0]['Survived'])

Survived,0,1
RelatAboard,Unnamed: 1_level_1,Unnamed: 2_level_1
0,374,163
1,72,89
2,43,59
3,8,21
4,12,3
5,19,3
6,8,4
7,6,0
10,7,0


The RelatAboard column must be split into groups. 

In [16]:
for ds in comb:
    ds['OTTmembers'] = 2
    ds.loc[ds['RelatAboard'] == 0, 'OTTmembers'] = 0
    ds.loc[(ds['RelatAboard'] == 1) | (ds['RelatAboard'] == 2), 'OTTmembers'] = 1
    ds.loc[ds['RelatAboard'] == 3, 'OTTmembers'] = 3

comb[0][['OTTmembers', 'Survived']].groupby(['OTTmembers'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,OTTmembers,Survived
3,3,0.724138
1,1,0.562738
0,0,0.303538
2,2,0.16129


Pretty distinct, sir.

Let's look at the ranks of the passengers. That may bring some additional informativity.

In [17]:
for ds in comb:
    ds['Rank'] = 0
    rank = ds['Name'].str.extract(r'(\w+\.) ')
    ds['Rank'] = rank

ranks = set(comb[0]['Rank'].values)
print(ranks)

{'Master.', 'Jonkheer.', 'Dr.', 'Don.', 'Rev.', 'Col.', 'Countess.', 'Capt.', 'Mme.', 'Mlle.', 'Mr.', 'Lady.', 'Sir.', 'Miss.', 'Major.', 'Ms.', 'Mrs.'}


Too many ranks! And they're not representative enough, so we will group them in the following way:

In [18]:
ot_list = ['Capt.', 'Col.', 'Countess.', 'Don.', 'Dr.', 'Jonkheer.', 'Rev.', 'Sir.', 'Major.']
miss_list = ['Lady.', 'Mlle.', 'Mme.', 'Ms.']

for dataset in comb:
    dataset['Rank'].replace(ot_list, 'Other', inplace=True)
    dataset['Rank'].replace(miss_list, 'Miss.', inplace=True)
    
comb[0][['Rank', 'Survived']].groupby(['Rank'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Rank,Survived
3,Mrs.,0.792
1,Miss.,0.705882
0,Master.,0.575
4,Other,0.318182
2,Mr.,0.156673


That looks nice!

 Now let's try to do something with Age column. I guess we need to split it into some groups. Let it be 5 groups.

In [19]:
comb[0]['AgeGroup'] = pd.cut(comb[0]['Age'], 5)

comb[0][['AgeGroup', 'Survived']].groupby(['AgeGroup'], as_index=False).mean().sort_values(by='AgeGroup', ascending=False)

Unnamed: 0,AgeGroup,Survived
4,"(64.0, 80.0]",0.090909
3,"(48.0, 64.0]",0.434783
2,"(32.0, 48.0]",0.363636
1,"(16.0, 32.0]",0.367946
0,"(-0.08, 16.0]",0.486957


Let's merge the 3rd and 2nd age groups, since they correlate almost equally with the fact of survival.

Splitting Age into 4 categories:

In [20]:
for ds in comb:
    ds.loc[ds['Age'] < 16, 'Age'] = 0
    ds.loc[(ds['Age'] >= 16) & (ds['Age'] < 32), 'Age'] = 1
    ds.loc[(ds['Age'] >= 32) & (ds['Age'] < 64), 'Age'] = 2
    ds.loc[ds['Age'] >= 64, 'Age'] = 3
    ds['Age'] = ds['Age'].astype(int)
   
comb[0][['Age', 'Survived']].groupby(['Age'], as_index=False).mean().sort_values(by='Survived', ascending=False) # As we can see, the division is pretty distinct.

Unnamed: 0,Age,Survived
0,0,0.544444
2,2,0.385965
1,1,0.358744
3,3,0.076923


Now let's do something similiar to Fare column:

In [21]:
comb[0]['FareGroup'] = pd.qcut(comb[0]['Fare'], 5)

comb[0][['FareGroup', 'Survived']].groupby(['FareGroup'], as_index=False).mean().sort_values(by='FareGroup', ascending=False)

Unnamed: 0,FareGroup,Survived
4,"(39.688, 512.329]",0.642045
3,"(21.679, 39.688]",0.444444
2,"(10.5, 21.679]",0.424419
1,"(7.854, 10.5]",0.201087
0,"(-0.001, 7.854]",0.217877


Merging groups with similiar correlation values:

In [22]:
for ds in comb:
    ds.loc[ds['Fare'] <= 10.5, 'Fare'] = 0
    ds.loc[(ds['Fare'] > 10.5) & (ds['Fare'] <= 39.688), 'Fare'] = 1
    ds.loc[ds['Fare'] > 39.688, 'Fare'] = 2
    ds['Fare'] = ds['Fare'].astype(int)
    
comb[0][['Fare', 'Survived']].groupby(['Fare'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Fare,Survived
2,2,0.642045
1,1,0.434659
0,0,0.209366


Seems to be accurate.

## Encoding
#### We need our datasets to be numerical so as to be able to train a model. Let's encode everything we'll need into numbers.

In [23]:
s_dict = {'male': 0, 'female': 1}

for ds in comb:
    ds['Sex'] = ds['Sex'].map(s_dict)
    ds['Embarked'] = ds['Embarked'].astype('category').cat.codes
    ds['TickType'] = ds['TickType'].astype('category').cat.codes
    ds['Rank'] = ds['Rank'].astype('category').cat.codes

## Dropping

#### There are some columns left that we don't need.

In [24]:
drop_col_test = ['Cabin', 'Name', 'Parch', 'SibSp', 'RelatAboard', 'Ticket']
drop_col_train = drop_col_test.copy()
drop_col_train.extend(['FareGroup', 'AgeGroup', 'PassengerId'])

comb[0] = comb[0].drop(columns=drop_col_train)
comb[1] = comb[1].drop(columns=drop_col_test)

Now we can see that our both datasets are non-null & numerical and the data is cleverly divided into columns.

In [25]:
print("TRAIN DATA"), comb[0].info(), print("-"*39), print("-"*39), print("TEST DATA"), comb[1].info();

TRAIN DATA
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Survived    891 non-null    int64
 1   Pclass      891 non-null    int64
 2   Sex         891 non-null    int64
 3   Age         891 non-null    int64
 4   Fare        891 non-null    int64
 5   Embarked    891 non-null    int8 
 6   Cabin_Null  891 non-null    int64
 7   TickType    891 non-null    int8 
 8   OTTmembers  891 non-null    int64
 9   Rank        891 non-null    int8 
dtypes: int64(7), int8(3)
memory usage: 51.5 KB
---------------------------------------
---------------------------------------
TEST DATA
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   PassengerId  418 non-null    int64
 1   Pclass       418 non-null    int64
 2   Sex     

In [26]:
train = comb[0]
test = comb[1]

# Part 2. Correlations
## Looking at the correlations again
Let's look again at the correlations between different features and the fact of survival.

In [27]:
train[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


In [28]:
train[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Sex,Survived
1,1,0.742038
0,0,0.188908


In [29]:
train[['Age', 'Survived']].groupby(['Age'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Age,Survived
0,0,0.544444
2,2,0.385965
1,1,0.358744
3,3,0.076923


In [30]:
train[['Fare', 'Survived']].groupby(['Fare'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Fare,Survived
2,2,0.642045
1,1,0.434659
0,0,0.209366


In [31]:
train[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Embarked,Survived
0,0,0.553571
1,1,0.38961
2,2,0.339009


In [32]:
train[['Cabin_Null', 'Survived']].groupby(['Cabin_Null'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Cabin_Null,Survived
0,0,0.666667
1,1,0.299854


In [33]:
train[['TickType', 'Survived']].groupby(['TickType'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,TickType,Survived
3,3,0.646154
2,2,0.385417
1,1,0.340426
4,4,0.323077
5,5,0.153846
0,0,0.068966


In [34]:
train[['OTTmembers', 'Survived']].groupby(['OTTmembers'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,OTTmembers,Survived
3,3,0.724138
1,1,0.562738
0,0,0.303538
2,2,0.16129


In [35]:
train[['Rank', 'Survived']].groupby(['Rank'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,Rank,Survived
3,3,0.792
1,1,0.705882
0,0,0.575
4,4,0.318182
2,2,0.156673


# Part 3. Modelling
## Data preparation

We're gonna split our training dataset into two datasets so as to get some information on how our models are going to perform on test data.

In [36]:
y_list = list(train['Survived'])
y = pd.Series(y_list, name='Survived')
X = train.drop(columns='Survived')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

## RandomForest

In [37]:
random_forest = RandomForestClassifier(bootstrap=True, n_estimators=700, criterion='entropy')
random_forest.fit(X_train, y_train)

print("Accuracy on train data: ", random_forest.score(X_train, y_train))
print("Accuracy on test data: ", random_forest.score(X_test, y_test))

Accuracy on train data:  0.8946629213483146
Accuracy on test data:  0.8379888268156425


## K-Neighbours

In [38]:
params={'n_neighbors' : range(1, 20), 'leaf_size' : range(1, 50)}

knn_grid = GridSearchCV(KNeighborsClassifier(), params, scoring='roc_auc')
knn_grid.fit(X_train, y_train)

print("Accuracy on train data: ", knn_grid.score(X_train, y_train))
print("Accuracy on test data: ", knn_grid.score(X_test, y_test))

Accuracy on train data:  0.8856183608982117
Accuracy on test data:  0.8519305019305019


## Ridge Classifier

In [39]:
params2 = {'alpha' : [0.00001, 0.0001, 0.001, 0.01, 1, 10, 100, 1000], 'normalize' : [True, False], 'random_state' : [0, 50, 100, 150, 200]}

r_grid = GridSearchCV(RidgeClassifier(), params2, scoring='roc_auc')
r_grid.fit(X_train, y_train)

print("Accuracy on train data: ", r_grid.score(X_train, y_train))
print("Accuracy on test data: ", r_grid.score(X_test, y_test))

Accuracy on train data:  0.8477334610730133
Accuracy on test data:  0.8774131274131274


## SVM

In [40]:
svm_classifier = svm.SVC(kernel="poly", C=3, degree=6)
svm_classifier.fit(X_train, y_train)

print("Accuracy on train data: ", svm_classifier.score(X_train, y_train))
print("Accuracy on test data: ", svm_classifier.score(X_test, y_test))

Accuracy on train data:  0.8932584269662921
Accuracy on test data:  0.7988826815642458


## XGBoost

In [41]:
gbm = xgb.XGBClassifier(
    learning_rate = 0.02,
    use_label_encoder=False,
    n_estimators= 2000,
    max_depth= 4,
    min_child_weight= 2,
    #gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread= -1,
    eval_metric = 'error',
    scale_pos_weight=1).fit(X_train, y_train)

print("Accuracy on train data: ", gbm.score(X_train, y_train))
print("Accuracy on test data: ", gbm.score(X_test, y_test))

Accuracy on train data:  0.8679775280898876
Accuracy on test data:  0.8324022346368715


# Part 4. Final Model Choice
## SVM 

In [42]:
# Dividing our dataset into X and Y (ground truth).

Y_train_full = train['Survived']
X_train_full = train.drop(columns='Survived')
X_test_full = test.drop(columns='PassengerId')

X_train_full.shape, X_test_full.shape, Y_train_full.shape

((891, 9), (418, 9), (891,))

In [43]:
model = svm.SVC(kernel="poly", C=2)
model.fit(X_train_full, Y_train_full)

print("Accuracy on train data: ", model.score(X_train_full, Y_train_full))

Accuracy on train data:  0.8451178451178452


In [44]:
y_test_predicted = model.predict(X_test_full)
y_test_predicted

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [45]:
sub = pd.DataFrame({
    "PassengerId": comb[1]['PassengerId'],
    "Survived": y_test_predicted
})

sub.to_csv('submission.csv', index = False)