# Titanic
Patrick 🌰

In [1]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

from sklearn.svm import SVC, LinearSVC
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
train_df = pd.read_csv('Datasets/train.csv')
test_df = pd.read_csv('Datasets/test.csv')
combine_df = pd.concat([train_df,test_df])

## 1.Feature Engineering

### 1.1 One-hot encoding on Title

In [4]:
train_df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [5]:
# test_df.head()

In [6]:
combine_df['Title'] = combine_df['Name'].apply(lambda x: x.split(', ')[1]).apply(lambda x: x.split('.')[0])
combine_df['Title'] = combine_df['Title'].replace(['Don','Dona', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col','Sir','Dr'],'Mr')
combine_df['Title'] = combine_df['Title'].replace(['Mlle','Ms'], 'Miss')
combine_df['Title'] = combine_df['Title'].replace(['the Countess','Mme','Lady','Dr'], 'Mrs')
df = pd.get_dummies(combine_df['Title'],prefix='Title')
combine_df = pd.concat([combine_df,df],axis=1)

In [7]:
combine_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Mr,0,0,1,0
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Mrs,0,0,0,1
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Miss,0,1,0,0
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Mrs,0,0,0,1
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Mr,0,0,1,0


### 1.2 Get the length of the name and split the length equally based on the frequency of the length values

In [8]:
combine_df['Name_Len'] = combine_df['Name'].apply(lambda x: len(x))
combine_df['Name_Len'] = pd.qcut(combine_df['Name_Len'],5)

In [9]:
combine_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Name_Len
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,Mr,0,0,1,0,"(19.0, 23.2]"
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,Mrs,0,0,0,1,"(32.0, 82.0]"
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,Miss,0,1,0,0,"(19.0, 23.2]"
3,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,Mrs,0,0,0,1,"(32.0, 82.0]"
4,35.0,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,Mr,0,0,1,0,"(23.2, 27.0]"


### 1.3 Dead_female_family & Survive_male_family

In [10]:
#Dead_female_family & Survive_male_family
combine_df['Surname'] = combine_df['Name'].apply(lambda x:x.split(',')[0])
dead_female_surname = list(set(combine_df[(combine_df.Sex=='female') & (combine_df.Age>=12)
                              & (combine_df.Survived==0) & ((combine_df.Parch>0) | (combine_df.SibSp > 0))]['Surname'].values))
survive_male_surname = list(set(combine_df[(combine_df.Sex=='male') & (combine_df.Age>=12)
                              & (combine_df.Survived==1) & ((combine_df.Parch>0) | (combine_df.SibSp > 0))]['Surname'].values))
combine_df['Dead_female_family'] = np.where(combine_df['Surname'].isin(dead_female_surname),0,1)
combine_df['Survive_male_family'] = np.where(combine_df['Surname'].isin(survive_male_surname),0,1)
combine_df = combine_df.drop(['Name','Surname'],axis=1)


In [11]:
combine_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Name_Len,Dead_female_family,Survive_male_family
0,22.0,,S,7.25,0,1,3,male,1,0.0,A/5 21171,Mr,0,0,1,0,"(19.0, 23.2]",1,1
1,38.0,C85,C,71.2833,0,2,1,female,1,1.0,PC 17599,Mrs,0,0,0,1,"(32.0, 82.0]",1,1
2,26.0,,S,7.925,0,3,3,female,0,1.0,STON/O2. 3101282,Miss,0,1,0,0,"(19.0, 23.2]",1,1
3,35.0,C123,S,53.1,0,4,1,female,1,1.0,113803,Mrs,0,0,0,1,"(32.0, 82.0]",1,1
4,35.0,,S,8.05,0,5,3,male,0,0.0,373450,Mr,0,0,1,0,"(23.2, 27.0]",1,1


### 1.4 Age & isChild

In [12]:
group = combine_df.groupby(['Title', 'Pclass'])['Age']
combine_df['Age'] = group.transform(lambda x: x.fillna(x.median()))
combine_df = combine_df.drop('Title',axis=1)
combine_df['IsChild'] = np.where(combine_df['Age']<=12,1,0)
# combine_df['Age'] = pd.cut(combine_df['Age'],5)
# combine_df = combine_df.drop('Age',axis=1)

In [13]:
combine_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Name_Len,Dead_female_family,Survive_male_family,IsChild
0,22.0,,S,7.25,0,1,3,male,1,0.0,A/5 21171,0,0,1,0,"(19.0, 23.2]",1,1,0
1,38.0,C85,C,71.2833,0,2,1,female,1,1.0,PC 17599,0,0,0,1,"(32.0, 82.0]",1,1,0
2,26.0,,S,7.925,0,3,3,female,0,1.0,STON/O2. 3101282,0,1,0,0,"(19.0, 23.2]",1,1,0
3,35.0,C123,S,53.1,0,4,1,female,1,1.0,113803,0,0,0,1,"(32.0, 82.0]",1,1,0
4,35.0,,S,8.05,0,5,3,male,0,0.0,373450,0,0,1,0,"(23.2, 27.0]",1,1,0


### 1.5 Ticket

In [14]:
combine_df['Ticket_Lett'] = combine_df['Ticket'].apply(lambda x: str(x)[0])
combine_df['Ticket_Lett'] = combine_df['Ticket_Lett'].apply(lambda x: str(x))

combine_df['High_Survival_Ticket'] = np.where(combine_df['Ticket_Lett'].isin(['1', '2', 'P']),1,0)
combine_df['Low_Survival_Ticket'] = np.where(combine_df['Ticket_Lett'].isin(['A','W','3','7']),1,0)
combine_df = combine_df.drop(['Ticket','Ticket_Lett'],axis=1)

In [15]:
combine_df.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Name_Len,Dead_female_family,Survive_male_family,IsChild,High_Survival_Ticket,Low_Survival_Ticket
0,22.0,,S,7.25,0,1,3,male,1,0.0,0,0,1,0,"(19.0, 23.2]",1,1,0,0,1
1,38.0,C85,C,71.2833,0,2,1,female,1,1.0,0,0,0,1,"(32.0, 82.0]",1,1,0,1,0
2,26.0,,S,7.925,0,3,3,female,0,1.0,0,1,0,0,"(19.0, 23.2]",1,1,0,0,0
3,35.0,C123,S,53.1,0,4,1,female,1,1.0,0,0,0,1,"(32.0, 82.0]",1,1,0,1,0
4,35.0,,S,8.05,0,5,3,male,0,0.0,0,0,1,0,"(23.2, 27.0]",1,1,0,0,1


### 1.6 Others

In [16]:

#Embarked
combine_df.Embarked = combine_df.Embarked.fillna('S')
df = pd.get_dummies(combine_df['Embarked'],prefix='Embarked')
combine_df = pd.concat([combine_df,df],axis=1).drop('Embarked',axis=1)

#FamilySize
combine_df['FamilySize'] = np.where(combine_df['SibSp']+combine_df['Parch']==0, 'Alone',
                                    np.where(combine_df['SibSp']+combine_df['Parch']<=3, 'Small', 'Big'))
df = pd.get_dummies(combine_df['FamilySize'],prefix='FamilySize')
combine_df = pd.concat([combine_df,df],axis=1).drop(['SibSp','Parch','FamilySize'],axis=1)


#Cabin
combine_df['Cabin_isNull'] = np.where(combine_df['Cabin'].isnull(),0,1)
combine_df = combine_df.drop('Cabin',axis=1)

#PClass
df = pd.get_dummies(combine_df['Pclass'],prefix='Pclass')
combine_df = pd.concat([combine_df,df],axis=1).drop('Pclass',axis=1)


#Sex
df = pd.get_dummies(combine_df['Sex'],prefix='Sex')
combine_df = pd.concat([combine_df,df],axis=1).drop('Sex',axis=1)

#Fare
combine_df['Fare'].fillna(combine_df['Fare'].dropna().median(),inplace=True)
combine_df['Low_Fare'] = np.where(combine_df['Fare']<=8.662,1,0)
combine_df['High_Fare'] = np.where(combine_df['Fare']>=26,1,0)
combine_df = combine_df.drop('Fare',axis=1)

In [17]:
combine_df.head()

Unnamed: 0,Age,PassengerId,Survived,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Name_Len,Dead_female_family,Survive_male_family,...,FamilySize_Big,FamilySize_Small,Cabin_isNull,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Low_Fare,High_Fare
0,22.0,1,0.0,0,0,1,0,"(19.0, 23.2]",1,1,...,0,1,0,0,0,1,0,1,1,0
1,38.0,2,1.0,0,0,0,1,"(32.0, 82.0]",1,1,...,0,1,1,1,0,0,1,0,0,1
2,26.0,3,1.0,0,1,0,0,"(19.0, 23.2]",1,1,...,0,0,0,0,0,1,1,0,1,0
3,35.0,4,1.0,0,0,0,1,"(32.0, 82.0]",1,1,...,0,1,1,1,0,0,1,0,0,1
4,35.0,5,0.0,0,0,1,0,"(23.2, 27.0]",1,1,...,0,0,0,0,0,1,0,1,1,0


In [18]:
combine_df.columns

Index(['Age', 'PassengerId', 'Survived', 'Title_Master', 'Title_Miss',
       'Title_Mr', 'Title_Mrs', 'Name_Len', 'Dead_female_family',
       'Survive_male_family', 'IsChild', 'High_Survival_Ticket',
       'Low_Survival_Ticket', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'FamilySize_Alone', 'FamilySize_Big', 'FamilySize_Small',
       'Cabin_isNull', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'Low_Fare', 'High_Fare'],
      dtype='object')

In [19]:
features = combine_df.drop(["PassengerId","Survived"], axis=1).columns
le = LabelEncoder()
for feature in features:
    le = le.fit(combine_df[feature])
    combine_df[feature] = le.transform(combine_df[feature])

In [None]:
X_all = combine_df.iloc[:891,:].drop(["PassengerId","Survived"], axis=1)
Y_all = combine_df.iloc[:891,:]["Survived"]
X_test = combine_df.iloc[891:,:].drop(["PassengerId","Survived"], axis=1)

logreg = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier(n_neighbors = 3)
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=300,min_samples_leaf=4,class_weight={0:0.745,1:0.255})
gbdt = GradientBoostingClassifier(n_estimators=500,learning_rate=0.03,max_depth=3)
# xgb = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.03)
xgb = XGBClassifier(learning_rate=0.05, max_depth= 2, n_estimators= 280)
lgb = LGBMClassifier(max_depth=3, n_estimators=500, learning_rate=0.02)
clfs = [logreg, svc, knn, decision_tree, random_forest, gbdt, xgb, lgb]

kfold = 10
cv_results = []
for classifier in clfs :
      cv_results.append(cross_val_score(classifier, X_all.values, y = Y_all.values, scoring = "accuracy", cv = kfold))

#     cv_results.append(cross_val_score(classifier, X_all.values, y = Y_all.values, scoring = "accuracy", cv = kfold, n_jobs=-1))

cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

ag = ["LR","SVC",'KNN','decision_tree',"random_forest","GBDT","xgbGBDT", "LGB"]
cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,
                       "Algorithm":ag})

g = sns.barplot("CrossValMeans","Algorithm",data = cv_res, palette="Set3",orient = "h",**{'xerr':cv_std})
g.set_xlabel("Mean Accuracy")
g = g.set_title("Cross validation scores")

In [8]:
for i in range(7):
    print(ag[i],cv_means[i])
    
    
#     LR 0.8730793893996142
# SVC 0.8674489274770174
# KNN 0.8506829531267733
# decision_tree 0.8652142208602884
# random_forest 0.8620570877312451
# GBDT 0.8843408807172851
# xgbGBDT 0.8854267393031439

LR 0.8730793893996142
SVC 0.8674489274770174
KNN 0.8506829531267733
decision_tree 0.8652142208602884
random_forest 0.8620570877312451
GBDT 0.8843408807172851
xgbGBDT 0.8854267393031439


In [9]:
from sklearn.metrics import precision_score

class Bagging(object):
    
    def __init__(self,estimators):
        self.estimator_names = []
        self.estimators = []
        for i in estimators:
            self.estimator_names.append(i[0])
            self.estimators.append(i[1])
        self.clf = LogisticRegression()
    
    def fit(self, train_x, train_y):
        for i in self.estimators:
            i.fit(train_x,train_y)
        x = np.array([i.predict(train_x) for i in self.estimators]).T
        y = train_y
        self.clf.fit(x, y)
    
    def predict(self,x):
        x = np.array([i.predict(x) for i in self.estimators]).T
        #print(x)
        return self.clf.predict(x)
        
    
    def score(self,x,y):
        s = precision_score(y,self.predict(x))
        #print(s)
        return s

In [35]:

lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=300,min_samples_leaf=4,class_weight={0:0.745,1:0.255})
gbdt = GradientBoostingClassifier(n_estimators=500,learning_rate=0.03,max_depth=3)
xgbGBDT = XGBClassifier(max_depth=2, n_estimators=280, learning_rate=0.05)
clfs = [logreg, svc, knn, decision_tree, random_forest, gbdt, xgb]

bag = Bagging([('xgb',xgb),('lr',lr),('gbdt',gbdt), ("lgb", lgb),("xgbGBDT",xgbGBDT),("rf",rf)])

In [36]:
score = 0
for i in range(0,10):
    num_test = 0.20
    X_train, X_cv, Y_train, Y_cv = train_test_split(X_all.values, Y_all.values, test_size=num_test)
    bag.fit(X_train, Y_train)
    #Y_test = bag.predict(X_test)
    acc_xgb = round(bag.score(X_cv, Y_cv) * 100, 2)
    score+=acc_xgb
score/10

89.241

In [1]:
bag.fit(X_all.values, Y_all.values)
Y_test = bag.predict(X_test.values).astype(int)
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_test
    })
submission.to_csv('submission.csv', index=False)

In [17]:
X_train

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 1, 0],
       [0, 0, 1, ..., 1, 0, 1],
       ...,
       [0, 1, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### GridSearch on XGBoost model ( Parallel Computing)

In [19]:
# %%time

# params = {'max_depth':range(2, 20), 'n_estimators':range(10, 500, 10), 'learning_rate':[0.01,0.02,0.03,0.05, 0.1, 0.25, 0.5]}

# xgbc_best = XGBClassifier()

# gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)

# gs.fit(X_train, Y_train)


Fitting 5 folds for each of 6174 candidates, totalling 30870 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done 1033 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done 1383 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done 1833 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 2383 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 3033 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3783 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 5045 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 6174 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 7224 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 8374 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 10096 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done 11446 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 12896 tasks   

CPU times: user 1min 27s, sys: 4.48 s, total: 1min 31s
Wall time: 28min 12s


In [23]:
# print (gs.best_score_)
# print (gs.best_params_)
# # print (cross_val_score(gs, X_train, Y_train, cv=5).mean())
# # xgbc_best_y_predict = gs.predict(X_test)
# # rf_best_y_predict = clf.predict(X_test)

0.8820224719101124
{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 280}


In [25]:
# Y_test = gs.predict(X_test.values).astype(int)

In [34]:
# xgbc_best_submission = pd.DataFrame({'PassengerId': test_df['PassengerId'], 'Survived':Y_test})
# xgbc_best_submission.to_csv('xgbc_best_submission.csv', index=False)