# Titanic Survival Prediction

#### Imports

In [197]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

#### Load the training data

In [231]:
# Read the labelled training set; the path is relative to the notebook's working directory.
df_train = pd.read_csv("data/train.csv")

In [232]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Features

In [233]:
def features(df):
    """Turn a raw passenger frame into the numeric frame fed to the model.

    The identifier and free-text columns are discarded, ``Sex`` is encoded as
    a float flag (1.0 for 'female', 0.0 for any other value) and every
    remaining missing value is replaced with -1.0. Columns not listed for
    removal (including ``Survived`` when present) are carried through.

    The caller's frame is not modified; a new frame is returned.
    """
    unused_columns = ['PassengerId','Name','Ticket','Cabin','Embarked']

    def _female_flag(value):
        # Anything that is not exactly 'female' (including missing) maps to 0.0.
        if value == 'female':
            return 1.0
        return 0.0

    # drop() hands back a new frame, so the column assignment below does not
    # touch the frame that was passed in.
    numeric = df.drop(unused_columns, axis=1)
    numeric["Sex"] = numeric["Sex"].apply(_female_flag)
    return numeric.fillna(-1.0)

from sklearn.model_selection import train_test_split
def get_train_test_data(df, test_size=0.3, random_state=None):
    """Split a labelled passenger frame into training and hold-out sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame that still contains the ``Survived`` label column.
    test_size : float or int, default 0.3
        Forwarded to ``train_test_split`` to size the hold-out set.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to obtain the same
        split on every run; the default leaves the split unseeded, as before.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)``. The ``X_*`` frames have the
        ``Survived`` column removed; the ``y_*`` values are single-column
        ``Survived`` frames. Note the order differs from scikit-learn's
        ``X_train, X_test, y_train, y_test`` convention.
    """
    feat = features(df)
    train, test = train_test_split(
        feat, test_size=test_size, random_state=random_state
    )

    def _split_xy(part):
        # Double brackets keep the label as a one-column DataFrame, which is
        # the shape the rest of the notebook passes around.
        return part.drop('Survived', axis=1), part[['Survived']]

    X_train, y_train = _split_xy(train)
    X_test, y_test = _split_xy(test)
    return X_train, y_train, X_test, y_test

def get_test_data(df):
    """Return the model-input features for an unlabelled frame.

    This simply delegates to ``features``; nothing is split off or removed
    beyond what ``features`` itself does.
    """
    return features(df)

# Split once at notebook level. The split is not seeded here, so which rows land
# in each part (and therefore the accuracies further down) vary between runs.
X_train, y_train, X_test, y_test = get_train_test_data(df_train)

In [234]:
len(X_test),len(X_train)

(268, 623)

In [235]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
263,1,0.0,40.0,0,0,0.0
66,2,1.0,29.0,0,0,10.5
824,3,0.0,2.0,4,1,39.6875
587,1,0.0,60.0,1,1,79.2
627,1,1.0,21.0,0,0,77.9583


### Gradient Boosted Decision Trees

In [236]:
class GBT:
    """Small wrapper around ``GradientBoostingClassifier``.

    The wrapper stores a training set and a hold-out set, fits the classifier
    on the former and reports accuracy on either. Labels may be given as a
    one-column DataFrame, a Series or any array-like; they are flattened to
    one dimension before being handed to scikit-learn.
    """

    # Fitted estimator; stays None until train() has been called.
    clf = None
    # Training features and labels, set via set_train_data().
    X_train = None
    y_train = None

    # Hold-out features and labels, set via set_test_data().
    X_test = None
    y_test = None

    def set_train_data(self, X, y):
        """Store the features ``X`` and labels ``y`` used by ``train``."""
        self.X_train = X
        self.y_train = y

    def set_test_data(self, X, y):
        """Store the hold-out features ``X`` and labels ``y``."""
        self.X_test = X
        self.y_test = y

    @staticmethod
    def __format_y(y):
        """Flatten labels to the 1-D array scikit-learn expects."""
        return np.asarray(y).reshape(-1)

    def train(self, n_estimators=200, max_depth=3, random_state=None):
        """Fit a fresh classifier on the stored training data.

        Parameters
        ----------
        n_estimators : int, default 200
            Number of boosting stages.
        max_depth : int, default 3
            Maximum depth of each individual tree.
        random_state : int or None, default None
            Seed forwarded to the classifier; None keeps it unseeded.

        Returns
        -------
        GradientBoostingClassifier
            The fitted estimator (also stored on ``self.clf``).
        """
        self.clf = GradientBoostingClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=random_state,
        )
        self.clf.fit(self.X_train, self.__format_y(self.y_train))
        return self.clf

    def train_accuracy(self):
        """Return the mean accuracy on the stored training data."""
        # Use the labels stored on the instance, not a notebook-level global,
        # so the score always matches self.X_train.
        return self.clf.score(self.X_train, self.__format_y(self.y_train))

    def test_accuracy(self):
        """Return the mean accuracy on the stored hold-out data."""
        # Same reasoning as train_accuracy: score against self.y_test.
        return self.clf.score(self.X_test, self.__format_y(self.y_test))

    def predict(self, X_test):
        """Return the fitted classifier's predictions for ``X_test``."""
        return self.clf.predict(X_test)


In [237]:
# Attach both splits to the wrapper, then fit on the training split only.
gbt = GBT()
gbt.set_train_data(X_train, y_train)
gbt.set_test_data(X_test, y_test)
gbt.train()  # returns the fitted estimator, hence the repr shown as cell output


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [238]:
gbt.train_accuracy()

0.9293739967897271

In [239]:
gbt.test_accuracy()

0.82089552238805974

### Score the test set and write the submission file

In [283]:
class Scoring():
    """Score the unlabelled test file and write a submission CSV.

    Intended call order: ``read_test_file`` -> ``gen_features`` -> ``score``
    -> ``write_to_file`` (optionally ``read_back`` to inspect the result).

    Parameters
    ----------
    test_path : str, default "data/test.csv"
        CSV file holding the passengers to score.
    submit_path : str, default "data/submit.csv"
        Destination of the submission CSV.
    """

    # Raw test frame as read from disk; None until read_test_file() runs.
    df_test=None
    # Model-input features derived from df_test; None until gen_features() runs.
    X_test = None
    # Two-column submission frame; None until score() runs.
    df_submit = None

    def __init__(self, test_path="data/test.csv", submit_path="data/submit.csv"):
        self.test_path = test_path
        self.submit_path = submit_path

    def read_test_file(self):
        """Load the test CSV into ``self.df_test`` and return it."""
        self.df_test = pd.read_csv(self.test_path)
        return self.df_test

    def gen_features(self):
        """Derive ``self.X_test`` from ``self.df_test`` and return it."""
        self.X_test = get_test_data(self.df_test)
        return self.X_test

    def score(self, model=None):
        """Predict survival for every test passenger.

        Parameters
        ----------
        model : object with a ``predict`` method, optional
            Estimator used for prediction. When omitted, the notebook-level
            ``gbt`` object is used, which matches the original behaviour.

        Returns
        -------
        pandas.DataFrame
            Frame with columns ``PassengerId`` and ``Survived``; it is also
            stored on ``self.df_submit``.
        """
        estimator = gbt if model is None else model
        pred = estimator.predict(self.X_test)
        # Take the ids from this instance's own frame rather than from a
        # notebook global, so they always line up with the rows just scored.
        self.df_submit = pd.DataFrame({
            'PassengerId': self.df_test['PassengerId'],
            'Survived': pred,
        })
        return self.df_submit

    def write_to_file(self):
        """Write ``self.df_submit`` to the submission path without the index."""
        self.df_submit.to_csv(self.submit_path,index=False)

    def read_back(self):
        """Re-read the submission CSV from disk and return it."""
        return pd.read_csv(self.submit_path)
    

In [284]:
# Load the unlabelled test file and preview its first rows.
scr = Scoring()
scr.read_test_file().head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [285]:
scr.gen_features().head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,0.0,34.5,0,0,7.8292
1,3,1.0,47.0,1,0,7.0
2,2,0.0,62.0,0,0,9.6875
3,3,0.0,27.0,0,0,8.6625
4,3,1.0,22.0,1,1,12.2875


In [286]:
scr.score().head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,1


In [287]:
scr.write_to_file()

In [289]:
scr.read_back().head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,0
4,896,1
