## Kaggle - Titanic competition

### Topic: fast summary of best ideas to build a running model

* Models: RandomForest
* Ensembling: none
* Tuning: GridSearch CV
* CV: default 3-fold
* Inspriation: https://www.kaggle.com/zlatankr/titanic-random-forest-82-78 by Zlatan Kremonic

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Loading data
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
train = train.drop('PassengerId', axis=1)
train_y = train.pop('Survived')
test_id = test.pop('PassengerId')

In [3]:
# Preprocessing
def preprocess(df):
    # df = all_data
    df['Name_Length'] = df.Name.apply(len)
    df['Name_Title'] = df.Name.apply(lambda x: re.sub('(.*, )|(\..*)','',x))
    df['Age_Null_Flag'] = df.Age.apply(lambda x: 1 if pd.isnull(x) else 0)
    tmp = df.groupby(['Name_Title', 'Pclass']).Age
    df['Age'] = tmp.transform(lambda x: x.fillna(x.mean()))
    df.Age.fillna(df.Age.mean(), inplace=True)  # one record with NA remained, check why
    df['Fam_Size'] = np.where((df.SibSp + df.Parch) == 0 , 'Solo',
                     np.where((df.SibSp + df.Parch) <= 3,'Nuclear',
                               'Big'))
    df['Ticket_Letter'] = df.Ticket.apply(lambda x: str(x)[0])
    df['Ticket_Letter'] = np.where(
            df.Ticket_Letter.isin(['1', '2', '3', 'S', 'P', 'C', 'A']), df.Ticket_Letter,
                                   np.where(df.Ticket_Letter.isin(['W', '4', '7', '6', 'L', '5', '8']),
                                            'Low_ticket', 'Other_ticket'))
    df['Ticket_Len'] = df.Ticket.apply(len)
    df['Cabin_Letter'] = df.Cabin.apply(lambda x: str(x)[0])
    df['tmp'] = df.Cabin.apply(lambda x: str(x).split(' ')[-1][1:])
    df.tmp.replace('an', np.NaN, inplace = True)
    df['tmp'] = df.tmp.apply(lambda x: int(x) if not pd.isnull(x) and x != '' else np.NaN)
    df['Cabin_number'] = pd.qcut(df.tmp,3)
    df = pd.concat((df, pd.get_dummies(df.Cabin_number, prefix = 'Cabin_number')), axis = 1)
    df['Embarked'] = df.Embarked.fillna('S')
    df.Fare.fillna(df.Fare.mean(), inplace = True)
    object_columns = ['Sex', 'Embarked', 'Ticket_Letter', 'Cabin_Letter', 'Name_Title', 'Fam_Size']
    dummies = pd.get_dummies(df[object_columns])
    df = pd.concat([df, dummies], axis=1)
    df.drop(object_columns, axis=1, inplace=True)
    df.drop(['Name','SibSp','Parch','Ticket','Cabin','tmp','Cabin_number'], axis=1, inplace=True)
    return df
train_n = len(train)
all_data = train.append(test)
tmp = preprocess(all_data)
train = tmp[:train_n]
test = tmp[train_n:]

In [7]:
# Model tuning and fitting
rf = RandomForestClassifier(n_jobs=-1)
params = {"min_samples_leaf" : [1, 5, 10],
         "min_samples_split" : [2, 5, 10, 15],
         "n_estimators": [500, 800, 1200]}
gs = GridSearchCV(rf, params, n_jobs=-1)
gs.fit(train, train_y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_leaf': [1, 5, 10], 'n_estimators': [500, 800, 1200], 'min_samples_split': [2, 5, 10, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [8]:
# Results
print(gs.best_score_)
print(gs.best_params_)

0.842873176207
{'min_samples_leaf': 1, 'n_estimators': 1200, 'min_samples_split': 5}


In [9]:
# Making predictions
pred = gs.predict(test)
pred = pd.DataFrame({'PassengerId': test_id, 'Survived': pred})
pred.to_csv('submission.csv', index = False)