# Titanic (2/2): Prediction
- This is the second kernel; it uses the output of the first one to make predictions.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


## Import Cleaned Data

In [2]:
# Switch for where the cleaned CSVs are read from: the '../input/...' dataset
# directory when IN_CLOUD is set, the local './data' folder otherwise.
IN_CLOUD = False
if IN_CLOUD:
    INPUT_DIR = '../input/titanic-1-2-exploration-pre-processing'
else:
    INPUT_DIR = './data'

In [3]:
# Load the cleaned training and test sets from INPUT_DIR.
train_path = f'{INPUT_DIR}/train_clean.csv'
test_path = f'{INPUT_DIR}/test_clean.csv'

train_clean_df = pd.read_csv(train_path)
test_clean_df = pd.read_csv(test_path)

In [4]:
# Show three randomly chosen training rows as a quick sanity check.
train_clean_df.sample(n=3)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,AgeCat,NbrRelatives,IsAlone,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Rare
494,0,3,1,21.0,8.05,1,0,True,0,0,1,0,0,1,0
797,1,3,0,31.0,8.6833,2,0,True,0,0,1,0,1,0,0
296,0,3,1,23.5,7.2292,2,0,True,1,0,0,0,0,1,0


#### Remove the passenger IDs from the test dataset (they are identifiers, not features). We will re-attach them after making our predictions:

In [5]:
# Keep the PassengerId column aside for the submission file, then remove it
# from the test frame so it is not passed to the model as a feature.
test_passenger_ids = test_clean_df['PassengerId']
test_clean_df = test_clean_df.drop(columns='PassengerId')

#### Do some Feature Selection and experiment with different features: 

In [6]:
# Columns excluded from the feature set; the same list is removed from both
# frames so train and test keep the same set of features.
drop_cols = ['NbrRelatives', 'Age']

train_clean_df = train_clean_df.drop(columns=drop_cols)
test_clean_df = test_clean_df.drop(columns=drop_cols)

In [7]:
# Preview three random training rows after the column drop.
train_clean_df.sample(n=3)

Unnamed: 0,Survived,Pclass,Sex,Fare,AgeCat,IsAlone,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Rare
666,0,2,1,13.0,2,True,0,0,1,0,0,1,0
316,1,2,0,26.0,2,False,0,0,1,0,1,0,0
268,1,1,0,153.4625,4,False,0,0,1,0,1,0,0


In [8]:
# Preview three random test rows after the column drop.
test_clean_df.sample(n=3)

Unnamed: 0,Pclass,Sex,Fare,IsAlone,AgeCat,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Rare
0,3,1,7.8292,True,2,0,1,0,0,0,1,0
294,3,1,9.5,True,3,0,0,1,0,0,1,0
409,3,0,13.775,False,0,0,0,1,0,1,0,0


## Modeling

In [9]:
# Separate the training frame into the feature matrix (everything except
# 'Survived') and the target vector ('Survived').
train_x = train_clean_df.drop(columns='Survived')
train_y = train_clean_df['Survived']

# Preview three random rows of the feature matrix.
train_x.sample(n=3)

Unnamed: 0,Pclass,Sex,Fare,AgeCat,IsAlone,Embarked_C,Embarked_Q,Embarked_S,Title_Master,Title_Miss,Title_Mr,Title_Rare
191,2,1,13.0,1,True,0,0,1,0,0,1,0
125,3,1,11.2417,0,False,1,0,0,1,0,0,0
544,1,1,106.425,4,False,1,0,0,0,0,1,0


#### Imports related to modeling:

In [10]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn import metrics

  from numpy.core.umath_tests import inner1d


In [11]:
# Alternative model candidates, left disabled (xgboost is not imported in this notebook):
#import xgboost as xgb
#dmat = xgb.DMatrix(data=train_x, label=train_y)
#model = AdaBoostClassifier(n_estimators=50, learning_rate=1)
#model = RandomForestClassifier()
#model = xgb.XGBClassifier(learning_rate=0.04, n_estimators=100, max_depth=4)

In [12]:
# Optional hyper-parameter search over a random forest. Disabled by default
# because it fits every grid combination five times (cv=5).
# NOTE: the next cell rebinds `model` to a hand-configured forest, so the best
# parameters printed here have to be copied over manually.
GRID_SEARCH = False
if GRID_SEARCH:
    param_grid = {
        'n_estimators': list(range(50, 400, 50)),
        # 'auto' was removed in scikit-learn 1.3 and, for classifiers, was an
        # alias of 'sqrt' (so the old grid fitted the same setting twice).
        # 'log2' keeps the grid size while exploring a genuinely different value.
        'max_features': ['sqrt', 'log2'],
        'max_depth': [4, 5, 6, 7],
        'criterion': ['gini', 'entropy'],
    }
    # Fixed seed so that differences between grid points come from the
    # parameters rather than from random bootstrap/feature sampling.
    rfc = RandomForestClassifier(random_state=42)
    model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)

    print('Fitting ... ')
    model.fit(train_x, train_y)
    print('Best Params: ', model.best_params_)
    print('CV results: ', model.cv_results_)


In [13]:
# Random forest with hand-picked hyper-parameters.
# - max_features='sqrt' is what the former 'auto' alias meant for classifiers;
#   'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# - random_state pins the forest's random sampling so the cross-validation
#   score and the later predictions are reproducible across kernel restarts.
model = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=5,
    max_features='sqrt',
    random_state=42,
)

CROSS_VAL = True
if CROSS_VAL:
    # 5-fold cross-validation on the training data; print the mean fold score.
    scores = cross_val_score(model, train_x, train_y, cv=5)
    print(scores.mean())

0.8260604790361825


In [14]:
# Predictions are skipped while a grid search is being run.
PREDICT = not GRID_SEARCH
if PREDICT:
    model.fit(train_x, train_y)
    # The test frame lists its columns in a different order than train_x
    # (IsAlone/AgeCat are swapped in the previews above). Older scikit-learn
    # matches features by position and would silently mix them up; recent
    # versions reject the mismatch. Re-order the test columns to the training
    # order; a missing column raises KeyError instead of being ignored.
    test_x = test_clean_df[train_x.columns]
    predictions = model.predict(test_x)

In [15]:
if PREDICT:
    # OUTPUT toggles writing the CSV; OUTPUT_DIR is the path prefix for it.
    OUTPUT = True
    OUTPUT_DIR = './data/' if not IN_CLOUD else ''

    # Pair the passenger ids set aside earlier with the predicted labels.
    submission_columns = {'PassengerId': test_passenger_ids, 'Survived': predictions}
    submission = pd.DataFrame(submission_columns)

    if OUTPUT:
        submission_path = f'{OUTPUT_DIR}submission.csv'
        submission.to_csv(submission_path, index=False)
        print('Done exporting !')

    # Show five random rows of the submission.
    print(submission.sample(5))

Done exporting !
     PassengerId  Survived
387         1279         0
37           929         0
172         1064         0
215         1107         0
401         1293         0
