# <center> Random Forest Classifier
### Import Preliminaries

In [45]:
%matplotlib inline

# Import modules
import pandas as pd
import numpy as np

# Import data
train_df = pd.DataFrame(pd.read_csv('https://www.dropbox.com/s/1yzzkc3j45x2ejm/featured_train_df.csv?dl=1',index_col='PassengerId'))
test_df = pd.DataFrame(pd.read_csv('https://www.dropbox.com/s/jjdgukejd3n3whp/featured_test_df.csv?dl=1', index_col='PassengerId'))

# Set pandas options
pd.set_option('precision',8)
pd.set_option('max_columns',100)
pd.set_option('max_rows',100)

dfs = [train_df, test_df]

### DataFrame to Values

In [46]:
# Convert pandas dataframe into prediction values and data
y = train_df.Survived.values
X = train_df[train_df.columns[1:]].values

X_test = test_df.values

### Setup Model

In [47]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

### Greedy Search Setup

In [48]:
from sklearn.grid_search import GridSearchCV

# Create the grid
grid = {'n_estimators':list(range(30,40)),
       'max_depth': list(range(8,10)),
       'min_samples_leaf':list(range(1,2))}

models = GridSearchCV(estimator=model, 
                           param_grid=grid,
                           scoring='accuracy',n_jobs=1,
                           refit=True, cv=10)

### Fit Models

In [49]:
models.fit(X,y)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'max_depth': [8, 9], 'min_samples_leaf': [1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

### Greedy Results

In [50]:
print('Accuracy of best parameters: %.3f'%models.best_score_)
print('Best parameters: %s' %models.best_params_)

Accuracy of best parameters: 0.844
Best parameters: {'max_depth': 9, 'min_samples_leaf': 1, 'n_estimators': 38}


### Tuned Model

In [51]:
classifier = RandomForestClassifier(max_depth=15, 
                                    min_samples_leaf=2,
                                    n_estimators=54)
classifier.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=15, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=54, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Model Prediction

In [52]:
# create predictions dataframe
prediction = pd.DataFrame(test_df.index)
survived = pd.DataFrame(classifier.predict(X_test),
                          columns=['Survived'])
prediction = pd.concat([prediction, survived],axis=1)
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


### K-Fold Cross Validation

In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

crossvalidation = KFold(10, random_state=1)
scores = cross_val_score(classifier, X, y, 
                scoring = 'accuracy',
                cv = crossvalidation, n_jobs =1)

print ('Folds: %i, mean squared error: %.2f std: %.2f' 
% (len(scores),np.mean(np.abs(scores)),np.std(scores)))

Folds: 10, mean squared error: 0.83 std: 0.03


### Export Results

In [54]:
prediction.to_csv('Submissions/titantic_predictions_rfc_greedtuned.csv',index=False)

In [55]:
prediction.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


### Accuracy Results

<br> Feature Engineering 1 - Score: acc = 0.83, std: 0.05
<br> Feature Engineering 2 - Score: acc = 0.84, std: 0.05
<br> Feature Engineering 3 - Score: acc = 0.81, std: 0.06

### Kaggle Results

<br> Feature Engineering 1 - Score: 0.75119
<br> Feature Engineering 2 - Score: 0.70813
<br> Feature Engineering 3 - Score: 