# <center> Random Forest Classifier
### Import Preliminaries

In [1]:
%matplotlib inline

# Import modules
import pandas as pd
import numpy as np

# Import data
train_df = pd.DataFrame(pd.read_csv('https://www.dropbox.com/s/1yzzkc3j45x2ejm/featured_train_df.csv?dl=1',index_col='PassengerId'))
test_df = pd.DataFrame(pd.read_csv('https://www.dropbox.com/s/jjdgukejd3n3whp/featured_test_df.csv?dl=1', index_col='PassengerId'))

# Set pandas options
pd.set_option('precision',8)
pd.set_option('max_columns',100)
pd.set_option('max_rows',100)

dfs = [train_df, test_df]

### DataFrame to Values

In [2]:
# Convert pandas dataframe into prediction values and data
y = train_df.Survived.values
X = train_df[train_df.columns[1:]].values

X_test = test_df.values

### Setup Model

In [3]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

### Randomized Search CV Setup

In [4]:
from sklearn.grid_search import RandomizedSearchCV

# Create the grid
grid = {'n_estimators':list(range(1,100)),
       'max_depth': list(range(1,50)),
       'min_samples_leaf': list(range(1,10))}

models = RandomizedSearchCV(estimator=model, 
                           param_distributions=grid,
                           scoring='accuracy',n_jobs=1,
                           refit=True, cv=10)



### Fit Models

In [5]:
models.fit(X,y)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,...7, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
          pre_dispatch='2*n_jobs', rando

### Greedy Results

In [6]:
print('Accuracy of best parameters: %.3f'%models.best_score_)
print('Best parameters: %s' %models.best_params_)

Accuracy of best parameters: 0.816
Best parameters: {'n_estimators': 62, 'min_samples_leaf': 2, 'max_depth': 35}


### Tuned Model

In [7]:
classifier = RandomForestClassifier(max_depth=11, 
                                    min_samples_leaf=2,
                                    n_estimators=58)
classifier.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=58, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

### Model Prediction

In [8]:
# create predictions dataframe
prediction = pd.DataFrame(test_df.index)
survived = pd.DataFrame(classifier.predict(X_test),
                          columns=['Survived'])
prediction = pd.concat([prediction, survived],axis=1)
prediction.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


### K-Fold Cross Validation

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

crossvalidation = KFold(10, random_state=1)
scores = cross_val_score(classifier, X, y, 
                scoring = 'accuracy',
                cv = crossvalidation, n_jobs =1)

print ('Folds: %i, accuracy: %.2f std: %.2f' 
% (len(scores),np.mean(np.abs(scores)),np.std(scores)))

NameError: name 'classifier' is not defined

In [16]:
(scores)

array([ 0.78651685,  0.78651685,  0.78651685,  0.85393258,  0.86516854,
        0.83146067,  0.80898876,  0.76404494,  0.84269663,  0.84090909])

### Export Results

In [10]:
prediction.to_csv('Submissions/titantic_predictions_rfc_randtuned.csv',index=False)

In [11]:
prediction.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


### Accuracy Results

<br> Feature Engineering 1 - Score: acc = 0.83, std: 0.05
<br> Feature Engineering 2 - Score: acc = 0.84, std: 0.05
<br> Feature Engineering 3 - Score: acc = 0.83, std: 0.05

### Kaggle Results

<br> Feature Engineering 1 - Score: 0.75119
<br> Feature Engineering 2 - Score: 0.70813
<br> Feature Engineering 3 - Score: 