In [1]:
import csv as csv

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score

In [3]:
pwd

'C:\\Users\\hemanthkumar.k'

In [4]:
# Read the training and test sets from the local ./data directory
# (paths are relative to the notebook's working directory).
train, test = pd.read_csv('./data/train.csv'), pd.read_csv('./data/test.csv')

In [5]:
# Columns fed to the model. 'male', 'embarked_*' and 'Pclass_*' do not exist in the
# raw data; they are created by the encoding cells further down.
features = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'male',
    'embarked_Q',
    'embarked_S',
    'Pclass_2',
    'Pclass_3',
]

In [6]:
# Build a label encoder and fit it on the training 'Sex' column only.
# LabelEncoder assigns integer codes to the sorted class labels, so presumably
# female = 0 and male = 1 (confirm with sex_encoder.classes_).
sex_encoder = preprocessing.LabelEncoder().fit(train['Sex'])

# Encode 'Sex' into a numeric 'male' column on both the training and the test data,
# using the same fitted encoder so the codes agree across the two sets
for frame in (train, test):
    frame['male'] = sex_encoder.transform(frame['Sex'])

In [7]:
# One-hot encode the Embarked training feature, dropping the first category
# to prevent perfect collinearity
train_embarked_dummied = pd.get_dummies(train["Embarked"], prefix='embarked', drop_first=True)

# One-hot encode the Embarked test feature, then align it to the training columns.
# Encoding train and test independently with drop_first=True can drop a different
# category (or produce different columns) when the two sets do not contain the same
# values; reindexing guarantees the test dummies match the training dummies exactly,
# filling categories that are absent from the test set with 0.
test_embarked_dummied = (
    pd.get_dummies(test["Embarked"], prefix='embarked')
    .reindex(columns=train_embarked_dummied.columns, fill_value=0)
)

# Concatenate the dataframe of dummies with the main dataframes
train = pd.concat([train, train_embarked_dummied], axis=1)
test = pd.concat([test, test_embarked_dummied], axis=1)

In [8]:
# One-hot encode the Pclass training feature, dropping the first category
# to prevent perfect collinearity
train_Pclass_dummied = pd.get_dummies(train["Pclass"], prefix='Pclass', drop_first=True)

# One-hot encode the Pclass test feature, then align it to the training columns.
# Encoding train and test independently with drop_first=True can drop a different
# category (or produce different columns) when the two sets do not contain the same
# values; reindexing guarantees the test dummies match the training dummies exactly,
# filling categories that are absent from the test set with 0.
test_Pclass_dummied = (
    pd.get_dummies(test["Pclass"], prefix='Pclass')
    .reindex(columns=train_Pclass_dummied.columns, fill_value=0)
)

# Concatenate the dataframe of dummies with the main dataframes
train = pd.concat([train, train_Pclass_dummied], axis=1)
test = pd.concat([test, test_Pclass_dummied], axis=1)

In [11]:
# Create a mean imputer for Age. SimpleImputer replaces preprocessing.Imputer, which
# was deprecated in scikit-learn 0.20 and removed in 0.22; it always imputes
# column-wise (the old axis=0) and takes np.nan rather than the string 'NaN'.
age_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the imputer on the training data only, so the fill value is the training mean
age_imputer.fit(train[['Age']])

# Apply the imputer to the training and test data; transform() returns an (n, 1)
# array, so ravel() flattens it before assigning it back as a single column
train['Age'] = age_imputer.transform(train[['Age']]).ravel()
test['Age'] = age_imputer.transform(test[['Age']]).ravel()

In [13]:
# Create a mean imputer for Fare. SimpleImputer replaces preprocessing.Imputer, which
# was deprecated in scikit-learn 0.20 and removed in 0.22; it always imputes
# column-wise (the old axis=0) and takes np.nan rather than the string 'NaN'.
fare_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Fit the imputer on the training data only, so the fill value is the training mean
fare_imputer.fit(train[['Fare']])

# Apply the imputer to the training and test data; transform() returns an (n, 1)
# array, so ravel() flattens it before assigning it back as a single column
train['Fare'] = fare_imputer.transform(train[['Fare']]).ravel()
test['Fare'] = fare_imputer.transform(test[['Fare']]).ravel()

In [18]:
# Candidate hyperparameter values for the grid search. Every combination is tried:
# 5 forest sizes x 2 split criteria x the odd max_features values below
# len(features) x 21 depth settings (unlimited plus 5..24).
parameter_grid = {
    'n_estimators': list(range(1, 5001, 1000)),
    'criterion': ['gini', 'entropy'],
    'max_features': list(range(1, len(features), 2)),
    'max_depth': [None] + list(range(5, 25, 1)),
}

# Base random forest: fixed seed for reproducibility, trees built on all cores (n_jobs=-1)
random_forest = RandomForestClassifier(random_state=0, n_jobs=-1)

# Grid search over the candidates using 2-fold cross validation (cv=2),
# evaluating candidates with 10 parallel jobs (n_jobs=10) and printing progress
clf = GridSearchCV(
    estimator=random_forest,
    param_grid=parameter_grid,
    cv=2,
    verbose=1,
    n_jobs=10,
)

In [None]:
# RandomForestClassifier has no `accurecy`/`accuracy` attribute, so accessing it raises
# AttributeError. Accuracy is estimated by the cross-validation cell below; here we only
# inspect the hyperparameters the base estimator was configured with.
random_forest.get_params()

In [19]:
# Nest the grid search in an outer 3-fold CV for model evaluation: each outer fold runs
# the full grid search on its training part and scores the refit best model on the
# held-out part. cv=3 is passed explicitly because the default number of folds in
# cross_val_score changed from 3 to 5 in scikit-learn 0.22.
cv_scores = cross_val_score(clf, train[features], train['Survived'], cv=3)

# Print results
print('Accuracy scores:', cv_scores)
print('Mean of score:', np.mean(cv_scores))
print('Variance of scores:', np.var(cv_scores))

Fitting 2 folds for each of 840 candidates, totalling 1680 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  6.4min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 14.3min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 25.7min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed: 42.1min
[Parallel(n_jobs=10)]: Done 1680 out of 1680 | elapsed: 57.4min finished


Fitting 2 folds for each of 840 candidates, totalling 1680 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  6.3min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 15.7min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 27.0min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed: 41.8min
[Parallel(n_jobs=10)]: Done 1680 out of 1680 | elapsed: 56.4min finished


Fitting 2 folds for each of 840 candidates, totalling 1680 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.3min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  6.0min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 13.7min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 24.6min
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed: 40.2min
[Parallel(n_jobs=10)]: Done 1680 out of 1680 | elapsed: 56.0min finished


Accuracy scores: [0.77104377 0.83501684 0.82154882]
Mean of score: 0.8092031425364757
Variance of scores: 0.0007583000474882243


In [None]:
# Run the grid search on the complete training set, then use the fitted search
# object to predict survival for every passenger in the test set.
# fit() returns the estimator itself, so the two steps can be chained.
predictions = clf.fit(train[features], train['Survived']).predict(test[features])

Fitting 2 folds for each of 840 candidates, totalling 1680 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  5.9min


In [None]:
# Display the predicted 'Survived' labels for the test set (output of clf.predict above)
predictions