In [1]:
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Load data into numpy array (numeric data only)
sex = lambda x: 0.0 if x == b"male" else 1.0  # male = 0, female = 1
embarked = lambda x: 0.0 if x == b'C' else 2.0 if x == b'Q' else 1.0 # C = 0, S = 1, Q = 2; fill missing embarked values with S
data = np.genfromtxt(open("../data/train.csv"), delimiter=',', skip_header=1, usecols=(1, 2, 5, 6, 7, 8, 10, 12), converters={5: sex, 12: embarked})

# Fill missing age and fare (testing data only) values with averages for each class
def fill_missing(data):
    class1 = data[:, 0] == 1
    class2 = data[:, 0] == 2
    class3 = data[:, 0] == 3

    avg_age1 = np.nanmean(data[class1, 2])
    avg_age2 = np.nanmean(data[class2, 2])
    avg_age3 = np.nanmean(data[class3, 2])
    avg_fare1 = np.nanmean(data[class1, 5])
    avg_fare2 = np.nanmean(data[class2, 5])
    avg_fare3 = np.nanmean(data[class3, 5])

    age_nans = np.isnan(data[:, 2])
    fare_nans = np.isnan(data[:, 5])

    data[age_nans & class1, 2] = avg_age1
    data[age_nans & class2, 2] = avg_age2
    data[age_nans & class3, 2] = avg_age3
    data[fare_nans & class1, 5] = avg_fare1
    data[fare_nans & class2, 5] = avg_fare2
    data[fare_nans & class3, 5] = avg_fare3

X = data[:, 1:]
y = data[:, 0]
fill_missing(X)

# Perform kfold of 5 splits
kf = StratifiedKFold(n_splits=5)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose some parameter combinations to try
parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(accuracy_score)

# Run the grid search
clf = GridSearchCV(RandomForestClassifier(), parameters, scoring=acc_scorer, n_jobs=7, cv=kf)
clf.fit(X, y)

print(clf.best_params_)

{'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 5, 'min_samples_split': 3, 'n_estimators': 9}


In [3]:
from sklearn.metrics import confusion_matrix

predictions = clf.predict(X)
tn, fp, fn, tp = confusion_matrix(y, predictions).ravel()
print("tn, fp, fn, tp")
print(tn, fp, fn, tp)
print(accuracy_score(y, predictions))

tn, fp, fn, tp
513 36 79 263
0.8709315375982043


In [6]:
for train_index, test_index in kf.split(X, y): # loop over each fold
    
    train_y = y[train_index]
    train_X = X[train_index]
    
    test_X = X[test_index]
    test_y = y[test_index]

    model = RandomForestClassifier(**clf.best_params_)
    model = model.fit(train_X, train_y)
    
    #print(model_tree.score(train_x, train_y))
    predictions = model.predict(test_X)
    
    conf_matrix = confusion_matrix(test_y, predictions)
    tn, fp, fn, tp = confusion_matrix(test_y, predictions).ravel()
    
    print(model.score(test_X, test_y))

    print('tn  fp  fn  tp')
    print((tn, fp, fn, tp))
    print()

0.776536312849162
tn  fp  fn  tp
(90, 20, 20, 49)

0.7988826815642458
tn  fp  fn  tp
(93, 17, 19, 50)

0.8314606741573034
tn  fp  fn  tp
(95, 15, 15, 53)

0.7921348314606742
tn  fp  fn  tp
(103, 7, 30, 38)

0.847457627118644
tn  fp  fn  tp
(102, 7, 20, 48)

