In [61]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [62]:
names = (['B1', 'B2', 'B3'] + ['C' + str(k) for k in range(1, 140)] + ['YEAR'] +
         ['C140', 'C141', 'C142'] + ['CT' + str(k) for k in range(1, 27)] +
         ['CH' + str(k) for k in range(1, 5)] + ['Class']
        )
feature_names = names[:-1]

In [63]:
dat = pd.read_csv('train.txt',  delimiter='\s+', names=names).dropna()

In [64]:
dat.head(5)

Unnamed: 0,B1,B2,B3,C1,C2,C3,C4,C5,C6,C7,...,CT22,CT23,CT24,CT25,CT26,CH1,CH2,CH3,CH4,Class
0,0,0,1,1.26,1.17,0.72,4.59,0.45,0.765,0.54,...,3,2,2,1,3,0,0,0,0,1
1,0,1,0,0.45,0.81,0.0,0.0,0.0,0.855,0.0,...,3,2,1,1,3,0,0,0,0,1
2,0,0,1,0.54,2.88,0.0,0.0,0.0,0.765,0.0,...,3,2,2,1,3,0,0,0,0,1
3,0,0,1,0.81,1.35,0.45,0.0,0.0,0.0,0.72,...,3,2,2,1,3,0,0,0,0,1
4,0,0,1,0.9,1.17,0.765,0.0,0.0,0.63,0.81,...,4,3,2,1,3,0,0,0,0,1


In [65]:
dat.shape

(12000, 177)

## Random forest grid search

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
import itertools as it

In [67]:
X = dat.drop(['Class'], axis=1); y = dat['Class']

In [68]:
X

Unnamed: 0,B1,B2,B3,C1,C2,C3,C4,C5,C6,C7,...,CT21,CT22,CT23,CT24,CT25,CT26,CH1,CH2,CH3,CH4
0,0,0,1,1.260,1.17,0.720,4.590,0.45,0.765,0.540,...,4,3,2,2,1,3,0,0,0,0
1,0,1,0,0.450,0.81,0.000,0.000,0.00,0.855,0.000,...,5,3,2,1,1,3,0,0,0,0
2,0,0,1,0.540,2.88,0.000,0.000,0.00,0.765,0.000,...,4,3,2,2,1,3,0,0,0,0
3,0,0,1,0.810,1.35,0.450,0.000,0.00,0.000,0.720,...,4,3,2,2,1,3,0,0,0,0
4,0,0,1,0.900,1.17,0.765,0.000,0.00,0.630,0.810,...,5,4,3,2,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0,1,0,0.450,1.17,0.630,1.215,0.45,0.990,0.540,...,4,3,2,2,1,3,0,0,0,0
11996,0,1,0,5.085,0.90,0.000,0.000,0.00,0.000,0.945,...,5,4,3,2,1,3,0,0,0,0
11997,0,0,1,1.170,1.71,0.630,0.000,0.00,0.720,0.000,...,4,3,2,1,1,3,0,0,0,0
11998,0,0,1,0.810,1.17,0.000,0.000,0.00,0.990,0.810,...,5,5,3,2,2,4,0,0,0,0


In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size=0.2, random_state=0)

In [79]:
np.sum(y_train == 1) / len(y_train)

0.475

In [80]:
np.sum(y_test == 1) / len(y_test)

0.4925

In [18]:
n_sample = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=1)
rf = RandomForestClassifier()
cross_val_score(rf, X, y.ravel(), cv=cv)

array([0.72166667, 0.71166667, 0.72416667, 0.73083333, 0.72833333])

In [9]:
kf = KFold(n_splits=5)

params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 50, None],
    'min_samples_split': [2, 4, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['auto', 'sqrt'],
}

allNames = sorted(params)
combinations = it.product(*(params[Name] for Name in allNames))
combi = list(combinations)
accs = []

for para in combi:
    dict_para = dict(zip(allNames, list(para)))
    dict_acc = dict_para.copy()
    clf = RandomForestClassifier(**dict_para)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores.append(score)
    mean_score = np.mean(scores)
    dict_acc.update({
        'acc': mean_score
    })
    accs.append(dict_acc)

In [10]:
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 20)


In [11]:
accs = pd.DataFrame(accs)

In [12]:
accs

Unnamed: 0,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,acc
0,10.0,auto,1,2,50,0.738417
1,10.0,auto,1,2,100,0.741917
2,10.0,auto,1,2,200,0.740167
3,10.0,auto,1,4,50,0.737167
4,10.0,auto,1,4,100,0.739917
...,...,...,...,...,...,...
211,,sqrt,5,4,100,0.735667
212,,sqrt,5,4,200,0.735750
213,,sqrt,5,10,50,0.734000
214,,sqrt,5,10,100,0.736250


In [13]:
accs.loc[accs['acc']==max(accs['acc'])]

Unnamed: 0,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,acc
10,10.0,auto,3,2,100,0.743667


In [18]:
kf = KFold(n_splits=5)

params = {
    'n_estimators': [120, 130, 140],
    'max_depth': [10, 12, 15, None],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [2, 3],
    'max_features': ['auto', 'sqrt'],
}

allNames = sorted(params)
combinations = it.product(*(params[Name] for Name in allNames))
combi = list(combinations)
accs = []

for para in combi:
    dict_para = dict(zip(allNames, list(para)))
    dict_acc = dict_para.copy()
    clf = RandomForestClassifier(**dict_para)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores.append(score)
    mean_score = np.mean(scores)
    dict_acc.update({
        'acc': mean_score
    })
    accs.append(dict_acc)

In [19]:
accs = pd.DataFrame(accs)

In [20]:
accs.loc[accs['acc']==max(accs['acc'])]

Unnamed: 0,max_depth,max_features,min_samples_leaf,min_samples_split,n_estimators,acc
11,10.0,auto,3,3,140,0.744583


## Test the final model

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size=0.2, random_state=0)

In [119]:
clf = RandomForestClassifier(max_depth=10, max_features='auto', min_samples_leaf=3, min_samples_split=3, n_estimators=140)

In [120]:
clf.fit(X_train, y_train)

RandomForestClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=3,
                       n_estimators=140)

In [121]:
clf.score(X_train, y_train)

0.8203125

In [122]:
clf.score(X_test, y_test)

0.7566666666666667