In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")

In [2]:
names = (['B1', 'B2', 'B3'] + ['C' + str(k) for k in range(1, 140)] + ['YEAR'] +
         ['C140', 'C141', 'C142'] + ['CT' + str(k) for k in range(1, 27)] +
         ['CH' + str(k) for k in range(1, 5)] + ['Class']
        )
feature_names = names[:-1]

In [3]:
dat = pd.read_csv('train.txt',  delimiter='\s+', names=names).dropna()

In [4]:
dat.head(5)

Unnamed: 0,B1,B2,B3,C1,C2,C3,C4,C5,C6,C7,...,CT22,CT23,CT24,CT25,CT26,CH1,CH2,CH3,CH4,Class
0,0,0,1,1.26,1.17,0.72,4.59,0.45,0.765,0.54,...,3,2,2,1,3,0,0,0,0,1
1,0,1,0,0.45,0.81,0.0,0.0,0.0,0.855,0.0,...,3,2,1,1,3,0,0,0,0,1
2,0,0,1,0.54,2.88,0.0,0.0,0.0,0.765,0.0,...,3,2,2,1,3,0,0,0,0,1
3,0,0,1,0.81,1.35,0.45,0.0,0.0,0.0,0.72,...,3,2,2,1,3,0,0,0,0,1
4,0,0,1,0.9,1.17,0.765,0.0,0.0,0.63,0.81,...,4,3,2,1,3,0,0,0,0,1


In [5]:
dat.shape

(12000, 177)

## Clean data

In [6]:
B = [np.where(dat.loc[k, ['B1', 'B2', 'B3']]==1)[0][0] for k in range(dat.shape[0])]

In [7]:
dat_clean = dat.copy()

In [17]:
dat_clean['B'] = B

In [21]:
dat = dat_clean.drop(['CH2', 'CH4'], axis=1)

In [22]:
dat

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,CT21,CT22,CT23,CT24,CT25,CT26,CH1,CH3,Class,B
0,1.260,1.17,0.720,4.590,0.45,0.765,0.540,0.495,0.810,0.54,...,4,3,2,2,1,3,0,0,1,2
1,0.450,0.81,0.000,0.000,0.00,0.855,0.000,1.170,0.000,0.63,...,5,3,2,1,1,3,0,0,1,1
2,0.540,2.88,0.000,0.000,0.00,0.765,0.000,0.000,0.000,0.72,...,4,3,2,2,1,3,0,0,1,2
3,0.810,1.35,0.450,0.000,0.00,0.000,0.720,0.900,0.855,0.54,...,4,3,2,2,1,3,0,0,1,2
4,0.900,1.17,0.765,0.000,0.00,0.630,0.810,0.000,0.000,0.72,...,5,4,3,2,1,3,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.450,1.17,0.630,1.215,0.45,0.990,0.540,0.000,0.000,0.72,...,4,3,2,2,1,3,0,0,0,1
11996,5.085,0.90,0.000,0.000,0.00,0.000,0.945,0.000,0.000,0.00,...,5,4,3,2,1,3,0,0,1,1
11997,1.170,1.71,0.630,0.000,0.00,0.720,0.000,0.000,0.000,0.72,...,4,3,2,1,1,3,0,0,0,2
11998,0.810,1.17,0.000,0.000,0.00,0.990,0.810,0.000,0.000,0.00,...,5,5,3,2,2,4,0,0,1,2


In [121]:
X = dat[['B1', 'B2', 'B3']]; y = dat['Class']

In [34]:
rf = RandomForestClassifier()
rf.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [38]:
rf.predict([[1,0,0]])

array([0])

In [20]:
n_sample = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=1)
rf = RandomForestClassifier()
cross_val_score(rf, X, y.ravel(), cv=cv)

array([0.7       , 0.7325    , 0.70166667, 0.695     , 0.71      ])

In [25]:
y001 = y[(dat['B1']==0) & (dat['B2']==0) & (dat['B3']==1)]
np.sum(y001 == 1) / len(y001)

0.6473590201582036

In [26]:
y010 = y[(dat['B1']==0) & (dat['B2']==1) & (dat['B3']==0)]
np.sum(y010 == 1) / len(y010)

0.6255204506490326

In [122]:
y100 = y[(dat['B1']==1) & (dat['B2']==0) & (dat['B3']==0)]
np.sum(y100 == 1) / len(y100)

0.16283141570785392

Because the data in 001 and 010 are balance (class 1 takes 62, 64% respectively) and the data in 100 is not balance (class 1 16%). We treat the first two data sets together and treat the last separately.

In [None]:
dat = dat[dat['B3']]

## Random forest and grid search


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
import itertools as it

In [40]:
X = dat.drop(['Class'], axis=1); y = dat['Class']

In [41]:
X

Unnamed: 0,B1,B2,B3,C1,C2,C3,C4,C5,C6,C7,...,CT21,CT22,CT23,CT24,CT25,CT26,CH1,CH2,CH3,CH4
0,0,0,1,1.260,1.17,0.720,4.590,0.45,0.765,0.540,...,4,3,2,2,1,3,0,0,0,0
1,0,1,0,0.450,0.81,0.000,0.000,0.00,0.855,0.000,...,5,3,2,1,1,3,0,0,0,0
2,0,0,1,0.540,2.88,0.000,0.000,0.00,0.765,0.000,...,4,3,2,2,1,3,0,0,0,0
3,0,0,1,0.810,1.35,0.450,0.000,0.00,0.000,0.720,...,4,3,2,2,1,3,0,0,0,0
4,0,0,1,0.900,1.17,0.765,0.000,0.00,0.630,0.810,...,5,4,3,2,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0,1,0,0.450,1.17,0.630,1.215,0.45,0.990,0.540,...,4,3,2,2,1,3,0,0,0,0
11996,0,1,0,5.085,0.90,0.000,0.000,0.00,0.000,0.945,...,5,4,3,2,1,3,0,0,0,0
11997,0,0,1,1.170,1.71,0.630,0.000,0.00,0.720,0.000,...,4,3,2,1,1,3,0,0,0,0
11998,0,0,1,0.810,1.17,0.000,0.000,0.00,0.990,0.810,...,5,5,3,2,2,4,0,0,0,0


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y.ravel(), test_size=0.1, random_state=1)

In [43]:
np.sum(y_train == 1) / len(y_train)

0.4784259259259259

In [44]:
np.sum(y_test == 1) / len(y_test)

0.4791666666666667

In [18]:
n_sample = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=1)
rf = RandomForestClassifier()
cross_val_score(rf, X, y.ravel(), cv=cv)

array([0.72166667, 0.71166667, 0.72416667, 0.73083333, 0.72833333])

In [46]:
kf = KFold(n_splits=5)

params = {
    'n_estimators': [50, 80, 100, 500],
    'max_depth': [10, 50, None],
    'min_samples_split': [2, 4, 6],
    'max_features': ['auto', 'sqrt'],
}

allNames = sorted(params)
combinations = it.product(*(params[Name] for Name in allNames))
combi = list(combinations)
accs = []

for para in combi:
    dict_para = dict(zip(allNames, list(para)))
    dict_acc = dict_para.copy()
    clf = RandomForestClassifier(**dict_para)
    scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.values[train_index], X.values[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        scores.append(score)
    mean_score = np.mean(score)
    dict_acc.update({
        'acc': mean_score
    })
    accs.append(dict_acc)

In [59]:
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 20)


In [50]:
accs = pd.DataFrame(accs)

In [51]:
accs

Unnamed: 0,max_depth,max_features,min_samples_split,n_estimators,acc
0,10.0,auto,2,50,0.717083
1,10.0,auto,2,80,0.72
2,10.0,auto,2,100,0.72125
3,10.0,auto,2,500,0.719583
4,10.0,auto,4,50,0.721667
5,10.0,auto,4,80,0.714583
6,10.0,auto,4,100,0.7175
7,10.0,auto,4,500,0.715417
8,10.0,auto,6,50,0.721667
9,10.0,auto,6,80,0.72125


In [52]:
clf.predict()

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=6,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [60]:
X

Unnamed: 0,B1,B2,B3,C1,C2,C3,C4,C5,C6,C7,...,CT21,CT22,CT23,CT24,CT25,CT26,CH1,CH2,CH3,CH4
0,0,0,1,1.260,1.17,0.720,4.590,0.45,0.765,0.540,...,4,3,2,2,1,3,0,0,0,0
1,0,1,0,0.450,0.81,0.000,0.000,0.00,0.855,0.000,...,5,3,2,1,1,3,0,0,0,0
2,0,0,1,0.540,2.88,0.000,0.000,0.00,0.765,0.000,...,4,3,2,2,1,3,0,0,0,0
3,0,0,1,0.810,1.35,0.450,0.000,0.00,0.000,0.720,...,4,3,2,2,1,3,0,0,0,0
4,0,0,1,0.900,1.17,0.765,0.000,0.00,0.630,0.810,...,5,4,3,2,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0,1,0,0.450,1.17,0.630,1.215,0.45,0.990,0.540,...,4,3,2,2,1,3,0,0,0,0
11996,0,1,0,5.085,0.90,0.000,0.000,0.00,0.000,0.945,...,5,4,3,2,1,3,0,0,0,0
11997,0,0,1,1.170,1.71,0.630,0.000,0.00,0.720,0.000,...,4,3,2,1,1,3,0,0,0,0
11998,0,0,1,0.810,1.17,0.000,0.000,0.00,0.990,0.810,...,5,5,3,2,2,4,0,0,0,0


In [86]:
X001 = X.iloc[test_index]
y001 = y.iloc[test_index]
y001 = y001[(X001['B1']==0) & (X001['B2']==0) & (X001['B3']==1)]
X001 = X001[(X001['B1']==0) & (X001['B2']==0) & (X001['B3']==1)]

X010 = X.iloc[test_index]
y010 = y.iloc[test_index]
y010 = y010[(X010['B1']==0) & (X010['B2']==1) & (X010['B3']==0)]
X010 = X010[(X010['B1']==0) & (X010['B2']==1) & (X010['B3']==0)]

X100 = X.iloc[test_index]
y100 = y.iloc[test_index]
y100 = y100[(X100['B1']==1) & (X100['B2']==0) & (X100['B3']==0)]
X100 = X100[(X100['B1']==1) & (X100['B2']==0) & (X100['B3']==0)]

In [76]:
y001_pred = clf.predict(X001)

In [77]:
np.sum(y001_pred == y001) / len(y001)

0.6374549819927972

In [78]:
np.sum(y001 == 1) / len(y001)

0.6338535414165666

In [87]:
y100_pred = clf.predict(X100)
print(np.sum(y100_pred == y100) / len(y100))
print(np.sum(1 == y100) / len(y100))

0.8276797829036635
0.17639077340569878


In [100]:
print(dat['C2'].quantile(0.93))
print(dat['C2'].quantile(0.95))

31.22279999999984
80.31374999999952


In [92]:
dat['C1'].max()

347.58

## log transform

In [120]:
dat = pd.read_csv('train.txt',  delimiter='\s+', names=names).dropna()

In [112]:
dat_discrete = dat[['B1', 'B2', 'B3', 'YEAR']]

In [113]:
dat_cont = dat.drop(['B1', 'B2', 'B3', 'YEAR', 'Class'], axis=1)
y = dat['Class']

In [114]:
dat_cont = dat_cont.transform(lambda x: np.log(x+1))

In [116]:
X = np.concatenate([dat_discrete, dat_cont], axis=1)

In [117]:
n_sample = X.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.1, random_state=1)
rf = RandomForestClassifier()
cross_val_score(rf, X, y.ravel(), cv=cv)

array([0.72583333, 0.71583333, 0.735     , 0.7225    , 0.73      ])

In [119]:
dat_cont

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,CT21,CT22,CT23,CT24,CT25,CT26,CH1,CH2,CH3,CH4
0,0.815365,0.774727,0.542324,1.720979,0.371564,0.568151,0.431782,0.402126,0.593327,0.431782,...,1.609438,1.386294,1.098612,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.0
1,0.371564,0.593327,0.000000,0.000000,0.000000,0.617885,0.000000,0.774727,0.000000,0.488580,...,1.791759,1.386294,1.098612,0.693147,0.693147,1.386294,0.0,0.0,0.0,0.0
2,0.431782,1.355835,0.000000,0.000000,0.000000,0.568151,0.000000,0.000000,0.000000,0.542324,...,1.609438,1.386294,1.098612,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.0
3,0.593327,0.854415,0.371564,0.000000,0.000000,0.000000,0.542324,0.641854,0.617885,0.431782,...,1.609438,1.386294,1.098612,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.0
4,0.641854,0.774727,0.568151,0.000000,0.000000,0.488580,0.593327,0.000000,0.000000,0.542324,...,1.791759,1.609438,1.386294,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,0.371564,0.774727,0.488580,0.795252,0.371564,0.688135,0.431782,0.000000,0.000000,0.542324,...,1.609438,1.386294,1.098612,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.0
11996,1.805827,0.641854,0.000000,0.000000,0.000000,0.000000,0.665262,0.000000,0.000000,0.000000,...,1.791759,1.609438,1.386294,1.098612,0.693147,1.386294,0.0,0.0,0.0,0.0
11997,0.774727,0.996949,0.488580,0.000000,0.000000,0.542324,0.000000,0.000000,0.000000,0.542324,...,1.609438,1.386294,1.098612,0.693147,0.693147,1.386294,0.0,0.0,0.0,0.0
11998,0.593327,0.774727,0.000000,0.000000,0.000000,0.688135,0.593327,0.000000,0.000000,0.000000,...,1.791759,1.791759,1.386294,1.098612,1.098612,1.609438,0.0,0.0,0.0,0.0
