In [1]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import KFold

# Put NumPy's global random number generator into a fixed, known state.
np.random.seed(seed=1)

In [2]:
def E(y, predictions):
    """Return the classification error: the fraction of mismatched entries.

    Parameters
    ----------
    y : array-like
        Reference labels.
    predictions : array-like
        Predicted labels; must have exactly the same shape as ``y``.

    Returns
    -------
    numpy.float64
        Number of positions where ``y`` and ``predictions`` differ,
        divided by ``len(y)``.

    Raises
    ------
    ValueError
        If the two inputs have different shapes, or if they are empty.
    """
    y = np.asarray(y)
    predictions = np.asarray(predictions)

    # Without this check np.not_equal would broadcast, e.g. (n,) against
    # (n, 1) becomes an (n, n) comparison, and the returned "error rate"
    # would be meaningless (it can even exceed 1).
    if y.shape != predictions.shape:
        raise ValueError(
            "y and predictions must have the same shape, got "
            "{} and {}".format(y.shape, predictions.shape)
        )

    # An empty input would otherwise evaluate 0 / 0 and return nan with
    # only a RuntimeWarning.
    if y.size == 0:
        raise ValueError("cannot compute an error rate for empty input")

    return np.sum(np.not_equal(y, predictions)) / len(y)

In [4]:
# Read the training file with NumPy, skipping its first line.
Data_train = np.loadtxt('training_data.txt', skiprows=1)

# Use at most the first 20000 rows: column 0 becomes the target vector,
# every remaining column becomes a feature.
y_train = Data_train[:20000, 0]
X_train = Data_train[:20000, 1:]

# The test file is read the same way and used as-is as the feature matrix.
X_test = np.loadtxt('test_data.txt', skiprows=1)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)


# Standardize every feature column to zero mean and unit variance.
# The statistics are computed from X_train only and then re-used for X_test,
# so both matrices are scaled with exactly the same parameters.
# NOTE(review): this runs before X_train is split into in/out folds, so the
# held-out rows contribute to the mean/std used for scaling.
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0)

# A constant column has std == 0; dividing by it would fill that column with
# NaN/inf in both matrices. Scale such columns by 1 instead, which leaves
# them as (value - mean), i.e. all zeros in the training data.
feature_std[feature_std == 0] = 1.0

# In-place arithmetic updates the existing arrays (as the original
# column-by-column assignment did) instead of allocating full-size copies.
X_train -= feature_mean
X_train /= feature_std
X_test -= feature_mean
X_test /= feature_std

print('normalized')

# First split: take the first of three shuffled folds. The larger part
# (in_index) is used for fitting, the smaller part (out_index) is held out.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
in_index, out_index = next(kf.split(X_train))
print("IN:", in_index, "OUT:", out_index)
X_in, X_out = X_train[in_index], X_train[out_index]
y_in, y_out = y_train[in_index], y_train[out_index]
print(X_in.shape, X_out.shape)

# Second split: divide the held-out part again with the same splitter.
# These indices are positions within X_out / y_out, not within X_train.
out_index_1, out_index_2 = next(kf.split(X_out))
print("OUT1:", out_index_1, "OUT2:", out_index_2)
X_out_1, X_out_2 = X_out[out_index_1], X_out[out_index_2]
# The second target is y_out_2. Assigning it to y_out (as before) overwrote
# the labels that belong to X_out and left y_out_2 undefined.
y_out_1, y_out_2 = y_out[out_index_1], y_out[out_index_2]
print(X_out_1.shape, X_out_2.shape)

X_train shape: (20000, 1000)
y_train shape: (20000,)
normalized
IN: [    0     1     2 ..., 19996 19997 19998] OUT: [    8     9    12 ..., 19990 19995 19999]
(13333, 1000) (6667, 1000)
OUT1: [   0    2    3 ..., 6663 6664 6666] OUT2: [   1    4    6 ..., 6660 6662 6665]
(4444, 1000) (2223, 1000)


normalized


In [10]:
# Grid search over a single candidate: a random forest with 1000 trees and
# a maximum depth of 50, scored by accuracy and fitted on the "in" split.
tuned_parameters = dict(n_estimators=[1000], max_depth=[50])
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=tuned_parameters,
    scoring='accuracy',
)
clf.fit(X_in, y_in)

# Report the best score and the parameter combination that achieved it.
print(clf.best_score_)
print(clf.best_params_)

0.82449561239
{'max_depth': 50, 'n_estimators': 1000}


In [11]:
# Predict labels for the first held-out subset and save them as an
# "Id,Prediction" CSV in which ids start at 1.
prediction = clf.predict(X_out_1)
submission = [['Id', 'Prediction']]
submission.extend(
    [row_id, int(label)] for row_id, label in enumerate(prediction, start=1)
)
with open('forest_1014pm_prediction_training.csv', 'w') as f:
    f.writelines(','.join(map(str, line)) + '\n' for line in submission)

In [15]:
# Predict labels for the test matrix and save them as an "Id,Prediction"
# CSV: a header row followed by one row per prediction, ids starting at 1.
pred = clf.predict(X_test)
submission = [['Id', 'Prediction']] + [
    [row_id, int(label)] for row_id, label in enumerate(pred, 1)
]
with open('forest_1014pm_prediction_testing.csv', 'w') as f:
    for line in submission:
        f.write(','.join(str(field) for field in line))
        f.write('\n')