In [1]:
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [7]:
# Split y
y_train = np.loadtxt('training_data.txt', skiprows = 1)[:, 0]

# split into in and out
kf = KFold(n_splits=3, shuffle=True, random_state=0)
in_index, out_index = list(kf.split(y_train))[0]
print("IN:", in_index, "OUT:", out_index)
y_in, y_out = y_train[in_index], y_train[out_index]
print(y_in.shape, y_out.shape)

# split into out1 and out2
out_index_1, out_index_2 = list(kf.split(y_out))[0]
print("OUT1:", out_index_1, "OUT2:", out_index_2)
y_out_1, y_out_2 = y_out[out_index_1], y_out[out_index_2]
print(y_out_1.shape, y_out_2.shape)

IN: [    0     1     2 ..., 19996 19997 19998] OUT: [    8     9    12 ..., 19990 19995 19999]
(13333,) (6667,)
OUT1: [   0    2    3 ..., 6663 6664 6666] OUT2: [   1    4    6 ..., 6660 6662 6665]
(4444,) (2223,)


In [9]:
# Load prediction files from different classifiers
N_CLF = 3
X_out1 = np.ndarray((4444, N_CLF))
X_out2 = np.ndarray((2223, N_CLF))
X_test = np.ndarray((10000, N_CLF))

In [10]:
# Neural net
X_out1[:, 0] = np.loadtxt("out1_neuralnet.csv", skiprows=1, delimiter=',')[:, 1]
X_out2[:, 0] = np.loadtxt("out2_neuralnet.csv", skiprows=1, delimiter=',')[:, 1]
X_test[:, 0] = np.loadtxt("test_neuralnet.csv", skiprows=1, delimiter=',')[:, 1]
# Logistic
# X_out1[:, 1] = np.loadtxt("out1_logic.csv", skiprows=1, delimiter=',')[:, 1]
# X_out2[:, 1] = np.loadtxt("out2_logic.csv", skiprows=1, delimiter=',')[:, 1]
# X_test[:, 1] = np.loadtxt("test_logic.csv", skiprows=1, delimiter=',')[:, 1]
# Gradient Boosting
X_out1[:, 1] = np.loadtxt("out1_gdboosting.csv", skiprows=1, delimiter=',')[:, 1]
X_out2[:, 1] = np.loadtxt("out2_gdboosting.csv", skiprows=1, delimiter=',')[:, 1]
X_test[:, 1] = np.loadtxt("test_gdboosting.csv", skiprows=1, delimiter=',')[:, 1]
# Random Forest
X_out1[:, 2] = np.loadtxt("out1_forest.csv", skiprows=1, delimiter=',')[:, 1]
X_out2[:, 2] = np.loadtxt("out2_forest.csv", skiprows=1, delimiter=',')[:, 1]
X_test[:, 2] = np.loadtxt("test_forest.csv", skiprows=1, delimiter=',')[:, 1]
# SVM
# X_train[:, 3] = np.loadtxt("svm_train_1225.csv", skiprows=1, delimiter=',')[:, 1]
# X_test[:, 3] = np.loadtxt("svm_test_1225.csv", skiprows=1, delimiter=',')[:, 1]

In [12]:
# Logistic regression on predictions
tuned_parameters = {'C': [0.001, 0.004, 0.006, 0.01, 0.05]}
clf = GridSearchCV(LogisticRegression(), tuned_parameters, scoring = 'accuracy', verbose=1)
clf.fit(X_out1,y_out_1)
print(clf.best_score_)
print(clf.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
0.848334833483
{'C': 0.004}


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    0.4s finished


In [13]:
clf = LogisticRegression(C=0.004)
clf.fit(X_out1, y_out_1)
# clf.predict(X_out2)
print(clf.score(X_out2, y_out_2))

0.853351327036


In [16]:
print(clf.coef_)

[[ 0.89229366  0.63260869  0.63240789]]


In [14]:
def make_submission_file(pred, filename):
    super_threshold_indices = pred >= 0.5
    pred.fill(0)
    pred[super_threshold_indices] = 1

    submission = [[i+1, int(pred[i])] for i in range(len(pred))]
    submission.insert(0, ['Id','Prediction'])
    with open(filename, 'w') as f:
        for line in submission:
            f.write(','.join(map(str, line)) + '\n')

In [15]:
pred = clf.predict(X_test)
make_submission_file(pred, "test_predictions_ensemble.csv")

In [56]:
# Predict the test file
pred = clf.predict(X_test)
super_threshold_indices = pred >= 0.5
pred.fill(0)
pred[super_threshold_indices] = 1

submission = [[i+1, int(pred[i])] for i in range(len(pred))]
submission.insert(0, ['Id','Prediction'])
with open('ensemble_test_prediction.csv', 'w') as f:
    for line in submission:
        f.write(','.join(map(str, line)) + '\n')

In [66]:
def mismatch(y1, y2):
    return np.sum(np.not_equal(y1, y2))
print(mismatch(pred, X_test[:, 0]))
print(mismatch(pred, X_test[:, 1]))
print(mismatch(pred, X_test[:, 2]))
print(mismatch(pred, X_test[:, 3]))


train_pred = clf.predict(X_train)
print(mismatch(train_pred, y_train))
print(mismatch(X_train[:, 2], y_train))

667
584
313
650
1890
1765
