In [1]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training and test matrices from plain-text files.
# The first row of each file is skipped (skiprows=1). In the training file,
# column 0 is taken as the label and the remaining columns as the features.
Data_train = np.loadtxt('training_data.txt', skiprows = 1)
X_train = Data_train[:, 1:]
y_train = Data_train[:, 0]
X_test = np.loadtxt("test_data.txt", skiprows = 1)
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)

# Standardize every feature column with statistics computed on the training
# set only; the identical shift/scale is then applied to the test set so both
# live in the same feature space.
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0)
# A constant training column has std == 0; dividing by it would produce NaN
# (0/0) in both matrices. Use a unit scale there, which maps the column to 0
# in the training set and leaves test values merely mean-centered.
feature_std[feature_std == 0] = 1.0
# New arrays are created here (no in-place writes), so Data_train keeps the
# raw, un-normalized values.
X_train = (X_train - feature_mean) / feature_std
X_test = (X_test - feature_mean) / feature_std

X_train shape: (20000, 1000)
y_train shape: (20000,)
X_test shape: (10000, 1000)


In [3]:
from sklearn.model_selection import KFold

# Shuffled 3-fold splitter with a fixed seed. Only the FIRST of the three
# splits is used: its training part becomes the "in" set and its held-out
# part becomes the "out" set.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
in_index, out_index = next(iter(kf.split(X_train)))
print("IN:", in_index, "OUT:", out_index)

# Materialize the feature/label subsets for both partitions.
X_in = X_train[in_index]
X_out = X_train[out_index]
y_in = y_train[in_index]
y_out = y_train[out_index]
print(X_in.shape, X_out.shape)

IN: [    0     1     2 ..., 19996 19997 19998] OUT: [    8     9    12 ..., 19990 19995 19999]
(13333, 1000) (6667, 1000)


In [4]:
# Split the held-out "out" partition once more with the same 3-fold splitter,
# again keeping only the first split: its training part becomes out_1 and its
# held-out part becomes out_2.
out_index_1, out_index_2 = next(iter(kf.split(X_out)))
print("OUT1:", out_index_1, "OUT2:", out_index_2)
X_out_1, X_out_2 = X_out[out_index_1], X_out[out_index_2]
# The second label subset is bound to y_out_2 (not back onto y_out): rebinding
# y_out would leave y_out_2 undefined, lose the full "out" labels, and make
# this cell fail when re-run because the indices refer to the original y_out.
y_out_1, y_out_2 = y_out[out_index_1], y_out[out_index_2]
print(X_out_1.shape, X_out_2.shape)

OUT1: [   0    2    3 ..., 6663 6664 6666] OUT2: [   1    4    6 ..., 6660 6662 6665]
(4444, 1000) (2223, 1000)


In [15]:
# Coarse sweep over the number of boosting stages; every candidate is scored
# by cross-validated accuracy on the full training set.
tuned_parameters = {'n_estimators': [30, 50, 100]}
clf = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=tuned_parameters,
    scoring='accuracy',
    verbose=1,
)
clf.fit(X_train, y_train)

# Report the best mean CV accuracy, then the setting that achieved it.
print(clf.best_score_, clf.best_params_, sep='\n')

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  7.8min finished


0.7986
{'n_estimators': 100}


In [16]:
# Second sweep: try larger ensembles (200 and 500 boosting stages), scored by
# cross-validated accuracy on the full training set.
tuned_parameters = {'n_estimators': [200, 500]}
clf = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=tuned_parameters,
    scoring='accuracy',
    verbose=1,
)
clf.fit(X_train, y_train)

# Report the best mean CV accuracy, then the setting that achieved it.
print(clf.best_score_, clf.best_params_, sep='\n')

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 29.8min finished


0.8376
{'n_estimators': 500}


In [None]:
# Single-candidate search: cross-validated accuracy for 1000 boosting stages
# on the full training set.
tuned_parameters = {'n_estimators': [1000]}
clf = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=tuned_parameters,
    scoring='accuracy',
    verbose=1,
)
clf.fit(X_train, y_train)

# Report the mean CV accuracy, then the (only) parameter setting.
print(clf.best_score_, clf.best_params_, sep='\n')

Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [7]:
# Fit a 500-stage gradient boosting model on the "in" partition only, so the
# held-out "out" rows stay unseen by this model.
clf = GradientBoostingClassifier(verbose=0, n_estimators=500)
# Kept as the cell's last expression so the fitted estimator is displayed.
clf.fit(X_in, y_in)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [9]:
def make_submission_file(pred, filename):
    """Write binary predictions to a CSV submission file.

    Each prediction is thresholded at 0.5 (values >= 0.5 become 1, everything
    else becomes 0) and written as one ``Id,Prediction`` row, with ids
    starting at 1 and following the order of ``pred``. The first line of the
    file is the header ``Id,Prediction``.

    Parameters
    ----------
    pred : array-like
        One score or label per sample. The input is only read; it is never
        modified, so the caller's array keeps its original values.
    filename : str
        Path of the CSV file to create (an existing file is overwritten).
    """
    # Build a boolean mask instead of overwriting `pred` in place, so that
    # probabilities passed by the caller are not destroyed as a side effect.
    is_positive = np.asarray(pred) >= 0.5

    # Assemble every line before opening the file so that a conversion error
    # cannot leave a half-written submission behind.
    lines = ['Id,Prediction']
    lines.extend(
        '{},{}'.format(row_id, int(flag))
        for row_id, flag in enumerate(is_positive, start=1)
    )
    with open(filename, 'w') as f:
        f.write('\n'.join(lines) + '\n')

In [10]:
# Predict both held-out sub-partitions with the current model, then write one
# submission-format CSV per sub-partition.
pred1, pred2 = clf.predict(X_out_1), clf.predict(X_out_2)
make_submission_file(pred=pred1, filename="out1_gdboosting.csv")
make_submission_file(pred=pred2, filename="out2_gdboosting.csv")

In [None]:
# Train on the entire training dataset
clf = GradientBoostingClassifier(n_estimators=500, verbose=0)
clf.fit(X_train, y_train)

In [None]:
# Predict test file
pred = clf.predict(X_test)
make_submission_file(pred, "test_gdboosting.csv")