In [35]:
import csv
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
def load_data(filename):
    """Load a comma-separated numeric file into a 2-D NumPy array.

    The first line of the file is treated as a header and skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        numpy.ndarray of floats holding every row after the header.
    """
    # Open the file in a context manager so the handle is closed as soon as
    # parsing finishes (np.loadtxt does not close a file object it was given).
    with open(filename) as csv_file:
        return np.loadtxt(csv_file, delimiter=',', skiprows=1)

In [16]:
def scale_scores(y_score):
    """Min-max normalise scores so the smallest maps to 0 and the largest to 1.

    Args:
        y_score: 1-D array-like of numeric scores.

    Returns:
        numpy.ndarray of floats with the same length as ``y_score``, equal to
        ``(y_score - min) / (max - min)``. An empty input yields an empty
        array; an input whose values are all equal yields all zeros (there is
        no spread to scale by).
    """
    scores = np.asarray(y_score, dtype=float)
    if scores.size == 0:
        return scores

    min_score = scores.min()
    spread = scores.max() - min_score
    if spread == 0:
        # All scores identical: avoid a 0/0 division.
        return np.zeros_like(scores)

    # Dividing by the spread only (never by the maximum itself) keeps the
    # result well defined when the largest score is exactly 0.
    return (scores - min_score) / spread

In [5]:
# Read the 2008 training CSV (load_data skips its header row)
data = load_data('train_2008.csv')
# Features are every column except the first and the last; the last column
# is the target. (The first column is presumably a row id -- confirm.)
X, y = data[:, 1:-1], data[:, -1]

In [28]:
# Dataset dimensions: number of rows and number of feature columns
N = len(y)  # 64667
D = X.shape[1]  # 381

# Split to train / test (validation) data: 70% train, 30% held out.
# A fixed random_state makes the shuffle -- and therefore every validation
# score computed from this split -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [7]:
# First 1000 rows of the training split, kept as a small subset
X_sample = X_train[:1000]
y_sample = y_train[:1000]

In [34]:
# Fit a 120-stage gradient-boosting regressor on the training split, then
# rank the held-out rows by its raw predictions and report ROC AUC.
gbr = GradientBoostingRegressor(n_estimators=120).fit(X_train, y_train)
y_score = gbr.predict(X_test)
roc_auc_score(y_test, y_score)

0.7798763085378984

In [10]:
## For submission:

# Refit on all labelled rows (no hold-out), using depth-5 trees
gbr = GradientBoostingRegressor(max_depth=5).fit(X, y)

# Read the real test file; everything after its first column is a feature
test_data = load_data('test_2008.csv')
X_test_real = test_data[:, 1:]

# Score the real test rows, then rescale to get rid of negative values
y_test_scores = scale_scores(gbr.predict(X_test_real))

In [23]:
# Logistic Regression (L1 penalty, C = 0.1), trained on all labelled rows.
# The solver is named explicitly: scikit-learn >= 0.22 defaults to 'lbfgs',
# which rejects penalty='l1'; 'liblinear' supports L1 and was the earlier
# default, so results on older versions are unchanged.
lgr = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
lgr.fit(X, y)
pred_prob = lgr.predict_proba(X_test_real)
# Column 1 of predict_proba is the probability of lgr.classes_[1]
y_scores_lgr = pred_prob[:, 1]



In [24]:
# Random Forest Regression: 100 trees, 26 candidate features per split,
# trained on all labelled rows.
# random_state fixes the bootstrap samples and feature draws so the
# predictions that go into the blended submission can be regenerated exactly.
regr = RandomForestRegressor(n_estimators=100, max_features=26, random_state=42)
regr.fit(X, y)
y_score_regr = regr.predict(X_test_real)

In [25]:
y_scores_gbr = gbr.predict(X_test_real)
y_average = 0.4 * y_scores_gbr + 0.3 * y_scores_lgr + 0.3 * y_score_regr
y_test_scores = scale_scores(y_average)

In [26]:
# Write the submission: a header row, then one (id, target) row per score,
# with ids counting up from 0.
# newline='' is what the csv module asks for on files handed to csv.writer;
# without it, platforms that translate '\n' end up with a blank line after
# every row.
with open('sub3.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(['id', 'target'])
    # Enumerate the scores themselves instead of a hard-coded row count, so
    # the file always has exactly as many rows as there are predictions.
    writer.writerows(enumerate(y_test_scores))

In [None]:
# Grid search over the number of boosting stages (5-fold CV, 4 parallel jobs).
# param_grid maps each parameter name to a *list* of candidate values.
# The search object is only constructed here; it is not fitted in this cell.
gbr = GradientBoostingRegressor()
param_grid = {'n_estimators': [50, 100, 150, 200, 400, 800, 1000]}
gbr = GridSearchCV(estimator=gbr, cv=5, param_grid=param_grid, n_jobs=4)

In [27]:
# Show each model's test-set scores, one array per line
print(y_scores_gbr, y_scores_lgr, y_score_regr, sep='\n')

[0.26947856 0.11066097 0.15139422 ... 0.19965127 0.04050552 0.2417453 ]
[0.28844685 0.09388635 0.15038621 ... 0.16143706 0.07628423 0.30016567]
[0.47 0.16 0.09 ... 0.3  0.07 0.29]


In [38]:
# Notes carried over from earlier searches:
#   n_estimators: 500 eliminated, 300 ideal
#   max_depth:    7 eliminated, 6 ideal
param_grid = {
    'n_estimators': [100, 150, 200, 300],
    'max_depth': [4, 5, 6],
}

# 5-fold cross-validated grid search on the training split, 4 parallel jobs
gbr = GridSearchCV(
    estimator=GradientBoostingRegressor(),
    cv=5,
    param_grid=param_grid,
    n_jobs=4,
).fit(X_train, y_train)

# ROC AUC of the fitted search's predictions on the held-out split,
# followed by the parameter combination the search selected
y_score_gbr = gbr.predict(X_test)
print(roc_auc_score(y_test, y_score_gbr))
print(gbr.best_params_)

0.7843227687801987
{'max_depth': 6}
