In [1]:
# Imports
import numpy as np
from sklearn import *
import mltools as ml
import time

In [2]:
X = np.genfromtxt('./X_train.txt', delimiter=',')
Y = np.genfromtxt('./Y_train.txt', delimiter=',')
X_test = np.genfromtxt('./X_test.txt', delimiter=',')
X,Y = ml.shuffleData(X,Y)
Xtr,Xva,Ytr,Yva = ml.splitData(X,Y,0.75)

In [3]:
#Logistic repressions
logistic_regression =linear_model.LogisticRegression(random_state=0, solver='liblinear')
logistic_regression.fit(Xtr, Ytr)
print(logistic_regression)
print("")

logistic_regression.predict(Xtr)
logistic_regression.predict_proba(Xtr)
print "Training error:", (1 - logistic_regression.score(Xtr, Ytr))

logistic_regression.predict(Xva)
logistic_regression.predict_proba(Xva)
print "Validation error:", (1 - logistic_regression.score(Xva, Yva))

logistic_regression_roc = metrics.roc_auc_score(Yva, logistic_regression.predict_proba(Xva)[:,1])
print("ROC: " + str(logistic_regression_roc))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

Training error: 0.41530447278606075
Validation error: 0.4288793103448276
ROC: 0.5940327975471522


In [4]:
# K-Nearest
knn = neighbors.KNeighborsClassifier(n_neighbors=16)
knn.fit(Xtr, Ytr)
knn_roc = metrics.roc_auc_score(Yva, knn.predict_proba(Xva)[:,1])
print(knn)
print("")
print("Training error: " + str(1 - knn.score(Xtr, Ytr)))
print("Validation error: " + str(1 - knn.score(Xva, Yva)))
print("ROC: " + str(knn_roc))
print("")

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=16, p=2,
           weights='uniform')

Training error: 0.38099514999101847
Validation error: 0.45905172413793105
ROC: 0.5520492660039023



In [5]:
# Random Forest
random_forest = ensemble.RandomForestClassifier(n_estimators=700, min_samples_leaf=30)
random_forest.fit(Xtr,Ytr)
random_forest_roc = metrics.roc_auc_score(Yva, random_forest.predict_proba(Xva)[:,1])
print(random_forest)
print("")
print("Training error: " + str(1 - random_forest.score(Xtr, Ytr)))
print("Validation error: " + str(1 - random_forest.score(Xva, Yva)))
print("ROC: " + str(random_forest_roc))
print("")

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=30, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Training error: 0.19328183941081367
Validation error: 0.3604525862068966
ROC: 0.7183708073957075



In [6]:
gradient_boost = ensemble.GradientBoostingClassifier(learning_rate=0.1, n_estimators=70, max_depth=3, max_leaf_nodes=8)
gradient_boost.fit(Xtr,Ytr)
gradient_boost_roc = metrics.roc_auc_score(Yva, gradient_boost.predict_proba(Xva)[:,1])
print(gradient_boost)
print("")
print("Training error: " + str(1-(gradient_boost.score(Xtr, Ytr))))
print("Validation error: " + str(1-(gradient_boost.score(Xva, Yva))))
print("Roc: " + str(gradient_boost_roc))

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=8,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=70,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

Training error: 0.25076342733968027
Validation error: 0.3410560344827587
Roc: 0.7282861423394964


Since given these parameter optimizations, with their validation it can be said that the Gradient Boosting algorithm would yield the best output, followed by Random Forest algorithm, Logistic regression, and K-Nearest Neighbor. Based on the ouputs we should add more weight to Gradient Boosting when combining the different classifers and less weight to K-Nearest Neighbors. 

In [9]:
weights = {gradient_boost: 72,
          random_forest: 19,
          logistic_regression: 5,
          knn: 4}
print("Weights: " + str(weights.values()))
yhat_list = []
for classifer, weight in weights.items():
    yhat = classifer.predict_proba(X_test)[:,1]
    for i in range(weight):
        yhat_list.append(yhat)
yhat_average = np.mean(np.array(yhat_list), axis=0)

Weights: [5, 4, 72, 19]


In [10]:
np.savetxt('Y_submit.txt',np.vstack( (np.arange(len(yhat_average)) , yhat_average) ).T,
           '%d, %.2f',header='Id,Predicted',comments='',delimiter=',');