In [2]:
import numpy as np
import math as m
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

In [3]:
def makeSubmission(preds, start_id=16384):
    """Build the two-column submission frame (``id``, ``Y``) from predictions.

    Parameters
    ----------
    preds : array-like
        Predicted values, one per test row, in test-file order.
    start_id : int, optional
        Id given to the first prediction; later rows count up by one. The
        default of 16384 reproduces the previously hard-coded id range
        (16384..32768) when ``preds`` holds 16385 values.

    Returns
    -------
    pandas.DataFrame
        Frame with an int32 ``id`` column followed by a ``Y`` column.
    """
    y_hat = pd.DataFrame(preds, columns=['Y'])
    # Size the id column from the predictions themselves: a fixed-length id
    # range would leave NaN-padded rows after the concat whenever the number
    # of predictions differs from it.
    ids = np.arange(start_id, start_id + len(y_hat), 1)
    id_col = pd.DataFrame(ids, columns=['id'], dtype='int32')
    return pd.concat([id_col, y_hat], axis=1)

In [3]:
# Feature groups removed before training (named 'garbage' and 'dups' by the
# author; the reason for each group is not recorded in this notebook).
garbage = ['F25', 'F26', 'F27']
dups = ['F12', 'F13', 'F17', 'F22', 'F24']

# Read the training set and strip the id column plus both feature groups in a
# single chain, leaving the label 'Y' and the remaining features in `data`.
data = (
    pd.read_csv('train.csv')
    .drop(['id'], axis=1)
    .drop(garbage, axis=1)
    .drop(dups, axis=1)
)

In [4]:
# Wrap the training frame for xgboost: every column except 'Y' is a feature,
# and 'Y' itself is the label.
dtrain = xgb.DMatrix(data.drop(['Y'], axis=1), label=data['Y'])

In [26]:
depths = [6, 7, 8, 9, 10]
num_round = 500
bsts = []

# Train one booster per tree depth; bsts[k] is the model for depths[k].
for depth in depths:
    # NOTE: 'early_stopping_rounds' used to be listed in this dict. It is an
    # argument of xgb.train() (and requires an `evals` set), not a booster
    # parameter, so it had no effect here. It is removed rather than wired up
    # because this notebook holds out no validation split to stop on.
    param = {'max_depth': depth, 'eta': 0.7, 'gamma': 0.8, 'silent': 1,
             'objective': 'binary:logistic'}
    bsts.append(xgb.train(param, dtrain, num_round))

In [28]:
# The test set is the same for every model, so read and prepare it once
# instead of re-reading the CSV on each iteration.
test = pd.read_csv('test.csv')
# Keep exactly the columns the boosters were trained on (everything in `data`
# except the label), in the same order. Training dropped the 'garbage'
# features as well as the duplicates, whereas this cell previously removed
# only the duplicates from the test frame.
test = test[data.columns.drop('Y')]
dtest = xgb.DMatrix(test)

# Write one submission file per trained depth.
for depth, bst in zip(depths, bsts):
    pred = makeSubmission(bst.predict(dtest))
    filename = 'xgboost_pred_depth_' + str(depth)
    pred.to_csv(filename, encoding='utf-8', index=False)

Those all improved my scores. Let's do the same thing with the log of the data (log(x + 1) applied to every feature). We are also going to lower eta, increase the number of rounds, and remove early stopping.

In [5]:
garbage = ['F25', 'F26', 'F27']
dups = ['F12', 'F13', 'F17', 'F22', 'F24']

# Reload the training set without the id column and the unwanted features.
data = (
    pd.read_csv('train.csv')
    .drop(['id'], axis=1)
    .drop(garbage, axis=1)
    .drop(dups, axis=1)
)

labels = data.Y
# Pick the features by name rather than by position (previously iloc[:, 1:]):
# a positional slice only excludes the label if 'Y' happens to be the first
# remaining column, and would otherwise leak it into the features.
# log(x + 1) keeps zero-valued entries finite.
features = np.log(data.drop(['Y'], axis=1) + 1)

In [6]:
dtrain = xgb.DMatrix(features, label=labels)
depths = [6, 7, 8, 9, 10]
num_round = 700
bsts = []

# Train one booster per tree depth on the log-transformed features.
for depth in depths:
    param = {'max_depth': depth, 'eta': 0.2, 'gamma': 0.8, 'silent': 1,
             'objective': 'binary:logistic'}
    bsts.append(xgb.train(param, dtrain, num_round))

# Prepare the test set once; it does not depend on the model being scored.
test = pd.read_csv('test.csv')
# Restrict the test frame to the training feature columns (same order) before
# applying the same log(x + 1) transform. Training removed the 'garbage'
# columns too, while this cell previously dropped only the duplicates.
test = np.log(test[features.columns] + 1)
dtest = xgb.DMatrix(test)

# Write one submission file per trained depth.
for depth, bst in zip(depths, bsts):
    pred = makeSubmission(bst.predict(dtest))
    filename = 'xgboost_pred_depth_' + str(depth) + 'logdata'
    pred.to_csv(filename, encoding='utf-8', index=False)

Those did better. Let's do it again with more depth and a higher gamma.

In [5]:
garbage = ['F25', 'F26', 'F27']
dups = ['F12', 'F13', 'F17', 'F22', 'F24']

# Reload the training set without the id column and the unwanted features.
data = (
    pd.read_csv('train.csv')
    .drop(['id'], axis=1)
    .drop(garbage, axis=1)
    .drop(dups, axis=1)
)

labels = data.Y
# Pick the features by name rather than by position (previously iloc[:, 1:]):
# a positional slice only excludes the label if 'Y' happens to be the first
# remaining column, and would otherwise leak it into the features.
# log(x + 1) keeps zero-valued entries finite.
features = np.log(data.drop(['Y'], axis=1) + 1)

In [6]:
dtrain = xgb.DMatrix(features, label=labels)
depths = [9, 10, 11, 12, 13]
num_round = 700
bsts = []

# Train one booster per tree depth, this time with gamma raised to 2.
for depth in depths:
    param = {'max_depth': depth, 'eta': 0.2, 'gamma': 2, 'silent': 1,
             'objective': 'binary:logistic'}
    bsts.append(xgb.train(param, dtrain, num_round))

# Prepare the test set once; it does not depend on the model being scored.
test = pd.read_csv('test.csv')
# Restrict the test frame to the training feature columns (same order) before
# applying the same log(x + 1) transform. Training removed the 'garbage'
# columns too, while this cell previously dropped only the duplicates.
test = np.log(test[features.columns] + 1)
dtest = xgb.DMatrix(test)

# Write one submission file per trained depth.
for depth, bst in zip(depths, bsts):
    pred = makeSubmission(bst.predict(dtest))
    filename = 'xgboost_pred_depth_' + str(depth) + '_logdata_highergamma'
    pred.to_csv(filename, encoding='utf-8', index=False)

Those scored a little bit higher; the best model had depth 12. Let's raise gamma again and go one level deeper.

In [7]:
garbage = ['F25', 'F26', 'F27']
dups = ['F12', 'F13', 'F17', 'F22', 'F24']

# Reload the training set without the id column and the unwanted features.
data = (
    pd.read_csv('train.csv')
    .drop(['id'], axis=1)
    .drop(garbage, axis=1)
    .drop(dups, axis=1)
)

labels = data.Y
# Select the features by name rather than by position (previously
# iloc[:, 1:]), so the label cannot end up among the features if 'Y' is not
# the first remaining column. log(x + 1) keeps zero-valued entries finite.
features = np.log(data.drop(['Y'], axis=1) + 1)

dtrain = xgb.DMatrix(features, label=labels)
depths = [10, 11, 12, 13, 14]
num_round = 800
bsts = []

# Train one booster per tree depth with gamma raised to 3.
for depth in depths:
    param = {'max_depth': depth, 'eta': 0.2, 'gamma': 3, 'silent': 1,
             'objective': 'binary:logistic'}
    bsts.append(xgb.train(param, dtrain, num_round))

# Prepare the test set once; it does not depend on the model being scored.
test = pd.read_csv('test.csv')
# Restrict the test frame to the training feature columns (same order) before
# applying the same log(x + 1) transform. Training removed the 'garbage'
# columns too, while the test frame previously lost only the duplicates.
test = np.log(test[features.columns] + 1)
dtest = xgb.DMatrix(test)

# Write one submission file per trained depth.
for depth, bst in zip(depths, bsts):
    pred = makeSubmission(bst.predict(dtest))
    filename = 'xgboost_pred_depth_' + str(depth) + '_logdata_highergamma_again'
    pred.to_csv(filename, encoding='utf-8', index=False)

The scores went back down a little bit. Let's lower gamma, eta, and the number of rounds.

In [4]:
garbage = ['F25', 'F26', 'F27']
dups = ['F12', 'F13', 'F17', 'F22', 'F24']

# Reload the training set without the id column and the unwanted features.
data = (
    pd.read_csv('train.csv')
    .drop(['id'], axis=1)
    .drop(garbage, axis=1)
    .drop(dups, axis=1)
)

labels = data.Y
# Select the features by name rather than by position (previously
# iloc[:, 1:]), so the label cannot end up among the features if 'Y' is not
# the first remaining column. log(x + 1) keeps zero-valued entries finite.
features = np.log(data.drop(['Y'], axis=1) + 1)

dtrain = xgb.DMatrix(features, label=labels)
depths = [10, 11, 12, 13, 14]
num_round = 600
bsts = []

# Train one booster per tree depth with a lower eta (0.1) and gamma (1.8).
for depth in depths:
    param = {'max_depth': depth, 'eta': 0.1, 'gamma': 1.8, 'silent': 1,
             'objective': 'binary:logistic'}
    bsts.append(xgb.train(param, dtrain, num_round))

# Prepare the test set once; it does not depend on the model being scored.
test = pd.read_csv('test.csv')
# Restrict the test frame to the training feature columns (same order) before
# applying the same log(x + 1) transform. Training removed the 'garbage'
# columns too, while the test frame previously lost only the duplicates.
test = np.log(test[features.columns] + 1)
dtest = xgb.DMatrix(test)

# Write one submission file per trained depth.
for depth, bst in zip(depths, bsts):
    pred = makeSubmission(bst.predict(dtest))
    filename = 'xgboost_pred_depth_' + str(depth) + '_logdata_lowergamma'
    pred.to_csv(filename, encoding='utf-8', index=False)

Next step: raise the number of rounds again?