In [12]:
%matplotlib notebook
import pandas as pd
import numpy as np

import csv
import os
import sys

from sklearn import ensemble
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.externals import joblib
from sklearn.metrics import log_loss, make_scorer
from sklearn.grid_search import GridSearchCV
import xgboost as xgb

import matplotlib.pyplot as plt
import random; random.seed(2016)
import time; start_time = time.time()


In [13]:

print('Load data...')
train = pd.read_csv('data/train.csv')
target = train['target']
train = train.drop(['ID','target'],axis=1)
test = pd.read_csv('data/test.csv')
id_test = test['ID']
test = test.drop(['ID'],axis=1)
num_train = train.shape[0]

Load data...


In [14]:
# Define a function, dummify, that will replace categorical features with dummy columns. Return the new dataset,
# the names of the dummy columns, and the rows with null values for each categorical variable
def dummify(name,series):
    prefix_string = name + '_'
    dummies = pd.get_dummies(series,prefix=prefix_string)
    dummy_column_names = dummies.columns.values
    #Get a list of all rows containing nulls. After dummifying these rows will just have all zeros for dummy variable
    get_nulls = np.where(series.isnull() == True)[0].tolist()

    return dummies, dummy_column_names, get_nulls

def fill_nan_null(val):
    ret_fill_nan_null = 0.0
    if val == True:
        ret_fill_nan_null = 1.0
    return ret_fill_nan_null

In [15]:
# Drop features with > 0.9 correlation. Keep feature with fewest NaNs

corr = train.corr()

to_drop = set()

for col in corr.columns.values:
    if col in to_drop:
        continue

    col_list = corr[col][(corr[col] > 0.9) & (corr[col] < 1)].index.tolist()
    col_set = set(col_list)
    col_set.difference_update(to_drop)
    if (len(col_list) == 0) or (len(col_set) == 0):
        continue

    col_list.append(col)
    lowest_na_count = train[col_list[0]].isnull().sum()
    best_col = col_list[0]
    for option in col_list:
        na_count = train[option].isnull().sum()
        if na_count < lowest_na_count:
            lowest_na_count = na_count
            best_col = option
    col_list.remove(best_col)
    to_drop.update(col_list)


(114393, 131)
(114321, 131)


In [17]:
print "Adding features..."

train_test_data = pd.concat([train,test],axis=0, ignore_index=True)

train_test_data_types = train_test_data.dtypes[:]


for i in range(len(train_test_data_types)):
    train_test_data[str(train_test_data_types.index[i])+'_nan_'] = train_test_data[str(train_test_data_types.index[i])].map(lambda x:fill_nan_null(pd.isnull(x)))
    
train_test_data['NA_num'] = train_test_data.isnull().sum(axis=1)


Adding features...


In [20]:
drop_correlated = True

if drop_correlated == True:
    print 'Drop Correlated ...'
    train_test_data.drop(list(to_drop),axis=1, inplace = True)


print('Clearing...')
train_dummy_columns = {}
train_nulls_dict = {}
test_dummy_columns = {}
test_nulls_dict = {}

cleaned_train_test_data = train_test_data.copy()


for (train_name, train_series) in train_test_data.iteritems():
    if train_name == 'v22':
        #v22 has too many options to dummify, instead: factorize
        cleaned_train_test_data[train_name], tmp_indexer = pd.factorize(train_test_data[train_name])
        #but now we have -1 values (NaN)    
    elif train_series.dtype == 'O':
        print 'Dummifying ' + train_name
        cleaned_train_test_data.drop(train_name,axis=1,inplace = True)

        train_dummies, train_dummy_list, train_null_list = dummify(train_name,train_series)
        cleaned_train_test_data = pd.concat([cleaned_train_test_data,train_dummies], axis = 1)
        
        train_dummy_columns[train_name] = train_dummy_list
        train_nulls_dict[train_name] = train_null_list
        
    else:
        #for int or float: fill NaN
        tmp_len = len(train_test_data[train_series.isnull()])
        if tmp_len>0:
            cleaned_train_test_data.loc[train_series.isnull(), train_name] = train_series.mean()


Clearing...
Dummifying v3
Dummifying v24
Dummifying v30
Dummifying v31
Dummifying v47
Dummifying v52
Dummifying v56
Dummifying v66
Dummifying v71
Dummifying v74
Dummifying v75
Dummifying v79
Dummifying v91
Dummifying v107
Dummifying v110
Dummifying v112
Dummifying v113
Dummifying v125


In [None]:
# cleaned_train.isnull().sum(axis=1)
# with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
#    print cleaned_train.iloc[0]

(228714, 228)

In [26]:
train = cleaned_train_test_data.iloc[:num_train]
test = cleaned_train_test_data.iloc[num_train:]


y_train = target


def flog_loss(ground_truth, predictions):
    flog_loss_ = log_loss(ground_truth, predictions) #, eps=1e-15, normalize=True, sample_weight=None)
    return flog_loss_
LL  = make_scorer(flog_loss, greater_is_better=False)

g={'ne':500,'md':40,'mf':60,'rs':2016}
etc = ensemble.ExtraTreesClassifier(n_estimators=g['ne'], max_depth=g['md'], max_features=g['mf'], random_state=g['rs'], criterion='entropy', min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)      
etr = ensemble.ExtraTreesRegressor(n_estimators=g['ne'], max_depth=g['md'], max_features=g['mf'], random_state=g['rs'], min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)      
rfc = ensemble.RandomForestClassifier(n_estimators=g['ne'], max_depth=g['md'], max_features=g['mf'], random_state=g['rs'], criterion='entropy', min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)
rfr = ensemble.RandomForestRegressor(n_estimators=g['ne'], max_depth=g['md'], max_features=g['mf'], random_state=g['rs'], min_samples_split= 4, min_samples_leaf= 2, verbose = 0, n_jobs =-1)
xgr = xgb.XGBRegressor(n_estimators=g['ne'], max_depth=g['md'], seed=g['rs'], missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85, objective='reg:linear')
xgc = xgb.XGBClassifier(n_estimators=g['ne'], max_depth=g['md'], seed=g['rs'], missing=np.nan, learning_rate=0.02, subsample=0.9, colsample_bytree=0.85, objective='binary:logistic') 
clf = {'etc':etc, 'etr':etr, 'rfc':rfc, 'rfr':rfr, 'xgr':xgr, 'xgc':xgc} 

In [27]:
print test.shape
print train.shape

(114393, 586)
(114321, 586)


In [28]:
y_pred=[]
best_score = 0.0
id_results = id_test[:]
for c in clf:
    if c[:1] != "x": #not xgb
        model = GridSearchCV(estimator=clf[c], param_grid={}, n_jobs =-1, cv=2, verbose=0, scoring=LL)
        model.fit(train, y_train.values)
        if c[-1:] != "c": #not classifier
            y_pred = model.predict(test)
            print("Ensemble Model: ", c, " Best CV score: ", model.best_score_, " Time: ", round(((time.time() - start_time)/60),2))
        else: #classifier
            best_score = (log_loss(y_train.values, model.predict_proba(train)))*-1
            y_pred = model.predict_proba(test)[:,1]
            print("Ensemble Model: ", c, " Best CV score: ", best_score, " Time: ", round(((time.time() - start_time)/60),2))
    else: #xgb
        X_fit, X_eval, y_fit, y_eval= train_test_split(train, y_train, test_size=0.35, train_size=0.65, random_state=g['rs'])
        model = clf[c]
        model.fit(X_fit, y_fit.values, early_stopping_rounds=20, eval_metric="logloss", eval_set=[(X_eval, y_eval)], verbose=0)
        if c == "xgr": #xgb regressor
            best_score = (log_loss(y_train.values, model.predict(train)))*-1
            y_pred = model.predict(test)
        else: #xgb classifier
            best_score = (log_loss(y_train.values, model.predict_proba(train)))*-1
            y_pred = model.predict_proba(test)[:,1]
        print("Ensemble Model: ", c, " Best CV score: ", best_score, " Time: ", round(((time.time() - start_time)/60),2))

    for i in range(len(y_pred)):
        if y_pred[i]<0.0:
            y_pred[i] = 0.0
        if y_pred[i]>1.0:
            y_pred[i] = 1.0
    df_in = pd.DataFrame({"ID": id_test, c: y_pred})
    id_results = pd.concat([id_results, df_in[c]], axis=1)

('Ensemble Model: ', 'etr', ' Best CV score: ', -0.47234427554051223, ' Time: ', 25.42)
('Ensemble Model: ', 'rfr', ' Best CV score: ', -0.46986564580673634, ' Time: ', 43.75)
('Ensemble Model: ', 'xgc', ' Best CV score: ', -0.25415441861034482, ' Time: ', 81.91)
('Ensemble Model: ', 'rfc', ' Best CV score: ', -0.19538655725974971, ' Time: ', 106.91)
('Ensemble Model: ', 'etc', ' Best CV score: ', -0.16260258019671914, ' Time: ', 154.65)
('Ensemble Model: ', 'xgr', ' Best CV score: ', nan, ' Time: ', 209.57)


In [29]:
id_results['avg'] = id_results.drop('ID', axis=1).apply(np.average, axis=1)
id_results['min'] = id_results.drop('ID', axis=1).apply(min, axis=1)
id_results['max'] = id_results.drop('ID', axis=1).apply(max, axis=1)
id_results['diff'] = id_results['max'] - id_results['min']
for i in range(10):
    print(i, len(id_results[id_results['diff']>(i/10)]))
id_results.to_csv("results_analysis.csv", index=False)
ds = id_results[['ID','avg']]
ds.columns = ['ID','PredictedProb']
ds.to_csv('submission.csv',index=False)

(0, 114393)
(1, 114393)
(2, 114393)
(3, 114393)
(4, 114393)
(5, 114393)
(6, 114393)
(7, 114393)
(8, 114393)
(9, 114393)


In [30]:
ds.head()

Unnamed: 0,ID,PredictedProb
0,0,0.472591
1,1,0.808729
2,2,0.79894
3,7,0.530312
4,10,0.796838
