In [None]:
%matplotlib notebook
import pandas as pd
import numpy as np
import math
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import csv

from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn import decomposition, linear_model, grid_search, metrics, ensemble

import xgboost as xgb

In [None]:
# Load the training CSV into a DataFrame; later cells read its `target` and `ID` columns.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [None]:
# Create the training dataset. Features (x) are every column except `target` and `ID`;
# the label (y) is the `target` column on its own.
train_y = train_df['target']
train_x = train_df.drop(['target', 'ID'], axis=1)
# Keep the preview as the last expression of the cell: Jupyter only renders the final
# expression, so a head() call in the middle of the cell is silently discarded.
train_x.head()

In [None]:
# Find which features have categorical values (dtype = object) and which have continuous values (dtype = float).
# Save the names of the categorical feature columns to categorical_columns.
# The columns are selected by dtype explicitly: grouping the columns by dtype and taking "the first
# group" depends on the arbitrary order of the group keys (and on dict keys being indexable), so it
# can silently return the columns of a different dtype.
categorical_columns = train_x.select_dtypes(include=['object']).columns.tolist()
# categorical_columns

In [None]:
# Look for class imbalance through the mean of the target variable: the mean is ~0.76,
# so ~76% of the rows carry target value 1.
train_df['target'].mean()

In [None]:
# Report how many distinct values each categorical feature takes (a missing value counts as one).
for variable in categorical_columns:
    distinct_values = train_df[variable].unique()
    print(variable + " " + str(len(distinct_values)))

## Variable 22 does not seem usable: it has 18,211 unique categorical values.
## Going to drop it unless I find a better way to work with it.

In [None]:
# Remove variable 22 (v22) from the features.
# The drop is guarded and rebinds train_x instead of mutating it in place, so re-running this
# cell does not raise a KeyError once the column is already gone.
if 'v22' in train_x.columns:
    train_x = train_x.drop('v22', axis=1)
# Recompute the categorical column names. Select them by dtype explicitly rather than taking the
# first dtype group, whose position depends on arbitrary key ordering.
categorical_columns = train_x.select_dtypes(include=['object']).columns.tolist()


In [None]:
# Define a function, dummify, that replaces a categorical feature with dummy columns. It returns the new dataset,
# the names of the dummy columns, and the rows with null values for that categorical variable.
def dummify(column_name, dataset):
    """Replace one categorical column of `dataset` with indicator (dummy) columns.

    Parameters
    ----------
    column_name : str
        Name of the categorical column to expand.
    dataset : pandas.DataFrame
        Frame that contains `column_name`. It is not modified; a new frame is returned.

    Returns
    -------
    tuple
        (new_dataset, dummy_column_names, null_rows):
        * new_dataset -- `dataset` without `column_name`, with the dummy columns appended.
        * dummy_column_names -- numpy array with the names of the appended dummy columns.
        * null_rows -- list of row positions where `column_name` was null. Those rows have
          all-zero dummies, because get_dummies emits no indicator for missing values.
    """
    # The prefix already ends in '_' and get_dummies inserts its own separator, so the
    # resulting names look like '<column>__<value>'. Kept as-is so existing names stay stable.
    prefix_string = column_name + '_'

    # Encode the column of the frame that was passed in. Reading it from the global training
    # frame instead would attach training-row dummies to whatever dataset is being processed.
    source_column = dataset[column_name]
    dummies = pd.get_dummies(source_column, prefix=prefix_string)
    dummy_column_names = dummies.columns.values

    # Get a list of all rows containing nulls. After dummifying, these rows just have all zeros
    # in the dummy columns.
    get_nulls = np.flatnonzero(source_column.isnull().values).tolist()

    # Drop on a copy rather than in place so the caller's frame is left untouched.
    remaining = dataset.drop(column_name, axis=1)

    return pd.concat([remaining, dummies], axis=1), dummy_column_names, get_nulls



In [None]:
# Walk over the categorical variables. Each pass calls dummify for one variable and records the
# dummy column names and the list of null rows in dictionaries keyed by the variable name.
dummified_train_x = train_x.copy()
dummy_columns = {}
nulls_dict = {}
for column in categorical_columns:
    print('Dummifying ' + column)
    dummified_train_x, dummy_columns[column], nulls_dict[column] = dummify(column, dummified_train_x)


# Check which categorical variables have missing (null) values that will need to be filled, and
# track those features in variables_to_predict.
variables_to_predict = []
for key, null_rows in nulls_dict.items():
    null_count = len(null_rows)
    print(key + ' ' + str(null_count))
    if null_count > 0:
        variables_to_predict.append(key)

In [None]:
# Print a sample row to see dummified features are correctly in place
#with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
#    print dummified_train_x.iloc[0]

In [None]:
# Mean-impute the remaining missing values column by column (axis=0), then wrap the resulting
# array back into a DataFrame that carries the original column names.
imp = Imputer(strategy='mean', axis=0)
temp = imp.fit(dummified_train_x).transform(dummified_train_x)

imputed_dummified_train_x = pd.DataFrame(temp, columns=dummified_train_x.columns)

In [None]:
# Define a function to predict missing categorical values.
# Missing continuous variables are mean-imputed inside the model pipeline.
# May change in the future to also predict continuous variables.
def predict_values(column_name, dataset, nulls_list, variable, force=False):
    """Fill the dummy columns of one categorical variable, in place, for its null rows.

    A random-forest regressor is fit on the rows where the variable was present (every other
    column is a feature, the dummy columns are the targets) and used to predict the dummy
    values of the rows listed in `nulls_list`. The fitted model is cached on disk under
    'data/model/model_<variable>.pkl' and reused on later calls.

    Parameters
    ----------
    column_name : str or list of str
        The dummy column(s) to predict.
    dataset : pandas.DataFrame
        Frame to fill; the predicted values are written into it.
    nulls_list : list of int
        Row positions (0-based) whose dummy values are missing.
    variable : str
        Name of the original categorical variable; used for logging and the cache file name.
    force : bool, optional
        Retrain and overwrite the cached model even if one exists.

    Returns
    -------
    None
    """
    if len(nulls_list) == 0:
        # Nothing to fill; predicting on an empty frame would only raise.
        return

    # nulls_list holds row positions. Translate them to index labels once, so that the .loc and
    # .drop calls below address exactly those rows even when the index is not 0..n-1.
    null_labels = dataset.index[nulls_list]

    dataset_test_features = dataset.loc[null_labels].drop(column_name, axis=1)

    dataset_train = dataset.drop(null_labels)
    dataset_train_target = dataset_train[column_name]
    dataset_train_features = dataset_train.drop(column_name, axis=1)

    filename = 'data/model/model_' + variable + '.pkl'
    if force or not os.path.exists(filename):
        print('Training ' + variable)
        estimator = Pipeline([("imputer", Imputer(strategy="mean", axis=0)),
                              ("forest", RandomForestRegressor(random_state=0, n_estimators=10))])
        estimator.fit(dataset_train_features, dataset_train_target)
        score = cross_val_score(estimator, dataset_train_features, dataset_train_target).mean()
        print("Score with the entire dataset = %.2f" % score)
        # Make sure the cache directory exists before writing the model into it.
        model_dir = os.path.dirname(filename)
        if not os.path.isdir(model_dir):
            os.makedirs(model_dir)
        joblib.dump(estimator, filename)
    else:
        print('Loading ' + variable)
        # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
        # Only load model files produced by this notebook.
        estimator = joblib.load(filename)

    dataset.loc[null_labels, column_name] = estimator.predict(dataset_test_features)


In [None]:
# Fill in the missing values of every categorical variable that has any, one variable at a time.
for variable in variables_to_predict:
    target_columns = dummy_columns[variable].tolist()
    null_rows = nulls_dict[variable]
    predict_values(target_columns, imputed_dummified_train_x, null_rows, variable)

In [None]:
# try initial params and approach from Dmitry M.
# https://www.kaggle.com/director/bnp-paribas-cardif-claims-management/simple-xgboost-0-46146

# XGBoost params:
xgboost_params = {
   "objective": "binary:logistic",
   "booster": "gbtree",
   "eval_metric": "auc",
   "eta": 0.01, # 0.06, #0.01,
   #"min_child_weight": 240,
   "subsample": 0.75,
   "colsample_bytree": 0.68,
   "max_depth": 7
}

print('Load data...')
train = imputed_dummified_train_x
target = train_y
test = pd.read_csv('data/test.csv')
ids = test['ID'].values
# Drop the identifier and v22, which is still disregarded (it was removed from the training
# features as well).
test = test.drop(['ID', 'v22'], axis=1)

print('Dummify...')
test_dummy_columns = {}
test_nulls_dict = {}

for column in categorical_columns:
    # print 'Dummifying ' + column
    test, temp_dummy_list, temp_null_list = dummify(column, test)
    # Track the *training* dummy names for this variable: after the alignment below these are
    # exactly the dummy columns present in `test`, and they are what the cached per-variable
    # models were fit to predict.
    test_dummy_columns[column] = dummy_columns[column]
    test_nulls_dict[column] = temp_null_list

# Give the test frame the same columns, in the same order, as the training frame. Categories
# seen only in train become all-zero dummy columns; categories seen only in test are dropped
# (such rows end up with all-zero dummies for that variable). Without this the cached models
# and the booster, which match features by position, could be fed a different layout.
test = test.reindex(columns=dummified_train_x.columns, fill_value=0)

# Impute missing continuous variables
print('Impute...')
# Fill with the column means learned from the training features, not from the test set itself,
# so both sets are preprocessed with the same statistics.
imp = Imputer(strategy='mean', axis=0)
imp.fit(dummified_train_x)
temp = imp.transform(test)
test_columns = test.columns
test = pd.DataFrame(temp)
test.columns = test_columns

# Categorical variables that have at least one missing value in the test set still need
# their dummy columns predicted.
test_variables_to_predict = [key for key in test_nulls_dict if len(test_nulls_dict[key]) > 0]
    


In [None]:
# Fill in the missing values of every categorical variable of the test set that has any.
for variable in test_variables_to_predict:
    target_columns = test_dummy_columns[variable].tolist()
    null_rows = test_nulls_dict[variable]
    predict_values(target_columns, test, null_rows, variable)

In [None]:
# Hold out 15% of the training rows for validation / early stopping. The split is seeded so
# that it is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(train.values, target.values, test_size=0.15,
                                                      random_state=0)

# For a final run without validation, build xgtrain from all of train.values / target.values instead.
xgtrain = xgb.DMatrix(X_train, y_train)
xgvalid = xgb.DMatrix(X_valid, y_valid)

# Now let's fit the model
print('Fit the model...')
boost_round = 2000 #1800 CHANGE THIS BEFORE START

filename = 'data/model/xgb.pkl'
# Defined up front so the reporting code below also works when the model is loaded from disk
# (in that case there is no evaluation history and the dict simply stays empty).
eval_dict = dict()
if not os.path.exists(filename):
    print('Training....')

    # The evaluation metric is AUC, where higher is better, so early stopping must maximize it.
    clf = xgb.train(xgboost_params, xgtrain, num_boost_round=boost_round, verbose_eval=50, maximize=True,
                    evals_result=eval_dict, early_stopping_rounds=100, evals=[(xgvalid, 'valid')])
    # Use the following when not using validation a.k.a. for final run
    #     clf = xgb.train(xgboost_params,xgtrain,num_boost_round=boost_round,verbose_eval=True)
    # Make sure the cache directory exists before writing the model into it.
    model_dir = os.path.dirname(filename)
    if not os.path.isdir(model_dir):
        os.makedirs(model_dir)
    joblib.dump(clf, filename)
else:
    print('Loading...')
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files produced by this notebook.
    clf = joblib.load(filename)

# ------------------
# Per-round validation AUC; empty when the model was loaded instead of trained.
score_list = [float(s) for s in eval_dict.get('valid', {}).get('auc', [])]
if score_list:
    print(max(score_list))   # This is the best score (max() for 'auc', but min() for logloss)
# same result:
print(clf.best_score)
# and best xgboost_round:
print(clf.best_iteration)

if score_list:
    plt.plot(score_list)
    plt.show()

In [None]:
#----------------
# Predict on the test data and save the prediction result
print('Predict...')
filename = 'data/model/xgb.pkl'
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files produced by this notebook.
clf = joblib.load(filename)

# Build the test matrix here; no earlier cell creates it. A DMatrix built from .values matches
# features by position, so the test columns are first put in exactly the training column order
# (columns missing from test are filled with 0, extra ones are dropped).
xgtest = xgb.DMatrix(test.reindex(columns=train.columns, fill_value=0).values)

# best_iteration is a 0-based round index while ntree_limit is a number of trees, so the best
# round itself is only included with best_iteration + 1 (exposed as best_ntree_limit by xgboost
# versions that provide it).
best_ntree_limit = getattr(clf, 'best_ntree_limit', clf.best_iteration + 1)
test_preds = clf.predict(xgtest, ntree_limit=best_ntree_limit)

# Save results: one row per test ID with its predicted probability. The with-block closes the
# file even if writing fails.
with open("data/team_GAF_result.csv", "w") as predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["ID", "PredictedProb"])
    open_file_object.writerows(zip(ids, test_preds))
#
print('Done.')