In [1]:
import pandas as pd
import numpy as np

from itertools import tee, izip
import os, sys
import scipy

import xgboost as xgb

from sklearn import cross_validation, metrics # Additional scklearn functions
from sklearn.grid_search import GridSearchCV # Perforing grid search

In [1]:
train = pd.read_csv('/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/train_hr_wk_mth.csv')
test = pd.read_csv('/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/test_hr_wk_mth.csv')

print train.head()
print test.head()

NameError: name 'pd' is not defined

In [2]:
# a helper function takes an iterable, return its stepwise pair tuple in a list
# [1, 2, 3, 4, 5] -> [(1, 2), (2, 3), (3, 4), (4, 5)]
def pairwise(iterable):
    floor, ceiling = tee(iterable)
    next(ceiling, None)
    return izip(floor, ceiling)

# a helper function takes a df, a column name, a list of floor/ceiling values 
# "split" the df based on the given column, using the list of floor/ceiling values, return list of df
#
# flr_clg will be floor EXclusive, ceiling INclusive - need special treatment for the first split
def split_df_rows_on_col_ranges(df, col, flr_clg):
    splitted_df = []
    first = True
    for fc in flr_clg:
        if first:
            splitted_df.append(df[ (df[col] >= fc[0]) & (df[col] <= fc[1]) ])
            first = False
        else:
            splitted_df.append(df[ (df[col] > fc[0]) & (df[col] <= fc[1]) ])
    return splitted_df

# a helper function takes x bars, cut y bars inside and return a dictionary of grids
def cut_y_bars_in_x_bar(x_bars, y, y_bin_tuple):
    gridDict = {}
    xidx = 0
    for xbar in x_bars:        
        # getting list of N bars (grids here already) based on y values, all within 1 xbar
        y_bars_in_xbar = split_df_rows_on_col_ranges(xbar, y, y_bin_tuple)
            
        yidx = 0
        for grid in y_bars_in_xbar:
            gridDict[(xidx, yidx)] = grid # gather output with x,y index
            yidx = yidx + 1     
        xidx = xidx + 1
    return gridDict

'''
input parameters for def get_grids():
- train_file - input filename with path for the training set

- test_file - input filename with path for the test set

- outputFile - boolean that tells whether you want NxM files as output or a dict of pd.DataFrame
               as output, format would be (x_idx, y_idx) : df_for_grid. If you want file as output
               then x_idx, y_idx will appear in output files' name

- train_output - only used if the 3rd parameter is set to True, will be the path to store NxM files for training set,
                 each file contains a grid of data points

- test_output - only used if the 3rd parameter is set to True, will be the path to store NxM files for testing set,
                each file contains a grid of data points

- n - NxM grid, the N value, for x axis

- m - NxM grid, the M value, for y axis

- x - column name of the x coordinate in input file

- y - column name of the y coordinate in input file

'''
def get_grids(train_file, test_file, outputFile = False, train_output = None, test_output = None, n = 10, m = 10, x = 'x', y = 'y'):
    train = pd.read_csv(train_file)
    test = pd.read_csv(test_file)
    
    # getting the cutoff values for x and y axis, using training set ONLY - because of the IMPORTANT ASSUMPTION -
    # TESTING SET IS SUBSET OF TRAINING SET IN TERMS OF X AND Y COORDINATES
    x_count, x_cutoff = np.histogram(train[x], bins = n)
    y_count, y_cutoff = np.histogram(train[y], bins = m)

    # transform cutoff values into step-wise tuples
    x_bin_tuple = [(floor, ceiling) for floor, ceiling in pairwise(x_cutoff)]
    y_bin_tuple = [(floor, ceiling) for floor, ceiling in pairwise(y_cutoff)]

    train_x_splits = split_df_rows_on_col_ranges(train, x, x_bin_tuple) # getting list of N bars based on x values for train
    test_x_splits = split_df_rows_on_col_ranges(test, x, x_bin_tuple) # getting list of N bars based on x values for test

    # within each bar (overall N) splitted based on x, there will be M splits based on y - each one is a grid
    trainDict = cut_y_bars_in_x_bar(train_x_splits, y, y_bin_tuple)
    testDict = cut_y_bars_in_x_bar(test_x_splits, y, y_bin_tuple)

    if outputFile:
        for key in trainDict:
            filename = 'train_' + 'x' + str(key[0]) + '_y' + str(key[1]) + '.csv'
            fullpath = os.path.join(train_output, filename)
            trainDict[key].to_csv(fullpath, index = False)
        for key in testDict:
            filename = 'test_' + 'x' + str(key[0]) + '_y' + str(key[1]) + '.csv'
            fullpath = os.path.join(test_output, filename)
            testDict[key].to_csv(fullpath, index = False)
    return (trainDict, testDict)

# Generate grids and store in dictionary (and files)

In [3]:
#train_g, test_g = get_grids('/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/train_hr_wk_mth.csv',
#                '/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/test_hr_wk_mth.csv',
#                outputFile = True, 
#                train_output = '/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/50_50_grid/train',
#                test_output = '/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/50_50_grid/test',
#                n = 50, m = 50, x = 'x', y = 'y')

In [4]:
train_g, test_g = get_grids('/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/train_hr_wk_mth.csv',
                '/home/ec2-user/Kaggle/facebook_Jul_2016/feature_engineered_input/hr_wk_mth/test_hr_wk_mth.csv',
                n = 50, m = 50)

# Train/Validation/Test split

In [5]:
print test_g[(6,8)].shape # number of testing set data points in grid 1, 8)
print len(test_g) # number of grids form testing set
print test_g[(6,8)].head()

print train_g[(6,8)].shape
print len(train_g)
print train_g[(6,8)].head()

(2753, 8)
2500
       row_id       x       y  accuracy    time  hour  weekday  month
4394     4394  1.3877  1.7159        29  817057    10        1      7
5557     5557  1.3798  1.7341        12  947451    23        7     10
16597   16597  1.2697  1.7303        51  957863     5        1     11
19386   19386  1.2186  1.6120        83  815774    13        7      7
20132   20132  1.3330  1.6060       289  941808     1        4     10
(9343, 9)
2500
       row_id       x       y  accuracy    time    place_id  hour  weekday  \
11683   11683  1.2002  1.6124        36  398887  6855596818     1        5   
13290   13290  1.2360  1.6957        20  632924  6472715293    13        6   
19390   19390  1.3616  1.7906        58  114062  8610202964     6        3   
20089   20089  1.2124  1.7503        14  148596  2495375117     5        6   
20203   20203  1.2912  1.6980       162  627651  3707683630    21        2   

       month  
11683     10  
13290      3  
19390      3  
20089      4  
20203 

In [6]:
train_68_pd = train_g[(6,8)][['x','y','accuracy','time', 'hour', 'weekday', 'month', 'place_id']]
train_68 = train_68_pd.as_matrix()

test_68_pd = test_g[(6,8)][['x','y','accuracy','time', 'hour', 'weekday', 'month']]
test_68 = test_68_pd.as_matrix()

print train_68_pd.shape

print train_68[0:3, :]
print type(train_68)

train_sz = train_68.shape
print train_sz

print test_68[0:3, :]

(9343, 8)
[[  1.20020000e+00   1.61240000e+00   3.60000000e+01   3.98887000e+05
    1.00000000e+00   5.00000000e+00   1.00000000e+01   6.85559682e+09]
 [  1.23600000e+00   1.69570000e+00   2.00000000e+01   6.32924000e+05
    1.30000000e+01   6.00000000e+00   3.00000000e+00   6.47271529e+09]
 [  1.36160000e+00   1.79060000e+00   5.80000000e+01   1.14062000e+05
    6.00000000e+00   3.00000000e+00   3.00000000e+00   8.61020296e+09]]
<type 'numpy.ndarray'>
(9343, 8)
[[  1.38770000e+00   1.71590000e+00   2.90000000e+01   8.17057000e+05
    1.00000000e+01   1.00000000e+00   7.00000000e+00]
 [  1.37980000e+00   1.73410000e+00   1.20000000e+01   9.47451000e+05
    2.30000000e+01   7.00000000e+00   1.00000000e+01]
 [  1.26970000e+00   1.73030000e+00   5.10000000e+01   9.57863000e+05
    5.00000000e+00   1.00000000e+00   1.10000000e+01]]


# Factorize Place_id into xgboost compatible format (0, 1, 2...)

In [7]:
place_id = train_68[:,7] # original class labels before factorization
data = scipy.delete(train_68, 7, 1)  # delete the original class label column - cannot be used by xgboost

print place_id[0:3]
print data.shape

[  6.85559682e+09   6.47271529e+09   8.61020296e+09]
(9343, 7)


In [8]:
factorized_placeID, unique_placeID = pd.factorize(place_id)
overall_num_of_classes = len(unique_placeID) # needed by xgboost multiclass objective

print overall_num_of_classes

660


In [9]:
# reshape so dimension matches for appending to original dataset
class_np_horizontal = np.reshape(factorized_placeID, (train_sz[0], 1)) 
print class_np_horizontal.shape

# add factorized labels as the last column of original matrix
data = np.hstack((data, class_np_horizontal))
print data.shape

(9343, 1)
(9343, 8)


# Convert Train/Test/Validation sets to xgboost compatible data structure dMatrix

In [10]:
num_train = int(train_sz[0] * 0.8)
num_validation = int(train_sz[0] * 0.2)
# num_test will be all the testing set

train_68_set = data[0:num_train, :]
validate_68_set = data[num_train:, :]

print train_68_set.shape
print validate_68_set.shape
print test_68.shape

(7474, 8)
(1869, 8)
(2753, 7)


In [11]:
train_X = train_68_set[:,0:6]
train_Y = train_68_set[:, 7]
print train_X.shape

validate_X = validate_68_set[:,0:6]
validate_Y = validate_68_set[:, 7]
print validate_Y.shape

test_X = test_68

(7474, 6)
(1869,)


# Set xgboost model parameters

In [12]:
# convert data to xgboost compatible structure - DMatrix
xg_train = xgb.DMatrix( train_X, label = train_Y)
xg_validation = xgb.DMatrix(validate_X, label = validate_Y)
xg_test = xgb.DMatrix(test_X)

In [13]:
# setup parameters for xgboost with a python dictionary
param = {}

# use softmax multi-class classification
param['objective'] = 'multi:softprob' # tells boosted trees to output probability

# param[booster] - default is set to "gbtree" - gradient boosted tree

# scale weight of positive examples
param['eta'] = 0.05 # Parameters for Tree Booster - Booster parameter

param['max_depth'] = 6 # Parameters for Tree Booster - Booster parameter

param['silent'] = 1 # whether to print logs
param['nthread'] = 36 # parallelism

param['eval_metric'] = 'merror'

param['num_class'] = overall_num_of_classes # number of classes 

In [14]:
# specify traing/testing set for model training 
watchlist = [ (xg_train,'train'), (xg_validation, 'validation') ]

# specify the number of weak classifiers (base boosters) in the ensemble #
num_round = overall_num_of_classes

# Train xgboost model - takes a while

In [15]:
# train the model
bst_return_prob = xgb.train(param, xg_train, num_round, watchlist)

[0]	train-merror:0.625903	validation-merror:0.651150
[1]	train-merror:0.575863	validation-merror:0.596041
[2]	train-merror:0.539336	validation-merror:0.573569
[3]	train-merror:0.515654	validation-merror:0.552167
[4]	train-merror:0.495050	validation-merror:0.531835
[5]	train-merror:0.479529	validation-merror:0.522739
[6]	train-merror:0.470029	validation-merror:0.516854
[7]	train-merror:0.459995	validation-merror:0.513644
[8]	train-merror:0.451565	validation-merror:0.507758
[9]	train-merror:0.445545	validation-merror:0.502943
[10]	train-merror:0.439791	validation-merror:0.500268
[11]	train-merror:0.434841	validation-merror:0.497057
[12]	train-merror:0.429489	validation-merror:0.496522
[13]	train-merror:0.423200	validation-merror:0.496522
[14]	train-merror:0.417982	validation-merror:0.493312
[15]	train-merror:0.412764	validation-merror:0.490637
[16]	train-merror:0.409419	validation-merror:0.488497
[17]	train-merror:0.404469	validation-merror:0.486891
[18]	train-merror:0.401258	validation-

# Store xgboost model for future use

In [16]:
# save xgb model object so it can be reloaded and used to make predictions without training
bst_return_prob.save_model('/home/ec2-user/Kaggle/facebook_Jul_2016/models/k_posted.model') 

# dump model - weak classifiers built - in this case would be decision tree stumps 
bst_return_prob.dump_model('/home/ec2-user/Kaggle/facebook_Jul_2016/models/k_posted_raw.txt')

# Make predictions using xgboost model - takes a while

In [16]:
# prediction for testing set
preds_prob = bst_return_prob.predict(xg_test)
print preds_prob.shape

(2753, 660)


# Convert xgboost outputs (probability matrix) back to original place_id

In [17]:
sortedProbIdx = np.argsort(preds_prob)

top3prob = sortedProbIdx[:, -3:]

print top3prob
print top3prob.shape

[[ 60 180  24]
 [ 21  52  43]
 [469   3  21]
 ..., 
 [ 37  81  17]
 [250  22  17]
 [147  10  23]]
(2753, 3)


In [18]:
finalResults = unique_placeID[top3prob]
print finalResults.shape
print type(finalResults)

np.set_printoptions(precision=10)
print finalResults[0:3]

if 4789348989 in place_id:
    print True

(2753, 3)
<type 'numpy.ndarray'>
[[  4.7893489890e+09   8.7431971440e+09   9.1778043280e+09]
 [  6.1334719400e+09   2.7398940110e+09   8.3790478080e+09]
 [  4.5131063140e+09   2.4953751170e+09   6.1334719400e+09]]
True
