In [1]:
import os
import sys
import numpy as np

In [2]:
def filename_with_percentage(filename, percentage):
    if percentage == 1:
        return filename
    else:
        fileparts = filename.split('.')
        fileparts[0] += '-' + str(percentage)
        return '.'.join(fileparts)

In [3]:
percentage = 0.25
file_clicks_basename = filename_with_percentage('yoochoose-clicks.dat', percentage)
file_buys_basename = filename_with_percentage('yoochoose-buys.dat', percentage)
file_what_to_buy_features_basename = filename_with_percentage('wtb.csv', percentage)
file_buy_or_not_features_basename = filename_with_percentage('bon.npy', percentage)
data_path = 'data'
cache_path = 'cache'
file_clicks = os.path.join(data_path, file_clicks_basename)
file_buys = os.path.join(data_path, file_buys_basename)
file_what_to_buy_features = os.path.join(cache_path, file_what_to_buy_features_basename)
file_buy_or_not_features = os.path.join(cache_path, file_buy_or_not_features_basename)

In [4]:
effective_columns_names = ['Session ID', 'Timestamp', 'Item ID']

In [5]:
from src.main.read_and_write_data import read_clicks

clicks = read_clicks(file_clicks, usecols=effective_columns_names)\
    .sort_values('Timestamp')\
    .reset_index(drop=True)

FileNotFoundError: [Errno 2] No such file or directory: 'data/yoochoose-clicks-0.25.dat'

In [6]:
from src.main.preprocess_data import df_group_by_session_id

clicks_grouped_by_session_id, clicks_grouped_by_session_id_keys = df_group_by_session_id(clicks)

NameError: name 'clicks' is not defined

In [7]:
from src.main.read_and_write_data import features_from_csv, features_to_csv
from src.main.feature_extraction import extract_what_to_buy

if os.path.isfile(file_what_to_buy_features):
        what_to_buy = features_from_csv(file_what_to_buy_features, ['Session ID', 'Item ID'],
                                        {'F3': 'Counts', 'F6': 'Sequent Clicks', 'F7': 'Time Difference'})
else:
    what_to_buy = extract_what_to_buy(clicks_grouped_by_session_id)
    features_to_csv(file_what_to_buy_features, what_to_buy.values())

Extracting feature: F3
Extraction completed in 30 minutes 15 seconds
Extracting feature: F6
Extraction completed in 130 minutes 22 seconds
Extracting feature: F7
Extraction completed in 165 minutes 36 seconds


In [8]:
from src.main.feature_extraction import extract_buy_or_not

if os.path.isfile(file_buy_or_not_features):
        buy_or_not = np.load(file_buy_or_not_features)
else:
    buy_or_not = extract_buy_or_not(clicks_grouped_by_session_id, what_to_buy)
    np.save(file_buy_or_not_features, buy_or_not)

Extracting feature: P2
Extraction completed in 0 minutes 12 seconds
Extracting feature: P3
Extraction completed in 9 minutes 54 seconds
Extracting feature: P1
Extraction completed in 0 minutes 2 seconds
Extracting feature: P6
Extraction completed in 0 minutes 25 seconds
Extracting feature: P4
Extraction completed in 12 minutes 35 seconds
Extracting feature: P10
Extraction completed in 0 minutes 0 seconds
Extracting feature: P11
Extraction completed in 16 minutes 31 seconds
Extracting feature: P5
Extraction completed in 0 minutes 0 seconds


In [9]:
from src.main.read_and_write_data import read_buys
from src.main.feature_extraction import extract_buys

buys = read_buys(file_buys, usecols=effective_columns_names)
_, buys_grouped_by_session_id_keys = df_group_by_session_id(buys)
buys_result = extract_buys(clicks_grouped_by_session_id_keys, buys_grouped_by_session_id_keys)

In [10]:
for feature_index in xrange(buy_or_not.shape[1]):
    print np.sort(buy_or_not[:, feature_index])

[   1.    1.    1. ...,  200.  200.  200.]
[   1.    1.    1. ...,  137.  200.  200.]
[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   6.32272840e+07
   1.82740688e+08   2.45938000e+08]
[       0.        0.        0. ...,  3599942.  3599977.  3599988.]
[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   3.40282347e+38
   3.40282347e+38   3.40282347e+38]
[   1.    1.    1. ...,  137.  200.  200.]
[   1.    1.    1. ...,  137.  200.  200.]
[  0.00000000e+00   0.00000000e+00   0.00000000e+00 ...,   3.40282347e+38
   3.40282347e+38   3.40282347e+38]


In [11]:
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [12]:
kfold = KFold(buys_result.size, n_folds=4, shuffle=True, random_state=42)
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced')
param_grid_rf = {
    'n_estimators': np.arange(start=5, stop=60, step=5),
    'min_samples_split': np.logspace(2, 8, num=7, base=2)
}
gs_rf = GridSearchCV(rf, param_grid=param_grid_rf, cv=kfold, scoring='roc_auc', verbose=100)
gs_rf.fit(buy_or_not, buys_result)

Fitting 4 folds for each of 77 candidates, totalling 308 fits
[CV] min_samples_split=4.0, n_estimators=5 ...........................
[CV] .. min_samples_split=4.0, n_estimators=5, score=0.568580 -  14.5s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   14.6s
[CV] min_samples_split=4.0, n_estimators=5 ...........................
[CV] .. min_samples_split=4.0, n_estimators=5, score=0.570850 -  15.1s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:   29.7s
[CV] min_samples_split=4.0, n_estimators=5 ...........................
[CV] .. min_samples_split=4.0, n_estimators=5, score=0.570148 -  14.3s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:   44.0s
[CV] min_samples_split=4.0, n_estimators=5 ...........................
[CV] .. min_samples_split=4.0, n_estimators=5, score=0.567840 -  15.2s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   59.2s
[CV] min_samples_split=4.0, n_estimators=10 ..........................
[CV] . min_samples_split=4.0, n_estimators=10, scor

GridSearchCV(cv=sklearn.cross_validation.KFold(n=2312432, n_folds=4, shuffle=True, random_state=42),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55]), 'min_samples_split': array([   4.,    8.,   16.,   32.,   64.,  128.,  256.])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=100)

In [13]:
print gs_rf.best_score_
print gs_rf.best_params_

0.747772478066
{'min_samples_split': 256.0, 'n_estimators': 55}


In [14]:
xgb = XGBClassifier(n_estimators=500, subsample=0.8, colsample_bytree=0.5)
param_grid_xgb = {
    'max_depth': np.arange(4, 7),
    'min_child_weight': np.arange(1, 4)
}
gs_xgb = GridSearchCV(xgb, param_grid=param_grid_xgb, cv=kfold, scoring='roc_auc', verbose=100)
gs_xgb.fit(buy_or_not, buys_result)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] max_depth=4, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=1, score=0.765001 - 4.1min
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:  4.1min
[CV] max_depth=4, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=1, score=0.764348 - 4.0min
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:  8.2min
[CV] max_depth=4, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=1, score=0.764048 - 4.0min
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed: 12.2min
[CV] max_depth=4, min_child_weight=1 .................................
[CV] ........ max_depth=4, min_child_weight=1, score=0.764266 - 4.0min
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed: 16.2min
[CV] max_depth=4, min_child_weight=2 .................................
[CV] ........ max_depth=4, min_child_weight=2, score=

GridSearchCV(cv=sklearn.cross_validation.KFold(n=2312432, n_folds=4, shuffle=True, random_state=42),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([4, 5, 6]), 'min_child_weight': array([1, 2, 3])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=100)

In [15]:
print gs_xgb.best_score_
print gs_xgb.best_params_

0.764418708158
{'max_depth': 4, 'min_child_weight': 3}
