In [1]:
import sys
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction import DictVectorizer
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfTransformer

import matplotlib.pyplot as plt
from xgboost import XGBClassifier, plot_importance



### Read train and test datatset

In [3]:
train = pd.read_csv("/Users/Home/Documents/Python Scripts/Black Friday/train_bf.csv") 
test = pd.read_csv("/Users/Home/Documents/Python Scripts/Black Friday/test_bf.csv") 


### Create copies of datasets to tweak the data for model deployment

In [4]:
train_bf = train.copy()
test_bf = test.copy()

### Create label from original dataset for model parameter to train with the copied datatset

In [5]:
train_pur_reg = np.array(train["Purchase"])
test_pur_reg = np.array(test["Purchase"])

In [6]:
train_bf.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,550068.0,550068.0,550068.0,550068.0,376430.0,166821.0,550068.0
mean,1003029.0,8.076707,0.409653,5.40427,9.842329,12.668243,9263.968713
std,1727.592,6.52266,0.49177,3.936211,5.08659,4.125338,5023.065394
min,1000001.0,0.0,0.0,1.0,2.0,3.0,12.0
25%,1001516.0,2.0,0.0,1.0,5.0,9.0,5823.0
50%,1003077.0,7.0,0.0,5.0,9.0,14.0,8047.0
75%,1004478.0,14.0,1.0,8.0,15.0,16.0,12054.0
max,1006040.0,20.0,1.0,20.0,18.0,18.0,23961.0


In [7]:
test_bf.describe()

Unnamed: 0,User_ID,Occupation,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
count,233599.0,233599.0,233599.0,233599.0,161255.0,71037.0,233599.0
mean,1003029.0,8.085407,0.41007,5.276542,9.849586,12.669454,9318.28001
std,1726.505,6.521146,0.491847,3.73638,5.094943,4.125944,4972.71383
min,1000001.0,0.0,0.0,1.0,2.0,3.0,185.0
25%,1001527.0,2.0,0.0,1.0,5.0,9.0,5861.0
50%,1003070.0,7.0,0.0,5.0,9.0,14.0,8059.0
75%,1004477.0,14.0,1.0,8.0,15.0,16.0,12060.0
max,1006040.0,20.0,1.0,18.0,18.0,18.0,23961.0


In [8]:
train_bf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [9]:
test_bf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,,8370
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,,15200
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,,1422
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,,1057
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0,7969


### Fill in the missing values in both datasets

In [10]:
train_bf = train_bf.fillna(train_bf['Product_Category_2'].value_counts().index[0])
train_bf = train_bf.fillna(train_bf['Product_Category_3'].value_counts().index[0])


test_bf = test_bf.fillna(test_bf['Product_Category_2'].value_counts().index[0])
test_bf = test_bf.fillna(test_bf['Product_Category_3'].value_counts().index[0])

### Drop purchase from copy dataset as it is being used as a label from the original dataset

In [11]:
train_bf.drop('Purchase',axis=1)
test_bf.drop('Purchase',axis=1)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,8.0
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,8.0
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,8.0
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,8.0
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0
5,1000013,P00350442,M,46-50,1,C,3,1,2,3.0,15.0
6,1000013,P00155442,M,46-50,1,C,3,1,1,11.0,15.0
7,1000013,P0094542,M,46-50,1,C,3,1,2,4.0,9.0
8,1000015,P00161842,M,26-35,7,A,1,0,10,13.0,16.0
9,1000022,P00067942,M,18-25,15,A,4+,0,5,14.0,8.0


### Converting categorical values to numerical

In [12]:
train_cat_to_num = {"Stay_In_Current_City_Years": {"4+": 4,},
              "Age": {"0-17": 15, "18-25": 21, "26-35": 30, "36-45": 40,"46-50": 48, "51-55": 12, "55+":60 }}

test_cat_to_num = {"Stay_In_Current_City_Years": {"4+": 4,},
              "Age": {"0-17": 15, "18-25": 21, "26-35": 30, "36-45": 40,"46-50": 48, "51-55": 12, "55+":60 }}

In [13]:
from sklearn.preprocessing import LabelEncoder

train_make = LabelEncoder()
train_bf["City_Category_Num"] = train_make.fit_transform(train_bf["City_Category"])
train_bf[["City_Category_Num", "City_Category"]].head(11)

test_make = LabelEncoder()
test_bf["City_Category_Num"] = test_make.fit_transform(test_bf["City_Category"])
test_bf[["City_Category_Num", "City_Category"]].head(11)

Unnamed: 0,City_Category_Num,City_Category
0,1,B
1,2,C
2,1,B
3,1,B
4,2,C
5,2,C
6,2,C
7,2,C
8,0,A
9,0,A


In [14]:
train_bf.replace(train_cat_to_num, inplace=True)
train_bf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,City_Category_Num
0,1000001,P00069042,F,15,10,A,2,0,3,8.0,8.0,8370,0
1,1000001,P00248942,F,15,10,A,2,0,1,6.0,14.0,15200,0
2,1000001,P00087842,F,15,10,A,2,0,12,8.0,8.0,1422,0
3,1000001,P00085442,F,15,10,A,2,0,12,14.0,8.0,1057,0
4,1000002,P00285442,M,60,16,C,4,0,8,8.0,8.0,7969,2


In [15]:
test_bf.replace(test_cat_to_num, inplace=True)
test_bf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,City_Category_Num
0,1000004,P00128942,M,48,7,B,2,1,1,11.0,8.0,8370,1
1,1000009,P00113442,M,30,17,C,0,0,3,5.0,8.0,15200,2
2,1000010,P00288442,F,40,1,B,4,1,5,14.0,8.0,1422,1
3,1000010,P00145342,F,40,1,B,4,1,4,9.0,8.0,1057,1
4,1000011,P00053842,F,30,1,C,1,0,4,5.0,12.0,7969,2


### Converting gender categorical to binary

In [16]:
train_bf["Gender"] = np.where(train_bf["Gender"].str.contains("F"), 1,0)

test_bf["Gender"] = np.where(test_bf["Gender"].str.contains("F"), 1,0)

In [17]:
train_bf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,City_Category_Num
0,1000001,P00069042,1,15,10,A,2,0,3,8.0,8.0,8370,0
1,1000001,P00248942,1,15,10,A,2,0,1,6.0,14.0,15200,0
2,1000001,P00087842,1,15,10,A,2,0,12,8.0,8.0,1422,0
3,1000001,P00085442,1,15,10,A,2,0,12,14.0,8.0,1057,0
4,1000002,P00285442,0,60,16,C,4,0,8,8.0,8.0,7969,2


In [18]:
test_bf.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,City_Category_Num
0,1000004,P00128942,0,48,7,B,2,1,1,11.0,8.0,8370,1
1,1000009,P00113442,0,30,17,C,0,0,3,5.0,8.0,15200,2
2,1000010,P00288442,1,40,1,B,4,1,5,14.0,8.0,1422,1
3,1000010,P00145342,1,40,1,B,4,1,4,9.0,8.0,1057,1
4,1000011,P00053842,1,30,1,C,1,0,4,5.0,12.0,7969,2


### Convert all columns to string datatype

In [19]:
train_bf = train_bf.applymap(str)
train_bf.dtypes
print (train_bf.dtypes)

User_ID                       object
Product_ID                    object
Gender                        object
Age                           object
Occupation                    object
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Purchase                      object
City_Category_Num             object
dtype: object


In [20]:
test_bf = test_bf.applymap(str)
test_bf.dtypes
print (test_bf.dtypes)

User_ID                       object
Product_ID                    object
Gender                        object
Age                           object
Occupation                    object
City_Category                 object
Stay_In_Current_City_Years    object
Marital_Status                object
Product_Category_1            object
Product_Category_2            object
Product_Category_3            object
Purchase                      object
City_Category_Num             object
dtype: object


### Encode categorical variables with label encoding

In [21]:
train_bf = np.array(train_bf)

for i in range(train_bf.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_bf[:,i]))
    train_bf[:, i] = lbl.transform(train_bf[:, i])
train_bf = train_bf.astype(int)

In [22]:
test_bf = np.array(test_bf)

for i in range(test_bf.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(test_bf[:,i]))
    test_bf[:, i] = lbl.transform(test_bf[:, i])
test_bf = test_bf.astype(int)

# XGB Boost Model

In [23]:
params = {}
params["min_child_weight"] = 20
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 10
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 67

plst = list(params.items())
num_rounds = 2540

In [24]:
xgtrain = xgb.DMatrix(train_bf,label=train_pur_reg)
model = xgb.train(plst, xgtrain, num_rounds)

xgtest = xgb.DMatrix(test_bf,label=test_pur_reg)

In [25]:
pred=model.predict(xgtest)

In [26]:
test['Purchase'] = pred
test.to_csv('final_xgb.csv', columns=['User_ID', 'Product_ID', 'Purchase'], index=False)

In [27]:
early_stopping=10
cv=xgb.cv(params,xgtrain,50,nfold=5,early_stopping_rounds=early_stopping, verbose_eval=1)
print(cv.mean(),cv.std())

[0]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[1]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[2]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[3]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[4]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[5]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[6]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[7]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[8]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
[9]	train-rmse:8996.77+3.17625	test-rmse:8996.77+12.6942
test-rmse-mean     8996.766406
test-rmse-std        12.694213
train-rmse-mean    8996.774805
train-rmse-std        3.176247
dtype: float64 test-rmse-mean    NaN
test-rmse-std     NaN
train-rmse-mean   NaN
train-rmse-std    NaN
dtype: float64


### XGB Boost CV score

# Transformer Model 

In [None]:
transformer = TfidfTransformer()
tfidf_train = transformer.fit_transform(train_bf.astype(int))

tfidf_test = transformer.fit_transform(test_bf.astype(int))

In [None]:
input_tfidf_train = tfidf_train.toarray()
input_tfidf_test = tfidf_test.toarray()

In [None]:
params = {}
params["min_child_weight"] = 10
params["subsample"] = 0.7
params["colsample_bytree"] = 0.7
params["scale_pos_weight"] = 0.8
params["silent"] = 1
params["max_depth"] = 12
params["nthread"] = 6
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["base_score"] = 1800
params["eval_metric"] = "rmse"
params["seed"] = 0

plst = list(params.items())
num_rounds = 1800

In [None]:
xgtrain = xgb.DMatrix(input_tfidf_train,label=train_pur_reg)
model = xgb.train(plst, xgtrain, num_rounds)

xgtest = xgb.DMatrix(input_tfidf_test,label=test_pur_reg)

In [None]:
pred=model.predict(xgtest)

In [None]:
test['Purchase'] = pred
test.to_csv('final_trans.csv', columns=['User_ID', 'Product_ID', 'Purchase'], index=False)

In [None]:
xgb.cv(params, xgtrain, num_rounds, nfold=5, seed=5500)