<h1 style="color:#2e658b; text-align:center; font-size:250%">BLACK FRIDAY HACKATHON</h1>

In [53]:
#########################
# Importing the Libraries
#########################

import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import math
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [3]:
# ------------------------------------------------------------------
# Read the competition CSVs (looked up relative to the working directory)
# ------------------------------------------------------------------

train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [4]:
# Keep copies of the raw identifiers before they are label-encoded below;
# the submission file is built from these original values.

ids_test, product_ids_test = test['User_ID'].copy(), test['Product_ID'].copy()

In [5]:
# Training Set

train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [6]:
# Testing Set

test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3
0,1000004,P00128942,M,46-50,7,B,2,1,1,11.0,
1,1000009,P00113442,M,26-35,17,C,0,0,3,5.0,
2,1000010,P00288442,F,36-45,1,B,4+,1,5,14.0,
3,1000010,P00145342,F,36-45,1,B,4+,1,4,9.0,
4,1000011,P00053842,F,26-35,1,C,1,0,4,5.0,12.0


In [7]:
# Overview of the training frame: shape, column names and target distribution

n_rows, n_cols = train.shape
print("No of Rows:", n_rows)
print("No of Columns:", n_cols)
print("\nColumns:-\n{}".format(train.columns.values))
print("\nStats of 'Purchase' Variable:-\n{}".format(train['Purchase'].describe()))

No of Rows: 550068
No of Columns: 12

Columns:-
['User_ID' 'Product_ID' 'Gender' 'Age' 'Occupation' 'City_Category'
 'Stay_In_Current_City_Years' 'Marital_Status' 'Product_Category_1'
 'Product_Category_2' 'Product_Category_3' 'Purchase']

Stats of 'Purchase' Variable:-
count    550068.000000
mean       9263.968713
std        5023.065394
min          12.000000
25%        5823.000000
50%        8047.000000
75%       12054.000000
max       23961.000000
Name: Purchase, dtype: float64


In [8]:
# Looking at the dtypes of every column in the training frame

train.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [9]:
# Columns stored with pandas' 'object' dtype are reported as categorical,
# every other column as numerical.
object_mask = (train.dtypes == 'object').values

# Categorical Features
print("Categorical Features:-\n{}".format(np.array(train.columns[object_mask])))

# Numerical Features
print("\nNumerical Features:-\n{}".format(np.array(train.columns[~object_mask])))

Categorical Features:-
['Product_ID' 'Gender' 'Age' 'City_Category' 'Stay_In_Current_City_Years']

Numerical Features:-
['User_ID' 'Occupation' 'Marital_Status' 'Product_Category_1'
 'Product_Category_2' 'Product_Category_3' 'Purchase']


In [10]:
# Shorter names for the long column headers (applied to both frames)

my_cols = {
    'Stay_In_Current_City_Years': 'City_Stay',
    'City_Category': 'City_Cat',
    'Product_Category_1': 'Prod_Cat1',
    'Product_Category_2': 'Prod_Cat2',
    'Product_Category_3': 'Prod_Cat3',
}

for frame in (train, test):
    frame.rename(columns=my_cols, inplace=True)

train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Cat,City_Stay,Marital_Status,Prod_Cat1,Prod_Cat2,Prod_Cat3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


---

## Feature Engineering

In [11]:
#############
# NULL Values
#############

# Report the absolute and relative number of missing values per sparse column.
for position, column in enumerate(['Prod_Cat2', 'Prod_Cat3']):
    n_missing = train[column].isnull().sum()
    separator = "\n" if position else ""   # blank line between the two reports
    print(separator + column + " Null values", n_missing)
    print("Percentage", (n_missing/len(train)*100), "%")

Prod_Cat2 Null values 173638
Percentage 31.5666426696 %

Prod_Cat3 Null values 383247
Percentage 69.6726586531 %


In [12]:
### Dropping Prod_Cat2 and Prod_Cat3 (roughly 32% and 70% of their values are missing in train)

for frame in (train, test):
    frame.drop(['Prod_Cat2', 'Prod_Cat3'], axis=1, inplace=True)

In [13]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Cat,City_Stay,Marital_Status,Prod_Cat1,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,7969


---

In [None]:
### Age Variable

In [14]:
train['Age'].value_counts()

26-35    219587
36-45    110013
18-25     99660
46-50     45701
51-55     38501
55+       21504
0-17      15102
Name: Age, dtype: int64

In [15]:
# Replace each age bracket label by a single representative numeric age.
# An unknown bracket raises KeyError instead of being silently ignored.
age_dict = {'0-17': 15, '18-25': 21, '26-35': 30, '36-45': 38, '46-50': 48, '51-55': 53, '55+': 60}

for frame in (train, test):
    frame['Age'] = frame['Age'].apply(age_dict.__getitem__)

In [16]:
train.Age.value_counts()

30    219587
38    110013
21     99660
48     45701
53     38501
60     21504
15     15102
Name: Age, dtype: int64

---

In [None]:
### Gender Variable

In [17]:
train.Gender.value_counts()

M    414259
F    135809
Name: Gender, dtype: int64

In [18]:
# Binary-encode Gender: male -> 1, female -> 0
gender_dict = {'F': 0, 'M': 1}

for frame in (train, test):
    frame['Gender'] = frame['Gender'].apply(gender_dict.__getitem__)

In [19]:
train.Gender.value_counts()

1    414259
0    135809
Name: Gender, dtype: int64

---

In [None]:
### City_Category Variable

In [20]:
train['City_Cat'].value_counts()

B    231173
C    171175
A    147720
Name: City_Cat, dtype: int64

In [21]:
# One-hot encode City_Cat. drop_first=True omits the first level, so only
# City_Cat_B and City_Cat_C are created.
# NOTE(review): train and test are encoded independently; this assumes both
# frames contain the same set of city categories - confirm.
dummy_options = {'columns': ['City_Cat'], 'drop_first': True}

train = pd.get_dummies(train, **dummy_options)
test = pd.get_dummies(test, **dummy_options)

In [22]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,Purchase,City_Cat_B,City_Cat_C
0,1000001,P00069042,0,15,10,2,0,3,8370,0,0
1,1000001,P00248942,0,15,10,2,0,1,15200,0,0
2,1000001,P00087842,0,15,10,2,0,12,1422,0,0
3,1000001,P00085442,0,15,10,2,0,12,1057,0,0
4,1000002,P00285442,1,60,16,4+,0,8,7969,0,1


---

In [None]:
### Curr_City_Stay Variable

In [23]:
train.City_Stay.value_counts()

1     193821
2     101838
3      95285
4+     84726
0      74398
Name: City_Stay, dtype: int64

In [24]:
# Turn the string-valued stay duration into integers; the open-ended '4+' bucket becomes 4
City_Stay_dict = {'4+': 4, '3': 3, '2': 2, '1': 1, '0': 0}

for frame in (train, test):
    frame['City_Stay'] = frame['City_Stay'].apply(City_Stay_dict.__getitem__)

In [25]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,Purchase,City_Cat_B,City_Cat_C
0,1000001,P00069042,0,15,10,2,0,3,8370,0,0
1,1000001,P00248942,0,15,10,2,0,1,15200,0,0
2,1000001,P00087842,0,15,10,2,0,12,1422,0,0
3,1000001,P00085442,0,15,10,2,0,12,1057,0,0
4,1000002,P00285442,1,60,16,4,0,8,7969,0,1


---

In [None]:
### User_ID Variable

In [26]:
len(train.User_ID.unique())         # Total number of unique User_IDs = 5891

5891

In [27]:
# Label Encoding User_IDs
# The ID -> integer mapping is learnt from the training frame only and then
# re-used for the test frame.

le = LabelEncoder().fit(train['User_ID'])
train['User_ID'] = le.transform(train['User_ID'])
test['User_ID'] = le.transform(test['User_ID'])

---

In [None]:
#### Product_ID Variable

In [28]:
# Number of distinct Product_ID values per split (3631 in train, 3491 in test)
for split_name, frame in (("Train", train), ("Test", test)):
    print(split_name + " Set(Unique ID):", len(frame.Product_ID.unique()))

Train Set(Unique ID): 3631
Test Set(Unique ID): 3491


In [29]:
# Product IDs that occur in the test set but never in the training set

test_product_ids = set(test.Product_ID.unique())
train_product_ids = set(train.Product_ID.unique())
New_Product_IDs = list(test_product_ids - train_product_ids)

In [30]:
# Label Encoding Product_IDs
#
# The encoder is fitted on the training products only. Test products that never
# occur in train (New_Product_IDs) cannot be transformed, so they receive the
# sentinel code -1 instead.

le = LabelEncoder()

train['Product_ID'] = le.fit_transform(train['Product_ID'])               # Transform the Train Product_IDs

# `.loc` replaces the `.ix` indexer, which no longer exists in current pandas.
unseen_mask = test['Product_ID'].isin(New_Product_IDs)
New_Product_IDs.append(-1)                                                # Kept from the original cell: the sentinel
                                                                          # itself also counts as a "new" product

# Build the encoded column separately so the result is a clean integer column
# rather than an object column mixing strings, -1 and encoded integers.
product_codes = pd.Series(-1, index=test.index, dtype='int64')            # New Product_IDs in Test stay at -1
product_codes[~unseen_mask] = le.transform(test.loc[~unseen_mask, 'Product_ID'])  # Known IDs use the Train encoding
test['Product_ID'] = product_codes

In [31]:
# 46 product IDs occur only in the test set; 186 training products never occur in the test set
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,Purchase,City_Cat_B,City_Cat_C
0,0,672,0,15,10,2,0,3,8370,0,0
1,0,2376,0,15,10,2,0,1,15200,0,0
2,0,852,0,15,10,2,0,12,1422,0,0
3,0,828,0,15,10,2,0,12,1057,0,0
4,1,2734,1,60,16,4,0,8,7969,0,1


---

In [None]:
### Purchase Variable

In [32]:
# Reducing boundaries to decrease RMSE:
# purchases above the 99.9th percentile are capped at that percentile.

cutoff_purchase = np.percentile(train['Purchase'], 99.9)    # 99.9 percentile

# `clip` replaces the `.ix`-based assignment (`.ix` no longer exists in current pandas).
# The explicit float cast keeps 'Purchase' a float column, as shown in the displayed output.
train['Purchase'] = train['Purchase'].astype(float).clip(upper=cutoff_purchase)

In [33]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,Purchase,City_Cat_B,City_Cat_C
0,0,672,0,15,10,2,0,3,8370.0,0,0
1,0,2376,0,15,10,2,0,1,15200.0,0,0
2,0,852,0,15,10,2,0,12,1422.0,0,0
3,0,828,0,15,10,2,0,12,1057.0,0,0
4,1,2734,1,60,16,4,0,8,7969.0,0,1


In [34]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,City_Cat_B,City_Cat_C
0,3,1196,1,48,7,2,1,1,1,0
1,8,1043,1,30,17,0,0,3,0,1
2,9,2764,0,38,1,4,1,5,1,0
3,9,1358,0,38,1,4,1,4,1,0
4,10,529,0,30,1,1,0,4,0,1


---

In [None]:
####################################
####### Count Features #############
####################################


### A count feature holds, for every row, the number of training rows in the same group

# Eg. Age_Count contains the value counts of an age group:
#     Like '0-17' -> 15102

# NOTE: `count_features` is only a descriptive string and is not read by any code below.
# The following cell computes counts for Age, Occupation, Prod_Cat1 and User_ID only;
# the Product_Category_2/3 columns were dropped earlier and no Product_ID count is computed.
count_features = '''
Age_Count
Occupation_Count
Product_Category_1_Count
Product_Category_2_Count
Product_Category_3_Count
User_ID_Count
Product_ID_Count
'''

In [35]:
def getCountFeatures(compute_df, count_df, var_name):
    """Return, for each row of ``compute_df``, how often its ``var_name`` value occurs in ``count_df``.

    Parameters
    ----------
    compute_df : pandas.DataFrame
        Frame whose rows receive a count (one list entry per row, in row order).
    count_df : pandas.DataFrame
        Frame in which the occurrences of each ``var_name`` value are counted.
    var_name : str
        Column present in both frames.

    Returns
    -------
    list
        Count per row of ``compute_df``; ``None`` for values that never occur
        in ``count_df``.
    """
    groups_dict = count_df[var_name].value_counts().to_dict()

    # Iterating the single column avoids building a Series per row, which is
    # what made the previous DataFrame.iterrows() loop slow on large frames.
    return [groups_dict.get(name) for name in compute_df[var_name]]

In [36]:
# Add "<column>_Count" features to both frames; counts always come from the training frame.
for counted_col in ("Age", "Occupation", "Prod_Cat1", "User_ID"):
    feature_name = counted_col + "_Count"
    train[feature_name] = getCountFeatures(train, train, counted_col)
    test[feature_name] = getCountFeatures(test, train, counted_col)

---

In [None]:
####################################
####### Price Features #############
####################################


# NOTE: `price_features` is only a descriptive string listing the statistic suffixes;
# it is not read by any code below.
price_features = '''
MinPrice
MaxPrice
MeanPrice
25PercPrice
75PercPrice
'''

## These statistics of 'Purchase' are computed per group for these variables:-
# User_ID
# Product_ID
# Product_Category_1 (renamed to Prod_Cat1 above)

# For eg. User_ID_MinPrice, Product_ID_25PercPrice, etc.

In [37]:
def getPurchaseVar(compute_df, purchase_df, var_name):
    """Look up per-group 'Purchase' statistics for every row of ``compute_df``.

    The statistics (min, max, mean, 25th and 75th percentile of 'Purchase') are
    computed on ``purchase_df`` grouped by ``var_name`` and then looked up for
    each row of ``compute_df`` through that row's ``var_name`` value.

    Parameters
    ----------
    compute_df : pandas.DataFrame
        Frame whose rows receive the statistics (one list entry per row, in row order).
    purchase_df : pandas.DataFrame
        Frame containing ``var_name`` and a 'Purchase' column.
    var_name : str
        Grouping column present in both frames.

    Returns
    -------
    tuple of five lists
        ``(min_list, max_list, mean_list, twentyfive_list, seventyfive_list)``;
        an entry is ``None`` when the row's group does not occur in ``purchase_df``.
    """
    group = purchase_df.groupby(var_name)['Purchase']

    # One lookup table per statistic, in the order of the returned tuple.
    stat_lookups = (
        group.min().to_dict(),
        group.max().to_dict(),
        group.mean().to_dict(),
        group.quantile(0.25).to_dict(),
        group.quantile(0.75).to_dict(),
    )

    # Read the key column once instead of materialising every row with
    # DataFrame.iterrows(), which is slow on large frames.
    keys = compute_df[var_name].tolist()

    min_list, max_list, mean_list, twentyfive_list, seventyfive_list = (
        [lookup.get(key) for key in keys] for lookup in stat_lookups
    )

    return min_list, max_list, mean_list, twentyfive_list, seventyfive_list

In [39]:
# User_ID
# Purchase statistics per user, always computed on the training frame and
# attached to both frames as "User_ID_<statistic>" columns.

user_price_suffixes = ("MinPrice", "MaxPrice", "MeanPrice", "25PercPrice", "75PercPrice")

for frame in (train, test):
    user_stats = getPurchaseVar(frame, train, "User_ID")
    for suffix, values in zip(user_price_suffixes, user_stats):
        frame["User_ID_" + suffix] = values

In [41]:
# Product_ID
# Purchase statistics per product, always computed on the training frame and
# attached to both frames as "Product_ID_<statistic>" columns.

product_price_suffixes = ("MinPrice", "MaxPrice", "MeanPrice", "25PercPrice", "75PercPrice")

for frame in (train, test):
    product_stats = getPurchaseVar(frame, train, "Product_ID")
    for suffix, values in zip(product_price_suffixes, product_stats):
        frame["Product_ID_" + suffix] = values

In [43]:
# Product_Category_1
# Purchase statistics per product category, always computed on the training
# frame and attached to both frames as "Prod_Cat1_<statistic>" columns.

category_price_suffixes = ("MinPrice", "MaxPrice", "MeanPrice", "25PercPrice", "75PercPrice")

for frame in (train, test):
    category_stats = getPurchaseVar(frame, train, "Prod_Cat1")
    for suffix, values in zip(category_price_suffixes, category_stats):
        frame["Prod_Cat1_" + suffix] = values

In [44]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,Purchase,City_Cat_B,...,Product_ID_MinPrice,Product_ID_MaxPrice,Product_ID_MeanPrice,Product_ID_25PercPrice,Product_ID_75PercPrice,Prod_Cat1_MinPrice,Prod_Cat1_MaxPrice,Prod_Cat1_MeanPrice,Prod_Cat1_25PercPrice,Prod_Cat1_75PercPrice
0,0,672,0,15,10,2,0,3,8370.0,0,...,2648.0,13716.0,11870.863436,10804.0,13475.5,2638.0,13717.0,10096.705734,8198.0,13211.0
1,0,2376,0,15,10,2,0,1,15200.0,0,...,3880.0,19701.0,16304.030981,15312.0,19264.0,3790.0,19708.0,13606.218596,11546.0,15812.0
2,0,852,0,15,10,2,0,12,1422.0,0,...,343.0,1776.0,1237.892157,1043.75,1423.75,342.0,1778.0,1350.859894,1071.0,1723.0
3,0,828,0,15,10,2,0,12,1057.0,0,...,365.0,1778.0,1455.140762,1378.0,1736.0,342.0,1778.0,1350.859894,1071.0,1723.0
4,1,2734,1,60,16,4,0,8,7969.0,0,...,3920.0,10073.0,7692.763547,6174.5,8082.5,1939.0,10082.0,7498.958078,6036.0,9722.0


In [45]:
test.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Stay,Marital_Status,Prod_Cat1,City_Cat_B,City_Cat_C,...,Product_ID_MinPrice,Product_ID_MaxPrice,Product_ID_MeanPrice,Product_ID_25PercPrice,Product_ID_75PercPrice,Prod_Cat1_MinPrice,Prod_Cat1_MaxPrice,Prod_Cat1_MeanPrice,Prod_Cat1_25PercPrice,Prod_Cat1_75PercPrice
0,3,1196,1,48,7,2,1,1,1,0,...,3933.0,19708.0,15781.11859,12067.5,19248.5,3790.0,19708.0,13606.218596,11546.0,15812.0
1,8,1043,1,30,17,0,0,3,0,1,...,3077.0,13714.0,11746.665354,10731.25,13493.25,2638.0,13717.0,10096.705734,8198.0,13211.0
2,9,2764,0,38,1,4,1,5,1,0,...,1734.0,8900.0,5731.338028,5147.0,7027.0,1713.0,8907.0,6240.088178,5242.0,7156.0
3,9,1358,0,38,1,4,1,4,1,0,...,813.0,3526.0,1943.266667,1459.0,2129.5,684.0,3556.0,2329.659491,2058.0,2837.0
4,10,529,0,30,1,1,0,4,0,1,...,684.0,3556.0,2585.590829,2115.0,3421.5,684.0,3556.0,2329.659491,2058.0,2837.0


---

In [None]:
##############################################
# Preparing Feature Matrix and Response Vector
##############################################

In [46]:
# Response vector first, then every remaining column as the feature matrix
y = train['Purchase']
X = train.drop(['Purchase'], axis=1)

---

## Modelling

In [None]:
############################################################################
# Parameter set `param2` (higher learning rate, fewer boosting rounds)
param2 = {'objective': 'reg:linear', 'booster': 'gbtree', 'silent': 1,
          'max_depth': 10, 'eta': 0.1, 'nthread': 4, 'subsample': 0.8,
          'colsample_bytree': 0.8, 'min_child_weight': 20,
          'max_delta_step': 0, 'gamma': 0}

# Boosting iterations paired with `param2`. Kept under its own name so that the
# value assigned for `param1` below no longer overwrites it.
num_boost_round = 690
############################################################################


############################################################################
# Parameter set `param1` (lower learning rate, more boosting rounds)
param1 = {'objective': 'reg:linear', 'silent': 1, 'max_depth': 10,
          'eta': 0.03, 'subsample': 0.8, 'min_child_weight': 10,
          'seed': 0}

# Boosting iterations paired with `param1`.
num_boost_rounds = 1100
############################################################################

# NOTE(review): newer XGBoost releases rename 'reg:linear' to 'reg:squarederror'
# and replace 'silent' with 'verbosity' - adjust if the installed version complains.

---

In [54]:
#########
# DMatrix
#########

# DMatrix is an internal data structure used by XGBoost.
# It is optimized for both memory efficiency and training speed and holds
# the train and test data in the form XGBoost consumes.

# Training features together with their labels
dtrain = xgb.DMatrix(X.values, label=y)
# Test features only (the testing set has no labels)
dtest = xgb.DMatrix(test.values)

---

In [55]:
############################
# Function for XGBoost Model
############################

def XGBoost_Model(dtrain,  dtest,  num_boost_rounds,  param_dict, seed_no=0):
    """Train an XGBoost booster on ``dtrain`` and return its predictions for ``dtest``.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Training data including labels.
    dtest : xgb.DMatrix
        Data to predict on.
    num_boost_rounds : int
        Number of boosting iterations.
    param_dict : dict
        XGBoost booster parameters. The dict is not modified.
    seed_no : int, default 0
        Random seed; overrides any 'seed' entry in ``param_dict``.

    Returns
    -------
    The output of ``Booster.predict(dtest)``.
    """
    # Work on a copy so that repeated calls with different seeds do not
    # mutate the caller's parameter dict.
    params = dict(param_dict, seed=seed_no)

    # xgb.train names this argument `num_boost_round` (singular); the previous
    # `num_boost_rounds=` keyword is rejected with a TypeError.
    regressor = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_rounds)

    return regressor.predict(dtest)

Running model..


---

In [None]:
# For running XGBoost once (uses `param1` with `num_boost_rounds`)
# Saving it to test_pred1 - the name the submission cell reads
# (the result used to be stored as `test_preds1`, leaving `test_pred1` undefined).

print("Model Running...")
test_pred1 = XGBoost_Model(dtrain, dtest, num_boost_rounds, param1, seed_no=0)
print("Done OK.")

---

In [None]:
# For running XGBoost n times with n seeds and taking mean predictions of each row (uses `param2`)
# Saving it to test_pred2

seeds = [1122, 2244, 3366, 4488, 5500]  # Random Seed Numbers (In this case 5 seeds)

# Number of boosting rounds paired with `param2` (690, as listed beside `param2`
# in the parameter cell). Defined here so the loop does not depend on an
# undefined name.
param2_boost_rounds = 690

test_preds = np.zeros((len(test), len(seeds))) # One row per test sample, one column per seed;
                                               # every column is overwritten by that seed's predictions

for run, seed in enumerate(seeds):

    sys.stdout.write("\rXGB RUN:{}/{}".format(run+1, len(seeds)))   # For writing to the screen eg. RUN:1/5
    sys.stdout.flush()                                              # For flushing out the output

    test_preds[:, run] = XGBoost_Model(dtrain, dtest, param2_boost_rounds, param2, seed_no=seed)


test_pred2 = np.mean(test_preds, axis=1)  # Taking mean prediction of each row

---

In [56]:
# Submission files: the original (un-encoded) identifiers plus the predicted purchase amount

submission_columns = ['User_ID', 'Product_ID', 'Purchase']

submit1 = pd.DataFrame(
    {'User_ID': ids_test, 'Product_ID': product_ids_test, 'Purchase': test_pred1}
)[submission_columns]

submit2 = pd.DataFrame(
    {'User_ID': ids_test, 'Product_ID': product_ids_test, 'Purchase': test_pred2}
)[submission_columns]

In [57]:
# A negative purchase amount is impossible; such predictions are replaced by 12,
# the minimum 'Purchase' value observed in the training data.
# Each frame is filtered with its own mask (the previous code referenced an
# undefined `submit` frame), and `.loc` replaces the removed `.ix` indexer.
# NOTE(review): the "Submissions" directory presumably has to exist beforehand - confirm.
submit1.loc[submit1['Purchase'] < 0, 'Purchase'] = 12
submit1.to_csv("Submissions/final_solution-2.csv", index=False)

submit2.loc[submit2['Purchase'] < 0, 'Purchase'] = 12
submit2.to_csv("Submissions/final_solution-3.csv", index=False)

---