In [1]:
!pip install xgboost



In [3]:
#########################
# Importing the Libraries
#########################

import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import math
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [4]:
#####################
# Loading the dataset
#####################

# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Keep copies of the raw test identifiers before they are label-encoded below;
# the submission file is built from these original values.
ids_test, product_ids_test = test['User_ID'].copy(), test['Product_ID'].copy()

In [6]:
# Preview the first five raw training rows
train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [7]:
# Column names and overall dimensions of the training frame
n_rows, n_cols = train.shape
print("Columns:-> ", train.columns)
print("\nNumber of Columns: ", n_cols)
print("Number of Rows: ", n_rows)

Columns:->  Index(['User_ID', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Product_Category_3', 'Purchase'],
      dtype='object')

Number of Columns:  12
Number of Rows:  550068


<h2>Missing Value Treatment</h2>

In [9]:
# --- Missing Data with Percentages ---
#######################################

# Boolean mask of missing cells; .sum() counts the nulls per column and
# .count() gives the number of rows per column (the mask itself has no nulls).
null_mask = train.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count() * 100).round(2).sort_values(ascending=False)

# Side-by-side table: absolute count and percentage of missing values per column
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(15)

Unnamed: 0,Total,Percent
Product_Category_3,383247,69.67
Product_Category_2,173638,31.57
Purchase,0,0.0
Product_Category_1,0,0.0
Marital_Status,0,0.0
Stay_In_Current_City_Years,0,0.0
City_Category,0,0.0
Occupation,0,0.0
Age,0,0.0
Gender,0,0.0


In [10]:
### Dropping Product_Category_2 and Product_Category_3 (high percentage of missing values)

sparse_columns = ['Product_Category_2', 'Product_Category_3']
for frame in (train, test):
    # Same in-place column drop applied to both splits
    frame.drop(sparse_columns, axis=1, inplace=True)

In [11]:
## Converting the Age bracket labels to ordinal integer codes
# A label missing from the mapping becomes NaN in .map(), which makes astype(int) raise.
age_codes = {'0-17':0, '18-25':1, '26-35':2, '36-45':3, '46-50':4, '51-55':5, '55+':6}
for frame in (train, test):
    frame['Age'] = frame['Age'].map(age_codes).astype(int)

In [12]:
# City_Category letters to integer codes (same mapping for both splits)
city_codes = {'A':0, 'B':1, 'C':2}
for frame in (train, test):
    frame['City_Category'] = frame['City_Category'].map(city_codes).astype(int)

In [13]:
# Gender letters to integer codes: F -> 0, M -> 1
gender_codes = {'F':0, 'M':1}
for frame in (train, test):
    frame['Gender'] = frame['Gender'].map(gender_codes).astype(int)

In [14]:
# Stay_In_Current_City_Years strings to integers; the open-ended '4+' bucket becomes 4
stay_codes = {'0':0, '1':1, '2':2, '3':3, '4+':4}
for frame in (train, test):
    frame['Stay_In_Current_City_Years'] = frame['Stay_In_Current_City_Years'].map(stay_codes).astype(int)

In [15]:
# Label Encoding User_IDs
# The encoder is fitted on the train IDs only and then applied to both splits;
# transform() raises if test contains a User_ID that was not present in train.
le = LabelEncoder().fit(train['User_ID'])
train['User_ID'] = le.transform(train['User_ID'])
test['User_ID'] = le.transform(test['User_ID'])

In [16]:
# Number of distinct Product_ID values in each split
n_train_products = len(train['Product_ID'].unique())   # 3631 unique values in train - Product_ID
n_test_products = len(test['Product_ID'].unique())     # 3491 unique values in test - Product_ID

print("Train Set(Unique ID):", n_train_products)
print("Test Set(Unique ID):", n_test_products)

Train Set(Unique ID): 3631
Test Set(Unique ID): 3491


In [17]:
# Product IDs that occur in the test set but never in the train set

train_product_ids = set(train['Product_ID'].unique())
test_product_ids = set(test['Product_ID'].unique())
New_Product_IDs = list(test_product_ids.difference(train_product_ids))

In [18]:
# Label Encoding Product_IDs
#
# The encoder is fitted on the train Product_IDs. Test Product_IDs that also occur
# in train receive the same code; Product_IDs never seen in train are set to -1.
# (The previous implementation relied on DataFrame.ix, which was removed in
# pandas 1.0, and left test['Product_ID'] as a mixed object-dtype column.)

le = LabelEncoder()

train['Product_ID'] = le.fit_transform(train['Product_ID'])               # Transform the Train Product_IDs

# le.classes_ is sorted and the code of each class is its position, so this
# dict reproduces le.transform() for every Product_ID known from train.
product_code = {product_id: code for code, product_id in enumerate(le.classes_)}

# Unknown IDs map to NaN -> replaced by -1; the final cast gives a clean integer column.
test['Product_ID'] = test['Product_ID'].map(product_code).fillna(-1).astype(int)

In [19]:
# Reducing boundaries to decrease RMSE:
# cap the target at its 99.9th percentile so extreme purchases do not dominate training.

cutoff_purchase = np.percentile(train['Purchase'], 99.9)    # 99.9 percentile

# The cutoff is a float, so cast the column explicitly instead of relying on an
# implicit upcast during assignment. `.loc` replaces `.ix` (removed in pandas 1.0).
train['Purchase'] = train['Purchase'].astype(float)
train.loc[train['Purchase'] > cutoff_purchase, 'Purchase'] = cutoff_purchase

In [20]:
# Preview the training rows after encoding and capping
train.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Purchase
0,0,672,0,0,10,0,2,0,3,8370.0
1,0,2376,0,0,10,0,2,0,1,15200.0
2,0,852,0,0,10,0,2,0,12,1422.0
3,0,828,0,0,10,0,2,0,12,1057.0
4,1,2734,1,6,16,2,4,0,8,7969.0


In [21]:
# Preview the test rows after encoding
test.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1
0,3,1196,1,4,7,1,2,1,1
1,8,1043,1,2,17,2,0,0,3
2,9,2764,0,3,1,1,4,1,5
3,9,1358,0,3,1,1,4,1,4
4,10,529,0,2,1,2,1,0,4


In [22]:
# Splitting the training frame into target and feature matrix
y = train['Purchase']

# Build X with a non-mutating drop: it returns a new frame without the target,
# so `train` keeps its Purchase column and no slice of it is modified in place
# (the earlier `train[:]` + `drop(inplace=True)` mutated a slice of `train`).
X = train.drop(['Purchase'], axis=1)

In [27]:
# XGBoost training parameters ("Binga" parameter set)
# NOTE(review): 'reg:linear' and 'silent' are legacy names in newer XGBoost
# releases ('reg:squarederror' / 'verbosity') - confirm against the installed version.
param2 = dict(
    objective='reg:linear',
    booster='gbtree',
    silent=1,
    max_depth=10,
    eta=0.1,
    nthread=4,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=20,
    max_delta_step=0,
    gamma=0,
)

# Number of boosting iterations
num_boost_round = 690

In [24]:
# Creating the DMatrix objects
# DMatrix is the internal data structure used by XGBoost to hold the data
# passed to xgb.train() and Booster.predict().

train_features = X.values
test_features = test.values

dtrain = xgb.DMatrix(train_features, label=y)   # Training set with its Purchase labels
dtest = xgb.DMatrix(test_features)              # Test set (no labels available)

In [28]:
# Function for XGBoost Model
############################

def XGBoost_Model(dtrain,  dtest,  num_boost_rounds,  param_dict, seed_no=0):
    """Train one XGBoost booster and return its predictions for ``dtest``.

    Parameters
    ----------
    dtrain : xgb.DMatrix
        Training data passed to ``xgb.train``.
    dtest : xgb.DMatrix
        Data to predict on.
    num_boost_rounds : int
        Number of boosting iterations for this run.
    param_dict : dict
        Booster parameters. The dict is not modified; the seed is set on a copy.
    seed_no : int, default 0
        Value stored under the ``"seed"`` parameter for this run.

    Returns
    -------
    The result of ``Booster.predict(dtest)``.
    """
    # Work on a copy so the caller's parameter dict is not silently changed.
    params = dict(param_dict)
    params["seed"] = seed_no

    # Use the function argument; previously the module-level `num_boost_round`
    # was read here and the `num_boost_rounds` argument was ignored.
    regressor = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_boost_rounds)

    return regressor.predict(dtest)

In [29]:
# Run XGBoost once per seed and average the predictions of each test row

seeds = [1122, 2244, 3366, 4488, 5500]  # Random seed numbers (5 runs)

# One column of predictions per seed, one row per test observation;
# every cell starts at zero and is overwritten by the corresponding run.
test_preds = np.zeros((len(test), len(seeds)))

for run, seed in enumerate(seeds):

    sys.stdout.write("\rXGB RUN:{}/{}".format(run+1, len(seeds)))   # Progress indicator, e.g. RUN:1/5
    sys.stdout.flush()

    # `num_boost_round` is the name defined in the parameter cell; the previous
    # `num_boost_rounds` is never assigned and raises NameError on a fresh kernel.
    test_preds[:, run] = XGBoost_Model(dtrain, dtest, num_boost_round, param2, seed_no=seed)


test_preds = np.mean(test_preds, axis=1)  # Mean prediction across seeds for each row

XGB RUN:5/5

In [30]:
# Submission file: original (un-encoded) identifiers plus the averaged prediction

submission_columns = ['User_ID', 'Product_ID', 'Purchase']
submit = pd.DataFrame(
    {'User_ID': ids_test, 'Product_ID': product_ids_test, 'Purchase': test_preds},
    columns=submission_columns,
)

In [31]:
# Negative predictions are replaced by 12
# (presumably the minimum Purchase in train - TODO confirm against train['Purchase'].min()).
# `.loc` replaces `.ix`, which was removed in pandas 1.0.
submit.loc[submit['Purchase'] < 0, 'Purchase'] = 12
submit.to_csv("final_solution-1.csv", index=False)