Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
import smogn

Data Collection and Processing

In [2]:
# load data
dataset = pd.read_csv('sales_dataset.csv')
dataset.head(3)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,order_id,order_date,status,item_id,sku,qty_ordered,price,value,discount_amount,total,...,SSN,Phone No.,Place Name,County,City,State,Zip,Region,User Name,Discount_Percent
0,100354678,10/1/2020,received,574772,oasis_Oasis-064-36,20,89.9,1798.0,0.0,1798.0,...,627-31-5251,405-959-1129,Vinson,Harmon,Vinson,OK,73571,South,jwtitus,0.0
1,100354678,10/1/2020,received,574774,Fantastic_FT-48,10,19.0,190.0,0.0,190.0,...,627-31-5251,405-959-1129,Vinson,Harmon,Vinson,OK,73571,South,jwtitus,0.0
2,100354680,10/1/2020,complete,574777,mdeal_DMC-610-8,8,149.9,1199.2,0.0,1199.2,...,627-31-5251,405-959-1129,Vinson,Harmon,Vinson,OK,73571,South,jwtitus,0.0


Data Preprocessing

In [3]:
# keep relevant features
dataset = dataset[['order_date', 'Customer_Since', 'category', 'Gender', 'age', 'Region', 'State', 'price', 'Discount_Percent', 'total']]
dataset.head(3)

Unnamed: 0,order_date,Customer_Since,category,Gender,age,Region,State,price,Discount_Percent,total
0,10/1/2020,8/22/2006,Men's Fashion,F,43,South,OK,89.9,0.0,1798.0
1,10/1/2020,8/22/2006,Men's Fashion,F,43,South,OK,19.0,0.0,190.0
2,10/1/2020,8/22/2006,Men's Fashion,F,43,South,OK,149.9,0.0,1199.2


In [4]:
# seperate month and year, drop years
dataset[['order_month','order_date', 'order_year']] = dataset.order_date.str.split("/", expand=True)
dataset[['customer_since_month','customer_since_date', 'customer_since_year']] = dataset.Customer_Since.str.split("/", expand=True)
dataset.drop('Customer_Since', axis=1, inplace=True)

In [5]:
dataset.columns

Index(['order_date', 'category', 'Gender', 'age', 'Region', 'State', 'price',
       'Discount_Percent', 'total', 'order_month', 'order_year',
       'customer_since_month', 'customer_since_date', 'customer_since_year'],
      dtype='object')

In [6]:
# get information about the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286392 entries, 0 to 286391
Data columns (total 14 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_date            286392 non-null  object 
 1   category              286392 non-null  object 
 2   Gender                286392 non-null  object 
 3   age                   286392 non-null  int64  
 4   Region                286392 non-null  object 
 5   State                 286392 non-null  object 
 6   price                 286392 non-null  float64
 7   Discount_Percent      286392 non-null  float64
 8   total                 286392 non-null  float64
 9   order_month           286392 non-null  object 
 10  order_year            286392 non-null  object 
 11  customer_since_month  286392 non-null  object 
 12  customer_since_date   286392 non-null  object 
 13  customer_since_year   286392 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 30

###### Categorical Features - months | category | gender | region | state 

Label Encoding

In [7]:
encoder = LabelEncoder()

In [8]:
cat_features = ['category', 'Gender', 'Region', 'State', 'order_date', 'order_month', 'customer_since_date', 'customer_since_month']

for cat_feature in cat_features:
    dataset[cat_feature] = encoder.fit_transform(dataset[cat_feature])

In [9]:
dataset.head(3)

Unnamed: 0,order_date,category,Gender,age,Region,State,price,Discount_Percent,total,order_month,order_year,customer_since_month,customer_since_date,customer_since_year
0,0,8,0,43,2,36,89.9,0.0,1798.0,1,2020,10,14,2006
1,0,8,0,43,2,36,19.0,0.0,190.0,1,2020,10,14,2006
2,0,8,0,43,2,36,149.9,0.0,1199.2,1,2020,10,14,2006


Split train and test dataset

In [10]:
X = dataset.drop(columns='total', axis=1)
Y = dataset['total']

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=2)

Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

#Creating StandardScaler Object
scaler = preprocessing.StandardScaler() 


X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test) 

Model Training

In [13]:
regressor = XGBRegressor()

In [14]:
regressor.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

Evaluation

In [15]:
# prediction on training data
training_data_prediction = regressor.predict(X_train)
# R squared Value
r2_train = metrics.r2_score(Y_train, training_data_prediction)
print('R Squared value = ', r2_train)

R Squared value =  0.9379658260524534


In [16]:
# prediction on test data
test_data_prediction = regressor.predict(X_test)
# R squared Value
r2_test = metrics.r2_score(Y_test, test_data_prediction)
print('R Squared value = ', r2_test)

R Squared value =  0.8682961776778834


In [17]:
import pickle 
pickle_out = open("xgbregressor.pkl", "wb")
pickle.dump(regressor, pickle_out)
