In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, accuracy_score, f1_score
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import matplotlib.pyplot as plt
import numpy as np
from joblib import dump, load

In [2]:
df = pd.read_csv('Resources/sales.csv')
df.head()

Unnamed: 0,Date,Store,Product,Is_Holiday,Base Price,Price,Weekly_Units_Sold,weekly_sales,year,month,day,week_of_year,promotion
0,2010-02-05,1,1,False,9.99,7.99,245,1957.55,2010,2,5,5,1
1,2010-02-12,1,1,True,9.99,7.99,453,3619.47,2010,2,12,6,1
2,2010-02-19,1,1,False,9.99,7.99,409,3267.91,2010,2,19,7,1
3,2010-02-26,1,1,False,9.99,7.99,191,1526.09,2010,2,26,8,1
4,2010-03-05,1,1,False,9.99,9.99,145,1448.55,2010,3,5,9,0


In [3]:
df=df.drop(columns=['Date', 'year', 'day', 'month','weekly_sales'])
df.head()

Unnamed: 0,Store,Product,Is_Holiday,Base Price,Price,Weekly_Units_Sold,week_of_year,promotion
0,1,1,False,9.99,7.99,245,5,1
1,1,1,True,9.99,7.99,453,6,1
2,1,1,False,9.99,7.99,409,7,1
3,1,1,False,9.99,7.99,191,8,1
4,1,1,False,9.99,9.99,145,9,0


In [4]:
df=df[['Store','Product','week_of_year','Base Price','Price','promotion','Is_Holiday','Weekly_Units_Sold']]
df.head()

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold
0,1,1,5,9.99,7.99,1,False,245
1,1,1,6,9.99,7.99,1,True,453
2,1,1,7,9.99,7.99,1,False,409
3,1,1,8,9.99,7.99,1,False,191
4,1,1,9,9.99,9.99,0,False,145


In [5]:
df['Temp']='_'
df['Store'] = df['Temp'].str.cat(df['Store'].values.astype(str))
df.head()

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold,Temp
0,_1,1,5,9.99,7.99,1,False,245,_
1,_1,1,6,9.99,7.99,1,True,453,_
2,_1,1,7,9.99,7.99,1,False,409,_
3,_1,1,8,9.99,7.99,1,False,191,_
4,_1,1,9,9.99,9.99,0,False,145,_


In [6]:
del df['Temp']
df

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold
0,_1,1,5,9.99,7.99,1,False,245
1,_1,1,6,9.99,7.99,1,True,453
2,_1,1,7,9.99,7.99,1,False,409
3,_1,1,8,9.99,7.99,1,False,191
4,_1,1,9,9.99,9.99,0,False,145
...,...,...,...,...,...,...,...,...
3856,_10,3,39,19.99,19.99,0,False,133
3857,_10,3,40,19.99,19.99,0,False,133
3858,_10,3,41,19.99,19.99,0,False,127
3859,_10,3,42,19.99,19.99,0,False,133


In [7]:
df['Temp']='_'
df['Product'] = df['Temp'].str.cat(df['Product'].values.astype(str))
df.head()

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold,Temp
0,_1,_1,5,9.99,7.99,1,False,245,_
1,_1,_1,6,9.99,7.99,1,True,453,_
2,_1,_1,7,9.99,7.99,1,False,409,_
3,_1,_1,8,9.99,7.99,1,False,191,_
4,_1,_1,9,9.99,9.99,0,False,145,_


In [8]:
del df['Temp']
df

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold
0,_1,_1,5,9.99,7.99,1,False,245
1,_1,_1,6,9.99,7.99,1,True,453
2,_1,_1,7,9.99,7.99,1,False,409
3,_1,_1,8,9.99,7.99,1,False,191
4,_1,_1,9,9.99,9.99,0,False,145
...,...,...,...,...,...,...,...,...
3856,_10,_3,39,19.99,19.99,0,False,133
3857,_10,_3,40,19.99,19.99,0,False,133
3858,_10,_3,41,19.99,19.99,0,False,127
3859,_10,_3,42,19.99,19.99,0,False,133


In [9]:
df['Temp']='_'
df['week_of_year'] = df['Temp'].str.cat(df['week_of_year'].values.astype(str))
del df['Temp']
df.head()

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold
0,_1,_1,_5,9.99,7.99,1,False,245
1,_1,_1,_6,9.99,7.99,1,True,453
2,_1,_1,_7,9.99,7.99,1,False,409
3,_1,_1,_8,9.99,7.99,1,False,191
4,_1,_1,_9,9.99,9.99,0,False,145


In [10]:
#  LabelEncoding Is_Holiday column
df['Is_Holiday']=LabelEncoder().fit_transform(df['Is_Holiday'])
df.head()

Unnamed: 0,Store,Product,week_of_year,Base Price,Price,promotion,Is_Holiday,Weekly_Units_Sold
0,_1,_1,_5,9.99,7.99,1,0,245
1,_1,_1,_6,9.99,7.99,1,1,453
2,_1,_1,_7,9.99,7.99,1,0,409
3,_1,_1,_8,9.99,7.99,1,0,191
4,_1,_1,_9,9.99,9.99,0,0,145


In [11]:
df.to_csv("Resources/preprocessed.csv")

In [12]:
sc1=df[df['Product']=='_1']
sc2=df[df['Product']=='_2']
sc3=df[df['Product']=='_3']

In [13]:
# Create features
X=sc3.drop(columns=['Weekly_Units_Sold'], axis = 1)

X=pd.get_dummies(X)

# Create target. 
y = sc3['Weekly_Units_Sold']

In [14]:
X

Unnamed: 0,Base Price,Price,promotion,Is_Holiday,Store__1,Store__10,Store__2,Store__3,Store__4,Store__5,...,week_of_year__48,week_of_year__49,week_of_year__5,week_of_year__50,week_of_year__51,week_of_year__52,week_of_year__6,week_of_year__7,week_of_year__8,week_of_year__9
286,19.99,19.99,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
287,19.99,19.99,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
288,19.99,19.99,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
289,19.99,19.99,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
290,19.99,19.99,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3856,19.99,19.99,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3857,19.99,19.99,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3858,19.99,19.99,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3859,19.99,19.99,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X.describe()

Unnamed: 0,Base Price,Price,promotion,Is_Holiday,Store__1,Store__10,Store__2,Store__3,Store__4,Store__5,...,week_of_year__48,week_of_year__49,week_of_year__5,week_of_year__50,week_of_year__51,week_of_year__52,week_of_year__6,week_of_year__7,week_of_year__8,week_of_year__9
count,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,...,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0,1287.0
mean,20.717273,19.117413,0.153846,0.06993,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,...,0.013986,0.013986,0.020979,0.013986,0.013986,0.013986,0.020979,0.020979,0.020979,0.020979
std,0.962465,3.822572,0.360941,0.255128,0.314392,0.314392,0.314392,0.314392,0.314392,0.314392,...,0.117478,0.117478,0.14337,0.117478,0.117478,0.117478,0.14337,0.14337,0.14337,0.14337
min,19.99,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.99,19.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,19.99,19.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,21.99,21.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,21.99,21.99,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Prep Data

In [16]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Gradient Boosting Regression

In [17]:
model=GradientBoostingRegressor()
model.fit(X_train_scaled, y_train)
pred=model.predict(X_test_scaled)

# for classification we use accuracy and F1 score
print(model.score(X_train_scaled,y_train))
print(model.score(X_test_scaled, y_test))


# for regression we use R2 score and MAE(mean absolute error)
# all other steps will be same as classification as shown above

print(mean_absolute_error(y_test,pred))
print(r2_score(y_test,pred))

0.960964461631848
0.9381445377237883
22.742096506007595
0.9381445377237883


In [18]:
# dump(model, 'filename.joblib') 

In [19]:
GBR = GradientBoostingRegressor()

parameters = {'learning_rate': [0.045, 0.05, 0.055, 0.06, 0.065],
              'subsample'    : [0.09, 0.085, 0.08, 0.075, 0.07],
              'n_estimators' : [900, 950, 1000, 1050, 1100],
              'max_depth'    : [6, 7, 8, 9, 10]
             }

grid_GBR = GridSearchCV(estimator=GBR, param_grid = parameters, cv = 2, n_jobs=-1)
grid_GBR.fit(X_train_scaled, y_train)

# Generate predictions
y_pred = grid_GBR.predict(X_test_scaled)

# Generate r-squared
score = r2_score(y_test, y_pred)

print("Results from Grid Search")
print("\n The best score across ALL searched params:\n",grid_GBR.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_GBR.best_params_)
print("\n r-squared:\n",score)

Results from Grid Search

 The best score across ALL searched params:
 0.8732056100375147

 The best parameters across ALL searched params:
 {'learning_rate': 0.06, 'max_depth': 6, 'n_estimators': 1000, 'subsample': 0.09}

 r-squared:
 0.9587797664950938


In [20]:
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 150, 200, 250, 300],
    max_depth=[2, 5, 10, 15, 20],
    min_samples_leaf=[1, 5, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10, 20, 30, 50],
)

model = GradientBoostingRegressor()

# model=GradientBoostingRegressor()
model.fit(X_train_scaled, y_train)
pred=model.predict(X_test_scaled)

RanSearch = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=75,  # increase this if computational budget allows
    n_jobs=2,
    random_state=0,
).fit(X_train, y_train)

# Generate predictions
y_pred = model.predict(X_test_scaled)

# Generate r-squared
score = r2_score(y_test, y_pred)

print("Results from Randomized Search")
print("\n The best score across ALL searched params:\n",RanSearch.best_score_)
print("\n The best parameters across ALL searched params:\n",RanSearch.best_params_)
print("\n r-squared:\n",score)


Results from Randomized Search

 The best score across ALL searched params:
 0.9287003551361833

 The best parameters across ALL searched params:
 {'n_estimators': 150, 'min_samples_split': 20, 'min_samples_leaf': 1, 'max_depth': 15, 'learning_rate': 0.1}

 r-squared:
 0.9381445377237884
