In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [3]:
# Read the weekly sales CSV (path is relative to the notebook's directory)
# and swap every space in a column header for an underscore so the columns
# can be referenced with identifier-like names.
file_path = Path('../Resources/weekly_sales_complete.csv')
df = pd.read_csv(file_path).rename(columns=lambda label: label.replace(' ', '_'))

# Preview the first rows.
df.head()

Unnamed: 0,Item,Item_Code,Quantity,Unit_Price,Total_Sales_Amount,date
0,PINT Spotted Cow,3140,64.0,3.9375,252.0,2021-01-08
1,DBL RAIL Vodka,3455,37.0,4.945946,183.0,2021-01-08
2,BTL Miller High Life,3122,31.0,3.25,100.75,2021-01-08
3,PINT Stein,3141,29.0,3.517241,102.0,2021-01-08
4,SHOT Seagrams VO,3325,26.0,2.423077,63.0,2021-01-08


In [4]:
# Name of the column being predicted, kept as a one-element list.
target = ["Total_Sales_Amount"]

# Every column of the frame, in file order (the target sits between
# Unit_Price and date).
columns = [
    "Item",
    "Item_Code",
    "Quantity",
    "Unit_Price",
    *target,
    "date",
]

In [5]:
# Show the dtype pandas inferred for each column. In the saved output `Item`
# and `date` are plain `object` columns, i.e. `date` has not been parsed into
# a datetime type.
df.dtypes

Item                   object
Item_Code               int64
Quantity              float64
Unit_Price            float64
Total_Sales_Amount    float64
date                   object
dtype: object

In [7]:
# (rows, columns) of the loaded frame.
# NOTE(review): the saved output for this cell reports 15585 rows, while the
# outputs of later cells add up to 23846 rows -- the stored outputs appear to
# come from different runs; re-execute from a fresh kernel to confirm.
df.shape

(15585, 6)

In [6]:
# Split the frame into a 90% modelling sample and a 10% hold-out set.
#
# The sampled rows must keep their ORIGINAL index labels until they have been
# dropped from `df`. Resetting the index first (as this cell used to do)
# relabels the sample 0..n-1, so `df.drop(data.index)` would remove the first
# n rows by position instead of the rows that were actually sampled, and the
# "unseen" set would overlap the modelling set.
data = df.sample(frac=0.90, random_state=600)
data_unseen = df.drop(data.index).reset_index(drop=True)

# Only now is it safe to renumber the modelling sample.
data = data.reset_index(drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (21461, 6)
Unseen Data For Predictions: (2385, 6)


In [7]:
# Create our features
X = pd.get_dummies(data=df, columns=['Total_Sales_Amount', 'date'])
# Create our target
y = df["Total_Sales_Amount"]

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the columns
# of the feature matrix; the saved output covers the numeric columns and the
# one-hot indicator columns.
X.describe()

Unnamed: 0,Item_Code,Quantity,Unit_Price,Total_Sales_Amount_0.0,Total_Sales_Amount_0.13,Total_Sales_Amount_0.25,Total_Sales_Amount_0.26,Total_Sales_Amount_0.5,Total_Sales_Amount_0.53,Total_Sales_Amount_0.63,...,date_2021-07-23,date_2021-07-30,date_2021-08-06,date_2021-08-13,date_2021-08-20,date_2021-08-27,date_2021-09-03,date_2021-09-10,date_2021-09-17,date_2021-09-24
count,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,...,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0,23846.0
mean,3221.302944,11.079636,5.147705,0.052587,4.2e-05,0.014468,4.2e-05,0.006542,4.2e-05,8.4e-05,...,0.008094,0.007632,0.007213,0.007171,0.007716,0.007339,0.007297,0.0078,0.007548,0.007674
std,278.452809,17.737029,4.94421,0.223213,0.006476,0.119412,0.006476,0.080619,0.006476,0.009158,...,0.089602,0.087031,0.084624,0.084379,0.087504,0.085353,0.085111,0.087975,0.086555,0.087268
min,2696.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3113.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3234.0,5.0,4.304952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3452.0,13.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3717.0,293.0,206.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
from sklearn.model_selection import train_test_split

# Partition features and labels into train/test portions using
# train_test_split's default proportions; random_state=1 pins the shuffle so
# the same rows land in each partition on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

# Tally how many training rows carry each distinct target value.
Counter(y_train)

Counter({10.0: 296,
         28.0: 203,
         19.5: 59,
         36.0: 197,
         16.0: 478,
         7.0: 368,
         120.5: 3,
         39.0: 57,
         7.5: 111,
         74.0: 5,
         12.0: 496,
         14.0: 241,
         0.0: 958,
         30.0: 234,
         63.0: 73,
         97.0: 6,
         68.0: 38,
         201.0: 2,
         49.75: 8,
         9.0: 272,
         28.5: 29,
         60.0: 155,
         45.0: 129,
         272.0: 5,
         5974.0: 1,
         6.0: 444,
         0.5: 120,
         81.0: 34,
         5.0: 385,
         49.5: 30,
         8.0: 715,
         244.6: 1,
         32.0: 206,
         88.0: 65,
         15.0: 293,
         3.0: 421,
         105.5: 7,
         4.0: 567,
         27.0: 115,
         1.5: 341,
         54.0: 61,
         256.0: 8,
         13.0: 88,
         53.0: 12,
         243.75: 1,
         160.0: 27,
         21.0: 185,
         360.0: 7,
         90.5: 17,
         41.0: 16,
         171.5: 1,
         288.0: 3