In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
# Load the data
file_path = Path('C:/Users/Rica6/Tracks_Tavern_Data/Resources/weekly_sales.csv')
df = pd.read_csv(file_path)
df.columns =  df.columns.str.replace(' ','_')

# Drop the null rows
#df = df.dropna()

# Convert the target column values to 0 and 1 based on their values
#x = {'Current': '0'}   
#df = df.replace(x)

df.head()

Unnamed: 0,Item,Item_Code,Quantity,Unit_Price,Total_Sales_Amount,date
0,RAIL Vodka,3339,46.0,4.826087,222.0,2019-02-15
1,PINT LKFT IPA,3136,44.0,3.579545,157.5,2019-02-15
2,SHOT Tullamore Dew,3327,38.0,4.0,152.0,2019-02-15
3,PINT Spotted Cow,3140,36.0,3.75,135.0,2019-02-15
4,PINT Miller High Life,3137,30.0,3.0,90.0,2019-02-15


In [5]:
columns = ["Item","Item_Code", "Quantity", "Unit_Price", "Total_Sales_Amount", "date"]

target = ["Total_Sales_Amount"]

In [6]:
df.dtypes

Item                   object
Item_Code               int64
Quantity              float64
Unit_Price            float64
Total_Sales_Amount    float64
date                   object
dtype: object

In [7]:
df.shape

(15585, 6)

In [8]:
data = df.sample(frac=0.95, random_state=786).reset_index(drop=True)
data_unseen = df.drop(data.index).reset_index(drop=True)

print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

Data for Modeling: (14806, 6)
Unseen Data For Predictions: (779, 6)


In [9]:
# Create our features
X = pd.get_dummies(data=df, columns=['Total_Sales_Amount', 'date'])
# Create our target
y = df["Total_Sales_Amount"]

In [10]:
X.describe()

Unnamed: 0,Item_Code,Quantity,Unit_Price,Total_Sales_Amount_0.0,Total_Sales_Amount_0.75,Total_Sales_Amount_1.5,Total_Sales_Amount_2.0,Total_Sales_Amount_2.25,Total_Sales_Amount_3.0,Total_Sales_Amount_3.25,...,date_2021-07-16,date_2021-07-23,date_2021-07-30,date_2021-08-06,date_2021-08-13,date_2021-08-20,date_2021-08-27,date_2021-09-03,date_2021-09-10,date_2021-09-17
count,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,...,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0,15585.0
mean,3362.503818,12.521976,5.273758,0.039525,0.001219,0.000834,0.000962,0.000513,0.019249,0.002695,...,0.008341,0.008791,0.008406,0.007956,0.007507,0.008213,0.008406,0.007892,0.008277,0.008213
std,173.109261,20.487179,5.395541,0.194847,0.034896,0.02887,0.03101,0.022651,0.137404,0.051844,...,0.090952,0.093348,0.091298,0.088846,0.086321,0.090256,0.091298,0.08849,0.090605,0.090256
min,3113.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3210.0,2.0,3.394737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3353.0,5.0,4.267241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,3492.0,14.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3717.0,293.0,206.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)
Counter(y_train)

Counter({26.0: 27,
         94.5: 6,
         135.0: 36,
         32.0: 172,
         5.0: 328,
         112.5: 8,
         484.0: 3,
         15.0: 279,
         6.0: 355,
         8.0: 646,
         14.0: 176,
         0.0: 476,
         4.5: 110,
         72.0: 71,
         42.0: 83,
         99.0: 20,
         30.0: 182,
         16.0: 415,
         4.0: 491,
         130.0: 12,
         44.0: 64,
         24.0: 321,
         3.0: 234,
         676.0: 1,
         123.0: 11,
         10.5: 15,
         12.0: 414,
         152.0: 11,
         115.0: 7,
         84.0: 51,
         34.0: 10,
         68.25: 5,
         312.0: 3,
         48.0: 140,
         50.0: 24,
         33.0: 58,
         9.0: 241,
         131.5: 2,
         264.0: 8,
         522.5: 1,
         209.0: 10,
         70.0: 23,
         31.5: 41,
         7.0: 268,
         18.0: 185,
         165.0: 39,
         37.5: 5,
         68.0: 40,
         114.0: 20,
         27.0: 85,
         40.0: 151,
         90.0: 5

# Balanced Random Forest Classifier

In [12]:
from imblearn.ensemble import BalancedRandomForestClassifier
random_forest = BalancedRandomForestClassifier(n_estimators = 100)
random_forest = random_forest.fit(X_train, y_train)

ValueError: could not convert string to float: 'BTL Miller Lite'