In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
import datetime as dt

In [2]:
df = pd.read_excel("updated database.xlsx")

In [3]:
df.sort_values(by = ["item_name", "Purchase_date"], inplace = True)

In [4]:
df["M"] = df["Purchase_date"].dt.to_period("M")
df["Q"] = df["Purchase_date"].dt.to_period("Q")
df["Year"] = df["Purchase_date"].dt.year
df["Month"] = df["Purchase_date"].dt.month
df["Day"] = df["Purchase_date"].dt.day

In [5]:
def func(type):
    if "Beverage" in type:
        return "Beverage"
    elif "Dishware" in type:
        return "Dishware"
    elif "Coffee" in type:
        return "Coffee Stuff"
    elif "Snacks" in type or "Chips" in type or "Nuts" in type:
        return "Snacks"
    elif "Chocolate" in type or "Sweets" in type:
        return "Sweets"
    else:
        return type

df["new_desc"] = df["desc"].map(func)

In [6]:
product_categories = {
    "Beverage":(5,6),
    "Dishware":(11,12),
    "Food - Healthy":(1,2),
    "Snacks": (5,6),
    "Coffee Stuff": (11,12),
    "Sweets":(11,12),
    "Gum - Mints": (5,6)
}

In [7]:
def func2(type, month, price):
    if type not in product_categories:
        return price
    elif product_categories[type][0] <= month <= product_categories[type][1]:
        return price * 1.05
    else:
        return price

df["new_total_price"] = df.apply(lambda x: func2(x["new_desc"], x["Month"], x["total_price"]), axis = 1)

In [8]:
df["Change"] = df["new_total_price"] - df["quantity_purchased"] * df["unit_price"]
li = []
for i in df["Change"]:
    if i > 0:
        li.append(1)
    elif i < 0: 
        li.append(-1)
    else:
        li.append(0)
df["Change"] = li

In [9]:
X = df.drop(columns=["customer_key","Change","Time_of_purchase","manufacturing_country", "supplier","store_sub-district","store_district","Delivery_date","Lead_time","Purchase_date","M","Q", "total_price","new_total_price", "desc"])
y = df["Change"]

In [10]:
cats = [i for i in X.columns if X[i].dtype == "object"]
X.dtypes

quantity_purchased      int64
item_name              object
unit_price              int64
store_region           object
Inventory_level       float64
Inventory_cost          int64
Year                    int32
Month                   int32
Day                     int32
new_desc               object
dtype: object

In [11]:
X_enc = pd.get_dummies(X, columns = cats)

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()

models = [dt,rf,gb]
dt_cv = cross_validate(dt, X_enc, y, cv=5, scoring= "accuracy", return_estimator= True)
dt_cv

{'fit_time': array([17.91406608, 18.16551971,  7.12626719,  8.14591432,  8.47373509]),
 'score_time': array([0.216048  , 0.1211381 , 0.17088223, 0.11431074, 0.09628654]),
 'estimator': [DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier(),
  DecisionTreeClassifier()],
 'test_score': array([0.88632 , 0.952135, 0.88653 , 0.89013 , 0.839575])}

In [14]:
li = []
for idx,estimator in enumerate(dt_cv['estimator']):
    feature_importances = pd.DataFrame(estimator.feature_importances_,
                                       index = X_enc.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)
    li.append(feature_importances)

pd.concat(li, axis = 1).mean(axis=1).sort_values(ascending=False).head()

Month                 0.274054
unit_price            0.151956
quantity_purchased    0.076985
new_desc_Beverage     0.061895
new_desc_Sweets       0.044110
dtype: float64

In [15]:
rf_cv = cross_validate(rf, X_enc, y, cv=5, scoring= "accuracy", return_estimator= True)
rf_cv

{'fit_time': array([124.39472318, 124.60781169, 124.66962862, 131.55884862,
        120.91271591]),
 'score_time': array([2.62022781, 2.39450336, 2.1837914 , 2.43301868, 2.32248116]),
 'estimator': [RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier(),
  RandomForestClassifier()],
 'test_score': array([0.87267 , 0.847745, 0.88425 , 0.805825, 0.83201 ])}

In [16]:
gb_cv = cross_validate(gb, X_enc, y, cv=5, scoring= "accuracy", return_estimator= True)
gb_cv

{'fit_time': array([ 1051.71907091,   975.18621683,   959.38928676,   903.90493107,
        12309.40265059]),
 'score_time': array([2.36678815, 2.40625763, 2.03237581, 2.53728724, 2.48424387]),
 'estimator': [GradientBoostingClassifier(),
  GradientBoostingClassifier(),
  GradientBoostingClassifier(),
  GradientBoostingClassifier(),
  GradientBoostingClassifier()],
 'test_score': array([0.864565, 0.93571 , 0.96787 , 0.8973  , 0.888575])}

In [17]:
print("hello")

hello
