In [83]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor, export_text, export_graphviz
import graphviz
from sklearn import tree
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, mean_absolute_error, mean_squared_error, r2_score

In [84]:
df_original = pd.read_csv("sales_train_final.csv")
df_original

Unnamed: 0.1,Unnamed: 0,date_block_num_x,item_id,shop_id,month_year_name_sort,item_cnt_month,item_cnt_last_month,item_cnt_last_year,month_year_name,month_of_year,quarter,year_half,year,red_day_not_sun
0,0,0,19,0,201301,0.0,0.0,0.0,January 2013,1,1,1,2013,25
1,1,0,19,1,201301,0.0,0.0,0.0,January 2013,1,1,1,2013,25
2,2,0,19,2,201301,0.0,0.0,0.0,January 2013,1,1,1,2013,25
3,3,0,19,3,201301,0.0,0.0,0.0,January 2013,1,1,1,2013,25
4,4,0,19,4,201301,0.0,0.0,0.0,January 2013,1,1,1,2013,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10913845,10913845,33,22167,55,201510,0.0,0.0,0.0,October 2015,10,4,2,2015,31
10913846,10913846,33,22167,56,201510,0.0,0.0,0.0,October 2015,10,4,2,2015,31
10913847,10913847,33,22167,57,201510,0.0,0.0,0.0,October 2015,10,4,2,2015,31
10913848,10913848,33,22167,58,201510,0.0,0.0,0.0,October 2015,10,4,2,2015,31


In [85]:
df_draft = df_original.copy()

In [86]:
mm_scaler = MinMaxScaler()
df_draft["item_cnt_last_month"] = mm_scaler.fit_transform(df_draft[["item_cnt_last_month"]])
df_draft["item_cnt_last_year"] = mm_scaler.fit_transform(df_draft[["item_cnt_last_year"]])
df_draft["red_day_not_sun"] = mm_scaler.fit_transform(df_draft[["red_day_not_sun"]])

In [87]:
unique_values = df_draft['date_block_num_x'].unique()[-2:]

test = df_draft[df_draft['date_block_num_x'].isin(unique_values)]

train = df_draft[~df_draft['date_block_num_x'].isin(unique_values)]

In [88]:
features = [
    "item_cnt_last_month",
    "item_cnt_last_year",
    "month_of_year",
    "red_day_not_sun",
    "quarter"
]
y_test = test["item_cnt_month"]
X_test = test[features]

y_train = train.loc[train["date_block_num_x"] >= 12, "item_cnt_month"]
X_train = train.loc[train["date_block_num_x"] >= 12, features]


In [89]:
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [90]:
model_xgb = XGBRegressor(max_depth = 2, early_stopping_rounds= 2)
model_xgb.fit(X=X_train, y=y_train, eval_set = [(X_test, y_test)])

[0]	validation_0-rmse:6.04284
[1]	validation_0-rmse:6.00642
[2]	validation_0-rmse:5.99497
[3]	validation_0-rmse:5.97918
[4]	validation_0-rmse:5.97918
[5]	validation_0-rmse:5.97396
[6]	validation_0-rmse:5.97396
[7]	validation_0-rmse:5.97396
[8]	validation_0-rmse:5.96925
[9]	validation_0-rmse:5.96925
[10]	validation_0-rmse:5.96406
[11]	validation_0-rmse:5.96406


In [91]:
model_tree = DecisionTreeRegressor(max_depth=3)
model_tree.fit(X=X_train, y=y_train)

In [92]:
y_test_pred_lr = model.predict(X_test)
y_train_pred_lr = model.predict(X_train)


In [93]:
y_test_pred_xgb = model_xgb.predict(X_test)
y_train_pred_xgb = model_xgb.predict(X_train)

In [94]:
y_test_pred_tree = model_tree.predict(X_test)
y_train_pred_tree = model_tree.predict(X_train)

In [95]:
y_pred_test_naive = X_test["item_cnt_last_month"]

In [96]:
r2_lr_test = model.score(X=X_test, y=y_test)
rmse_lr_test = np.sqrt(mean_squared_error(y_true=np.clip(y_test, 0, 20), y_pred=np.clip(y_test_pred_lr, 0, 20)))
r2_lr_train = model.score(X=X_train, y=y_train)
rmse_lr_train = np.sqrt(mean_squared_error(y_true=np.clip(y_train, 0, 20), y_pred=np.clip(y_train_pred_lr, 0, 20)))

r2_xgb_test = model_xgb.score(X=X_test, y=y_test)
rmse_xgb_test = np.sqrt(mean_squared_error(y_true=np.clip(y_test, 0, 20), y_pred=np.clip(y_test_pred_xgb, 0, 20)))
r2_xgb_train = model_xgb.score(X=X_train, y=y_train)
rmse_xgb_train = np.sqrt(mean_squared_error(y_true=np.clip(y_train, 0, 20), y_pred=np.clip(y_train_pred_xgb, 0, 20)))

r2_tree_test = model_tree.score(X_test, y=y_test)
rmse_tree_test = np.sqrt(mean_squared_error(y_true=np.clip(y_test, 0, 20), y_pred=np.clip(y_test_pred_tree, 0, 20)))
r2_tree_train = model_tree.score(X_train, y=y_train)
rmse_tree_train = np.sqrt(mean_squared_error(y_true=np.clip(y_train, 0, 20), y_pred=np.clip(y_train_pred_tree, 0, 20)))

rmse_naive = np.sqrt(mean_squared_error(y_true=np.clip(y_test, 0, 20), y_pred=np.clip(y_pred_test_naive, 0, 20)))



print(f"""
R2 Test LR: {r2_lr_test}      
RMSE Test LR: {rmse_lr_test}
R2 Test LR: {r2_lr_train}      
RMSE Test LR: {rmse_lr_train}
""")
print(f"""
R2 Test XGB: {r2_xgb_test}
RMSE Test XGB: {rmse_xgb_test}
R2 Train XGB: {r2_xgb_train}
RMSE Train XGB: {rmse_xgb_train}
""")
print(f"""
R2 Test Tree: {r2_tree_test}
RMSE Test Tree: {rmse_tree_test}
R2 Train Tree: {r2_tree_train}
RMSE Train Tree: {rmse_tree_train}
""")
print(f"""
R2 Test Naive: {r2_score(y_true=y_test, y_pred=y_pred_test_naive)}
RMSE Test Naive {rmse_naive}
""")


R2 Test LR: 0.16175962670710675      
RMSE Test LR: 1.032670386465554
R2 Test LR: 0.48589036618655357      
RMSE Test LR: 1.004296096197987


R2 Test XGB: 0.04707327226512881
RMSE Test XGB: 1.0088555191411392
R2 Train XGB: 0.14385419507521424
RMSE Train XGB: 0.9995823657939933


R2 Test Tree: 0.17936354359717077
RMSE Test Tree: 1.0007752109615455
R2 Train Tree: 0.5633758387180487
RMSE Train Tree: 0.9759734914935859


R2 Test Naive: -0.0021229575728256567
RMSE Test Naive 1.1699632574251642

