* The training data, comprising time series of features **store_nbr**, **family**, and **onpromotion** as well as the target **sales**.
* **store_nbr** identifies the store at which the products are sold.
* **family** identifies the type of product sold.
* **sales** gives the total sales for a product family at a particular store at a given date. Fractional values are possible since products can be sold in fractional units (1.5 kg of cheese, for instance, as opposed to 1 bag of chips).
* **onpromotion** gives the total number of items in a product family that were being promoted at a store at a given date.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, r2_score, mean_absolute_error

In [None]:
# Load the training data: "date" is parsed into datetimes and "id" becomes the index.
# The path uses a forward slash so it resolves on Windows, Linux and macOS alike
# (a hard-coded backslash only works on Windows).
df = pd.read_csv('assets/train.csv', parse_dates=['date'], index_col='id')

In [None]:
# Preview the first rows of the loaded training frame.
df.head()

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary table: describe() statistics rounded to two decimals, extended with
# one extra row per additional per-column statistic.
df_eda = df.describe().round(2)

# (row label, function applied to the matching column of the raw frame)
extra_rows = (
    ('unique', lambda series: series.nunique()),
    ('duplicate', lambda series: series.duplicated().sum()),
    ('missing', lambda series: series.isna().sum()),
    ('type', lambda series: series.dtype),
)
for row_label, summarize in extra_rows:
    df_eda.loc[row_label] = [summarize(df[name]) for name in df_eda.columns]

# Transpose so each original column becomes a row of the displayed table.
df_eda.T

In [None]:
# Number of distinct product families, followed by the row count for each family.
df['family'].nunique(), df['family'].value_counts()

In [None]:
# Line plot of sales against the frame's index; the trailing semicolon keeps
# the notebook from echoing the returned Axes object.
df['sales'].plot();

In [None]:
# Work on a copy so the feature engineering below leaves the raw frame `df` untouched.
df_c = df.copy()

In [None]:
# Split the parsed date into separate calendar-year, month and day-of-month columns.
for feature_name, date_part in (
    ("saleYear", "year"),
    ("saleMonth", "month"),
    ("saleDay", "day"),
):
    df_c[feature_name] = getattr(df_c["date"].dt, date_part)

In [None]:
# Remove the raw date column in place; its year, month and day parts are kept as separate columns.
del df_c["date"]

In [None]:
# Replace each family label with an integer code. Category codes start at 0,
# so adding 1 makes the encoded values start at 1.
family_as_category = df_c['family'].astype("category")
df_c['family'] = family_as_category.cat.codes + 1

In [None]:
# Shuffle every row, then keep a reproducible 10% subsample of the shuffled frame.
df_shuffle = df_c.sample(frac=1, random_state=42)
mini_size = int(0.1 * len(df_shuffle))
df_shuffle_mini = df_shuffle.sample(n=mini_size, random_state=42)

# Separate the target column from the feature columns.
y = df_shuffle_mini['sales']
X = df_shuffle_mini.drop(columns='sales')

# Hold out 20% of the subsample for validation.
# NOTE(review): the split is random rather than chronological - confirm this is
# intended for time-series data.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
def show_scores(model):
    """Return R^2 and MAE of *model* on the training and validation splits.

    The splits are read from the notebook-level names ``X_train``, ``y_train``,
    ``X_valid`` and ``y_valid``; *model* must already be fitted and expose
    ``predict``. The result is a dict keyed by metric name.
    """
    fitted_on_train = model.predict(X_train)
    fitted_on_valid = model.predict(X_valid)

    scores = {}
    scores["Training R^2"] = r2_score(y_train, fitted_on_train)
    scores["Valid R^2"] = r2_score(y_valid, fitted_on_valid)
    scores["Training MAE"] = mean_absolute_error(y_train, fitted_on_train)
    scores["Valid MAE"] = mean_absolute_error(y_valid, fitted_on_valid)
    return scores

In [None]:
# Baseline: random forest regressor with default hyper-parameters, seeded for
# reproducibility, fitted on the training split.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
# R^2 and MAE of the random forest on the training and validation splits.
show_scores(model)

In [None]:
# Second candidate: XGBoost regressor with default hyper-parameters, seeded for
# reproducibility, fitted on the same training split.
model_x = XGBRegressor(random_state=42)
model_x.fit(X_train, y_train)

In [None]:
# R^2 and MAE of the XGBoost regressor on the training and validation splits.
show_scores(model_x)

In [None]:
# Hyper-parameter grid: every combination of the values below is a candidate.
parameters = {
    'max_depth': range(2, 10),           # depths 2 through 9
    'n_estimators': range(60, 220, 40),  # 60, 100, 140, 180
    'learning_rate': [0.1, 0.01, 0.05],
}

In [None]:
# Exhaustive search over `parameters` for the XGBoost regressor, scored by R^2
# with 5-fold cross-validation; verbose output reports progress while fitting.
grid_search = GridSearchCV(model_x, parameters, scoring='r2', cv=5, verbose=True)

In [None]:
# Run the cross-validated search on the training split only; the validation
# split is kept aside for the final check.
grid_search.fit(X_train, y_train)

In [None]:
# Score the search's selected model on the held-out validation split, using the
# search's configured scoring metric.
grid_search.score(X_valid, y_valid)