# Imports

In [None]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
from datetime import datetime
import time
from sklearn.model_selection import GridSearchCV
import statsmodels.api as sm


In [87]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

# Read Data

In [88]:
df = pd.read_csv('../data/cleaned_sales_train.csv')

In [89]:
df

Unnamed: 0,date,year,month,date_block_num,shop_id,item_id,item_price,item_cnt_day,sales,item_cnt_month,item_cnt_month_diff,item_cnt_day_diff
0,2013-01-01,2013,1.0,0.0,15.0,2308.0,799.0,1.0,799.0,8.0,,
1,2013-01-01,2013,1.0,0.0,18.0,10242.0,249.0,1.0,249.0,4.0,-4.0,
2,2013-01-01,2013,1.0,0.0,51.0,7501.0,285.0,1.0,285.0,2.0,-2.0,
3,2013-01-01,2013,1.0,0.0,18.0,9685.0,58.0,1.0,58.0,1.0,-1.0,
4,2013-01-01,2013,1.0,0.0,19.0,1894.0,598.5,1.0,598.5,1.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
950661,2013-12-15,2013,12.0,11.0,31.0,9053.0,249.0,1.0,249.0,2.0,-2.0,-523.0
950662,2013-12-15,2013,12.0,11.0,58.0,14755.0,149.0,1.0,149.0,8.0,6.0,-523.0
950663,2013-12-15,2013,12.0,11.0,58.0,1523.0,799.0,1.0,799.0,2.0,-6.0,-523.0
950664,2013-12-15,2013,12.0,11.0,5.0,21652.0,499.0,1.0,499.0,1.0,-1.0,-523.0


In [90]:
# Drop rows with NaN values in the 'item_cnt_day_diff' and 'item_cnt_month_diff' columns
df_clean = df.dropna(subset=['item_cnt_day_diff', 'item_cnt_month_diff'])


In [76]:
df_clean.shape

(948964, 12)

In [77]:
df_clean.head()

Unnamed: 0,date,year,month,date_block_num,shop_id,item_id,item_price,item_cnt_day,sales,item_cnt_month,item_cnt_month_diff,item_cnt_day_diff
1354,2013-01-02,2013,1.0,0.0,54.0,2820.0,349.0,1.0,349.0,2.0,0.0,3889.0
1355,2013-01-02,2013,1.0,0.0,25.0,19369.0,549.0,1.0,549.0,6.0,4.0,3889.0
1356,2013-01-02,2013,1.0,0.0,56.0,18783.0,179.0,1.0,179.0,2.0,-4.0,3889.0
1357,2013-01-02,2013,1.0,0.0,56.0,19366.0,399.0,1.0,399.0,2.0,0.0,3889.0
1358,2013-01-02,2013,1.0,0.0,56.0,19154.0,199.0,1.0,199.0,2.0,0.0,3889.0


# Train test split

In [78]:
# Split based on time (for example, 80% for training, 20% for testing)
train_size = int(len(df_clean) * 0.8)

# Split the data
train = df_clean[:train_size]
test = df_clean[train_size:]

# You can also further split the training data into training and validation sets (e.g., 70-10-20 split)
validation_size = int(len(train) * 0.125)  # 10% of the total dataset
validation = train[-validation_size:]
train = train[:-validation_size]

print("Train shape:", train.shape)
print("Validation shape:", validation.shape)
print("Test shape:", test.shape)

Train shape: (664275, 12)
Validation shape: (94896, 12)
Test shape: (189793, 12)


In [79]:
df.columns

Index(['date', 'year', 'month', 'date_block_num', 'shop_id', 'item_id',
       'item_price', 'item_cnt_day', 'sales', 'item_cnt_month',
       'item_cnt_month_diff', 'item_cnt_day_diff'],
      dtype='object')

# Features Engineering -- Target : Month

In [80]:
# Feature Engineering - creating lag features
def create_lag_features(df, target_column, lags=[1, 2, 3, 6, 12]):
    """
    Creates lag features for a given target column.
    """
    for lag in lags:
        df[f'{target_column}_lag_{lag}'] = df.groupby(['shop_id', 'item_id'])[target_column].shift(lag)
    df.dropna(inplace=True)
    return df

# Apply lag features for 'item_cnt_month_diff' and 'item_cnt_day_diff'
train = create_lag_features(train, 'item_cnt_month_diff')
validation = create_lag_features(validation, 'item_cnt_month_diff')
test = create_lag_features(test, 'item_cnt_month_diff')

# Features for the model
features = ['year', 'month', 'date_block_num', 'shop_id', 'item_id', 'item_price'] + \
           [f'item_cnt_month_diff_lag_{lag}' for lag in [1, 2, 3, 6, 12]]

# Targets
target = 'item_cnt_month_diff'

# Define X and y for training, validation, and test datasets
X_train = train[features]
y_train = train[target]
X_val = validation[features]
y_val = validation[target]
X_test = test[features]
y_test = test[target]

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Verify the shapes after scaling
print(X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)


(78328, 11) (111, 11) (4239, 11)


# ML Experiments & Modeling with MLflow 

In [81]:
# Set the tracking URI for MLflow
mlflow.set_tracking_uri("../mlruns")

In [82]:
mlflow.set_experiment(experiment_id="0")

<Experiment: artifact_location='/Users/ilyeslenoob/Sales-Prediction-with-Deep-Learning/notebooks/../mlruns/0', creation_time=1732936617013, experiment_id='0', last_update_time=1732938747944, lifecycle_stage='active', name='SALES EXP Monthly', tags={}>

In [83]:
# Enable MLflow autologging for sklearn and XGBoost models
mlflow.sklearn.autolog()
mlflow.xgboost.autolog()
mlflow.autolog()

2024/11/30 18:04:14 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [14]:
def evaluate_and_log_model(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    # Create a custom run name by combining the model name and current date
    run_name = f"{model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    
    # Start timing
    start_time = time.time()  # Start time before training the model
    
    # Start MLflow run with a custom name
    with mlflow.start_run(run_name=run_name):
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred_train = model.predict(X_train)
        y_pred_val = model.predict(X_val)
        
        # Calculate RMSE for train and validation sets
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        # Calculate RMSE for test set
        y_pred_test = model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        
        # Calculate the elapsed time in seconds
        elapsed_time = time.time() - start_time  # End time - start time
        
        # Log parameters, metrics, and elapsed time to MLflow
        mlflow.log_param('model_name', model_name)
        mlflow.log_metric('train_rmse', train_rmse)
        mlflow.log_metric('val_rmse', val_rmse)
        mlflow.log_metric('test_rmse', test_rmse)
        mlflow.log_metric('elapsed_time', elapsed_time)  # Log elapsed time as a metric
        
        # Log the model (MLflow autologging will automatically log the model too)
        mlflow.sklearn.log_model(model, model_name)
        
        return val_rmse  # Return validation RMSE for comparison

## ML Models

In [15]:
# 1. Define basic models
models = {
    "XGBoost": XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5),
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10),
    "LinearRegression": LinearRegression()
}

In [16]:
# 2. Evaluate and log each model
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    val_rmse = evaluate_and_log_model(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test)
    print(f"{model_name} - Validation RMSE: {val_rmse}")

Evaluating XGBoost...




XGBoost - Validation RMSE: 3.266006313997887
Evaluating RandomForest...




RandomForest - Validation RMSE: 3.3732701693315095
Evaluating LinearRegression...




LinearRegression - Validation RMSE: 3.3685457926667777


In [18]:
# Define basic models
models = {
    "XGBoost": XGBRegressor(objective='reg:squarederror'),
    "RandomForest": RandomForestRegressor(),
}

# Define parameter grids for each model
param_grids = {
    "XGBoost": {
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'learning_rate': [0.01, 0.1]
    },
    "RandomForest": {
        'n_estimators': [50, 100],
        'max_depth': [5, 10]
    },
}

In [19]:

# Function to evaluate models with GridSearchCV and log results to MLflow with a custom run name
def evaluate_and_log_model_grid_search(model, model_name, param_grid, X_train, y_train, X_val, y_val, X_test, y_test):
    # If no parameter grid for LinearRegression
    if len(param_grid) == 0:
        model_grid = model
    else:
        model_grid = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

    # Create a custom run name using model name and the current date
    run_name = f"{model_name}_gridsearch_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    
    start_time = time.time()  # Start time before training the model
    
    with mlflow.start_run(run_name=run_name):  # Set the custom run name
        # Train the model using GridSearchCV
        model_grid.fit(X_train, y_train)
        
        end_time = time.time()  # End time after training the model
        
        # Get the best model from the grid search
        best_model = model_grid.best_estimator_
        
        # Make predictions
        y_pred_train = best_model.predict(X_train)
        y_pred_val = best_model.predict(X_val)
        
        # Calculate RMSE for train and validation sets
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        
        # Calculate RMSE for test set
        y_pred_test = best_model.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
        
        # Log parameters and metrics to MLflow
        mlflow.log_param('model_name', model_name)
        mlflow.log_params(model_grid.best_params_)
        mlflow.log_metric('train_rmse', train_rmse)
        mlflow.log_metric('val_rmse', val_rmse)
        mlflow.log_metric('test_rmse', test_rmse)
        mlflow.log_metric('elapsed_time', end_time - start_time)
        
        
        
        # Log the model (MLflow autologging will automatically log the model too)
        mlflow.sklearn.log_model(best_model, model_name)
        
        return val_rmse  # Return validation RMSE for comparison


In [20]:
# 2. Evaluate and log each model using GridSearchCV
for model_name, model in models.items():
    print(f"Evaluating {model_name} with GridSearchCV...")
    val_rmse = evaluate_and_log_model_grid_search(model, model_name, param_grids[model_name], X_train, y_train, X_val, y_val, X_test, y_test)
    print(f"{model_name} - Validation RMSE: {val_rmse}")


Evaluating XGBoost with GridSearchCV...


2024/11/30 04:18:47 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


XGBoost - Validation RMSE: 3.12416419883291
Evaluating RandomForest with GridSearchCV...


2024/11/30 04:19:09 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


RandomForest - Validation RMSE: 2.9649778518309895


## DL Models

In [84]:
import pickle

In [85]:
def evaluate_and_log_arima(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    # Create a custom run name using model name and the current date
    run_name = f"{model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    
    with mlflow.start_run(run_name=run_name):  # Set the custom run name
        # Train the ARIMA model
        arima_model = sm.tsa.ARIMA(y_train, order=(5,1,0))  # (p,d,q) parameters
        arima_model_fit = arima_model.fit()
        
        # Make predictions
        y_pred_train = arima_model_fit.predict(start=0, end=len(y_train)-1)
        y_pred_val = arima_model_fit.predict(start=len(y_train), end=len(y_train)+len(y_val)-1)
        
        # Calculate RMSE for train and validation sets
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        
        # Log parameters and metrics to MLflow
        mlflow.log_param('model_name', model_name)
        mlflow.log_metric('train_rmse', train_rmse)
        mlflow.log_metric('val_rmse', val_rmse)
        
        # Log the ARIMA model parameters (order)
        mlflow.log_param('arima_order', (5, 1, 0))  # The ARIMA order we used (p,d,q)
        
        # Serialize the ARIMA model using pickle
        arima_model_filename = "arima_model.pkl"
        with open(arima_model_filename, 'wb') as f:
            pickle.dump(arima_model_fit, f)
        
        # Log the ARIMA model file as an artifact
        mlflow.log_artifact(arima_model_filename)
        
        return val_rmse  # Return validation RMSE for comparison

In [86]:
evaluate_and_log_arima(None, "ARIMA", X_train, y_train, X_val, y_val, X_test, y_test)

To reduce model size, use `mlflow.statsmodels.autolog(log_models=False)` and manually log model by `mlflow.statsmodels.log_model(model, remove_data=True, artifact_path="model")`


9.644412756002716

In [32]:
from mlflow import log_param, log_metric
from sklearn.model_selection import ParameterGrid

def evaluate_and_log_arima(model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    # Create a custom run name using model name and the current date
    run_name = f"{model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
    
    # Define the parameter grid for ARIMA (p, d, q)
    param_grid = {
        'p': [0, 1, 2, 3],   # Auto-regressive order
        'd': [0, 1],         # Differencing order
        'q': [0, 1, 2, 3]    # Moving average order
    }
    
    best_val_rmse = float('inf')
    best_params = None
    best_arima_model_fit = None

    # Iterate through all combinations of parameters
    for params in ParameterGrid(param_grid):
        p, d, q = params['p'], params['d'], params['q']
        
        # Train the ARIMA model with the current combination of (p, d, q)
        arima_model = sm.tsa.ARIMA(y_train, order=(p, d, q))
        
        # Fit the ARIMA model
        arima_model_fit = arima_model.fit()
        
        # Make predictions
        y_pred_train = arima_model_fit.predict(start=0, end=len(y_train)-1)
        y_pred_val = arima_model_fit.predict(start=len(y_train), end=len(y_train)+len(y_val)-1)
        
        # Calculate RMSE for train and validation sets
        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
        val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
        
        # Log parameters and metrics to MLflow
        with mlflow.start_run(run_name=run_name):  # Set the custom run name
            mlflow.log_param('model_name', model_name)
            mlflow.log_param('arima_order', (p, d, q))  # Log current (p, d, q) values
            mlflow.log_metric('train_rmse', train_rmse)
            mlflow.log_metric('val_rmse', val_rmse)
            
            # Check if this model is the best one based on validation RMSE
            if val_rmse < best_val_rmse:
                best_val_rmse = val_rmse
                best_params = (p, d, q)
                best_arima_model_fit = arima_model_fit

            print(f"ARIMA ({p},{d},{q}) - Train RMSE: {train_rmse} | Validation RMSE: {val_rmse}")
    
    # Log the best model and its parameters
    if best_arima_model_fit:
        best_run_name = f"Best_ARIMA_{model_name}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
        with mlflow.start_run(run_name=best_run_name):  # Log the best model
            mlflow.log_param('best_arima_order', best_params)
            mlflow.log_metric('best_val_rmse', best_val_rmse)
            
            # Serialize the best ARIMA model
            arima_model_filename = "best_arima_model.pkl"
            with open(arima_model_filename, 'wb') as f:
                pickle.dump(best_arima_model_fit, f)
            
            # Log the best ARIMA model as an artifact
            mlflow.log_artifact(arima_model_filename)
            
            print(f"Best ARIMA model with order {best_params} - Validation RMSE: {best_val_rmse}")
    
    return best_val_rmse  # Return the best validation RMSE for comparison


In [None]:
# Run ARIMA model evaluation
evaluate_and_log_arima("ARIMA_Model", X_train, y_train, X_val, y_val, X_test, y_test)

# Best Model Results: Random Forest
* Best Hyperparameters:
    * max_depth: 5
    * n_estimators: 100

* Random Forest Regressor model performed better based on the validation RMSE.
    * Best max_depth: 5
    * Best n_estimators: 100


# Saving the model for future use


In [34]:
best_max_depth = 5
best_n_estimators = 100

# Create the Random Forest model with the best hyperparameters
legacy_monthly_model = RandomForestRegressor(n_estimators=best_n_estimators, 
                                             max_depth=best_max_depth, 
                                             random_state=42)

# Train the model on the full training data
legacy_monthly_model.fit(X_train, y_train)

# Save the trained model as a pickle file
with open("legacy_monthly_model.pkl", 'wb') as f:
    pickle.dump(legacy_monthly_model, f)

print("Legacy Monthly Model has been saved as 'legacy_monthly_model.pkl'.")

2024/11/30 04:51:33 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a813ab3b91e342c58624274bed8184fc', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Legacy Monthly Model has been saved as 'legacy_monthly_model.pkl'.


In [72]:
# save the scaler

with open("scaler.pkl", 'wb') as f:
    pickle.dump(scaler, f)
    
print("Scaler has been saved as 'scaler.pkl'.")

Scaler has been saved as 'scaler.pkl'.


# --- Monthly EXP Done ---

# Daily Sales Exp

In [52]:
# Feature Engineering - creating lag features
def create_lag_features(df, target_column, lags=[1, 2, 3, 6, 12]):
    """
    Creates lag features for a given target column.
    """
    for lag in lags:
        df[f'{target_column}_lag_{lag}'] = df.groupby(['shop_id', 'item_id'])[target_column].shift(lag)
    df.dropna(inplace=True)
    return df

# Apply lag features for 'item_cnt_month_diff' and 'item_cnt_day_diff'
train = create_lag_features(train, 'item_cnt_day_diff')
validation = create_lag_features(validation, 'item_cnt_day_diff')
test = create_lag_features(test, 'item_cnt_day_diff')

# Features for the model
features = ['year', 'month', 'date_block_num', 'shop_id', 'item_id', 'item_price'] + \
           [f'item_cnt_day_diff_lag_{lag}' for lag in [1, 2, 3, 6, 12]]

# Targets
target = 'item_cnt_day_diff'

# Define X and y for training, validation, and test datasets
X_train = train[features]
y_train = train[target]
X_val = validation[features]
y_val = validation[target]
X_test = test[features]
y_test = test[target]

# Verify the shapes after scaling
print(X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)


(20642, 11) (111, 11) (4239, 11)


In [54]:
experiment_name = "daily_sales_forecasting"
# Set the new experiment
mlflow.set_experiment(experiment_name)
mlflow.sklearn.autolog()
mlflow.xgboost.autolog()
mlflow.autolog()

2024/11/30 04:59:54 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.


In [55]:
# Define basic models
models = {
    "XGBoost": XGBRegressor(objective='reg:squarederror'),
    "RandomForest": RandomForestRegressor(),
}

# Define parameter grids for each model
param_grids = {
    "XGBoost": {
        'n_estimators': [100, 200],
        'max_depth': [5, 10],
        'learning_rate': [0.01, 0.1]
    },
    "RandomForest": {
        'n_estimators': [50, 100],
        'max_depth': [5, 10]
    },
}

In [56]:
# Before evaluating, check the shapes of your datasets
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")


y_train shape: (78328,)
y_val shape: (111,)
y_test shape: (4239,)


In [57]:
# 2. Evaluate and log each model using GridSearchCV
for model_name, model in models.items():
    print(f"Evaluating {model_name} with GridSearchCV...")
    val_rmse = evaluate_and_log_model_grid_search(model, model_name, param_grids[model_name], X_train, y_train, X_val, y_val, X_test, y_test)
    print(f"{model_name} - Validation RMSE: {val_rmse}")

Evaluating XGBoost with GridSearchCV...


2024/11/30 05:00:09 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


XGBoost - Validation RMSE: 604.8419631615602
Evaluating RandomForest with GridSearchCV...


2024/11/30 05:00:37 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


RandomForest - Validation RMSE: 600.3716826937632


In [None]:
# Run ARIMA model evaluation

evaluate_and_log_arima("ARIMA_Model", X_train, y_train, X_val, y_val, X_test, y_test)

## Conlusion :

Based on the results of ML Models and DL Models like Arima, we have seen poor predicting and forecasting perfomance so we conclude that we will just work with monthly predictions

# Exp Done