###### Importing Libraries

In [1]:
#Core libraries
import pandas as pd
import numpy as np

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

#MLOps
import mlflow
import mlflow.sklearn

#Database connection
import pymysql

#Warnings
import warnings
warnings.filterwarnings('ignore')


###### Importing Dataset

In [2]:
df = pd.read_csv("retail_sales_dataset.csv")

###### Data Preparation

In [3]:
# Check info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB


In [4]:
#Convert Date to datetime
df['Date'] = pd.to_datetime(df['Date'])

#Extract month and year
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year

#Aggregate to monthly sales
monthly_sales = df.groupby(['Year', 'Month'])['Total Amount'].sum().reset_index()
monthly_sales['Date'] = pd.to_datetime(monthly_sales[['Year', 'Month']].assign(DAY=1))
monthly_sales = monthly_sales.sort_values('Date')

monthly_sales.rename(columns={'Total Amount': 'Sales'}, inplace=True)
monthly_sales.head()


Unnamed: 0,Year,Month,Sales,Date
0,2023,1,35450,2023-01-01
1,2023,2,44060,2023-02-01
2,2023,3,28990,2023-03-01
3,2023,4,33870,2023-04-01
4,2023,5,53150,2023-05-01


In [28]:
monthly_sales.columns

Index(['Year', 'Month', 'Sales', 'Date', 'lag_1', 'lag_2', 'lag_3'], dtype='object')

######  Feature Engineering

In [5]:
#Create lag features
for lag in range(1, 4):
    monthly_sales[f'lag_{lag}'] = monthly_sales['Sales'].shift(lag)

#Drop NA rows
monthly_sales.dropna(inplace=True)


###### Train Test Split

In [6]:
#Features and target
X = monthly_sales[['Year', 'Month', 'lag_1', 'lag_2', 'lag_3']]
y = monthly_sales['Sales']

#Split without shuffle (time series)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

#Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [29]:
X_train.columns

Index(['Year', 'Month', 'lag_1', 'lag_2', 'lag_3'], dtype='object')

###### Define Models

In [7]:
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=100, random_state=42)
}


###### Connecting Mlflow with MYSQL Tracking

In [29]:
#mlflow.set_tracking_uri("mysql+pymysql://root:Best2410%23@localhost/monthly_forecast_db")
#mlflow.set_experiment("Sales_Prediction_Experiment")
#mlflow.set_registry_uri("mysql+pymysql://root:Best2410%23@localhost/monthly_forecast_db")


In [10]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_registry_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Monthly_Sales_Prediction")

2025/10/16 00:55:59 INFO mlflow.tracking.fluent: Experiment with name 'Monthly_Sales_Prediction' does not exist. Creating a new experiment.


<Experiment: artifact_location=('file:///C:/Users/JAMES TECH/Documents/MLOPS Projects/Monthly Sales '
 'Prediction/mlruns/1'), creation_time=1760601359844, experiment_id='1', last_update_time=1760601359844, lifecycle_stage='active', name='Monthly_Sales_Prediction', tags={}>

###### Train, Evaluate and Log Models

In [11]:
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        #Train
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)

        #Evaluate
        rmse = sqrt(mean_squared_error(y_test, preds))
        r2 = r2_score(y_test, preds)


        #Log params and metrics
        mlflow.log_param("model_name", name)
        if hasattr(model, 'n_estimators'):
            mlflow.log_param("n_estimators", model.n_estimators)
        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("R2_Score", r2)

        #Log model
        mlflow.sklearn.log_model(model, "model", registered_model_name=f"Sales_{name}_Model")

        print(f"{name} -> RMSE: {rmse:.2f}, R2: {r2:.2f}")


Successfully registered model 'Sales_LinearRegression_Model'.
2025/10/16 00:59:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Sales_LinearRegression_Model, version 1
Created version '1' of model 'Sales_LinearRegression_Model'.


LinearRegression -> RMSE: 32472.93, R2: -1.26
🏃 View run LinearRegression at: http://127.0.0.1:5000/#/experiments/1/runs/7f9545eb350e45ef908dfc144f69b131
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Successfully registered model 'Sales_RandomForestRegressor_Model'.
2025/10/16 01:00:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Sales_RandomForestRegressor_Model, version 1
Created version '1' of model 'Sales_RandomForestRegressor_Model'.


RandomForestRegressor -> RMSE: 27213.81, R2: -0.59
🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/b9b83181034645c8a80db6e2138a727a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Successfully registered model 'Sales_GradientBoostingRegressor_Model'.
2025/10/16 01:00:36 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Sales_GradientBoostingRegressor_Model, version 1
Created version '1' of model 'Sales_GradientBoostingRegressor_Model'.


GradientBoostingRegressor -> RMSE: 28287.47, R2: -0.72
🏃 View run GradientBoostingRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/85cbb39335e342139e8f11651cb4d177
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


###### Selecting the best model

In [18]:
import mlflow

experiments = mlflow.search_experiments()
for exp in experiments:
    print(exp.experiment_id, exp.name)


1 Monthly_Sales_Prediction
0 Default


###### Finding the best model

In [19]:

#Connect to MLflow tracking
mlflow.set_tracking_uri("http://127.0.0.1:5000")
client = MlflowClient()

# Use experiment ID
experiment_id = "1"

#Get all runs from this experiment
runs = client.search_runs(
    experiment_ids=[experiment_id],
    order_by=["metrics.RMSE ASC"],
    max_results=1
)

#Select the best run
best_run = runs[0]
print(f"Best Run ID: {best_run.info.run_id}")
print(f"Best RMSE: {best_run.data.metrics['RMSE']}")
print(f"Model Name: {best_run.data.params.get('model_name', 'Unknown')}")


Best Run ID: b9b83181034645c8a80db6e2138a727a
Best RMSE: 27213.812406960733
Model Name: RandomForestRegressor


######  Registering best model

In [20]:
registered_model_name = "Monthly_Sales_Model"
model_uri = "runs:/b9b83181034645c8a80db6e2138a727a/model"

#Register the model
result = mlflow.register_model(model_uri=model_uri, name=registered_model_name)

print(f"Model registered as: {registered_model_name}")
print(f"Model version: {result.version}")


Successfully registered model 'Monthly_Sales_Model'.
2025/10/16 03:05:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Monthly_Sales_Model, version 1
Created version '1' of model 'Monthly_Sales_Model'.


Model registered as: Monthly_Sales_Model
Model version: 1


###### Transition the model to production

In [21]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

client.transition_model_version_stage(
    name=registered_model_name,
    version=result.version,
    stage="Production"
)

print(f"Model '{registered_model_name}' version {result.version} moved to Production.")


Model 'Monthly_Sales_Model' version 1 moved to Production.


###### Loading the model and making prediction

In [22]:

#Set the tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:5000")

#Load the latest production model
model_name = "Monthly_Sales_Model"
model_stage = "Production"

model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_stage}")

print("Production model loaded successfully!")


Production model loaded successfully!


In [23]:
model

mlflow.pyfunc.loaded_model:
  artifact_path: file:///C:/Users/JAMES TECH/Documents/MLOPS Projects/Monthly Sales
    Prediction/mlruns/1/models/m-1dfec4927015458da0071b321dc0413e/artifacts
  flavor: mlflow.sklearn
  run_id: b9b83181034645c8a80db6e2138a727a

###### Making Predictions

In [27]:
import pandas as pd

#create a sample record 
X_new = pd.DataFrame({
    "Month": [10],
    "Year": [2025],
    "Marketing_Spend": [12000],
    "Store_Visitors": [450],
    "Discount": [15]
})

# Make prediction
pred = model.predict(X_new)
print("Predicted Sales:", pred[0])


Predicted Sales: 35752.65


In [32]:
#saving the model
import joblib
joblib.dump(model, "monthly_sales_model.joblib")


['monthly_sales_model.joblib']

In [33]:
#Save the scaler
joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

In [36]:
import os
print(os.getcwd())


C:\Users\JAMES TECH\Documents\MLOPS Projects\Monthly Sales Prediction


In [37]:
os.listdir()


['.ipynb_checkpoints',
 'mlruns',
 'monthly_sales_model.joblib',
 'preprocessing .ipynb',
 'retail_sales_dataset.csv',
 'scaler.joblib']