<a href="https://colab.research.google.com/github/jitender2622/Multi_Echelon_Inventory_Demand_Forecasting_System/blob/main/Multi_Echelon_Inventory_Demand_Forecasting_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os

# --- Configuration ---
# NOTE(review): NUM_RECORDS is not read by any code visible in this notebook;
# the generated row count is len(dates) * len(STORES) * len(SKUS). Kept as-is
# in case something outside this view depends on it.
NUM_RECORDS = 100000
START_DATE = datetime(2016, 1, 1)
END_DATE = datetime(2019, 6, 1) # Data up to project hand-off
STORES = [101, 102, 103, 104, 105] # Pilot region stores
SKUS = ['AUTO_OIL_5W30', 'SNACK_CHIPS_LG', 'BEV_ENERGY_DRINK', 'AUTO_WIPER_BLADE', 'FRESH_SANDWICH_TURKEY']

# The generator draws from both the stdlib `random` module and `np.random`.
# Seed both so that Restart & Run All reproduces byte-identical CSV files
# (and therefore the same downstream model metrics).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print("Generating Industrial Dataset (This may take a moment)...")

# --- Helper Functions ---
def generate_dates(start, end):
    """Return every day from `start` through `end`, inclusive, one day apart.

    Args:
        start: First datetime of the range.
        end: Last datetime of the range.

    Returns:
        A list of datetimes beginning at `start` and stepping by one day while
        the value does not exceed `end`. Empty when `end` precedes `start`.
    """
    one_day = timedelta(days=1)
    calendar = []
    current = start
    # Walk forward a day at a time instead of pre-computing the day count.
    while current <= end:
        calendar.append(current)
        current = current + one_day
    return calendar

# Shared daily calendar used by every generated table.
dates = generate_dates(START_DATE, END_DATE)

# --- 1. Generate Holiday Data (External API Simulation) ---
# Fixed-date holidays keyed by (month, day).
us_holidays = {
    (1, 1): "New Year's Day",
    (7, 4): "Independence Day",
    (12, 25): "Christmas",
    (11, 24): "Thanksgiving" # Approximation for simulation
}

# One row per calendar day: a 0/1 holiday flag plus the holiday's name,
# with the literal string "None" on ordinary days.
month_day_keys = [(day.month, day.day) for day in dates]
holidays_data = {
    'Date': list(dates),
    'Is_Holiday': [1 if key in us_holidays else 0 for key in month_day_keys],
    'Holiday_Name': [us_holidays.get(key, "None") for key in month_day_keys],
}

df_holidays = pd.DataFrame(holidays_data)
df_holidays.to_csv('holiday_data.csv', index=False)
print("✔ holiday_data.csv created.")

# --- 2. Generate Weather Data (External API Simulation) ---
# One row per (day, store). Column lists are filled in lock-step below.
weather_data = {
    column: []
    for column in ('Date', 'Store_ID', 'Temperature_F', 'Precipitation_In')
}

for day in dates:
    # Two-level seasonality: June-August is warm, every other month is cold.
    mean_temp = 75 if day.month in [6, 7, 8] else 40
    for store_id in STORES:
        # Draw order matters for reproducibility: temperature first, then the
        # rain/no-rain coin flip, then (only on rainy days) the rainfall amount.
        temperature = np.random.normal(mean_temp, 10)
        is_rainy = random.random() > 0.7
        rainfall = np.random.exponential(0.1) if is_rainy else 0.0

        weather_data['Date'].append(day)
        weather_data['Store_ID'].append(store_id)
        weather_data['Temperature_F'].append(round(temperature, 1))
        weather_data['Precipitation_In'].append(round(rainfall, 2))

df_weather = pd.DataFrame(weather_data)
df_weather.to_csv('weather_data.csv', index=False)
print("✔ weather_data.csv created.")

# --- 3. Generate Historical Sales Data (SAP System Dump) ---
# Build a full (day, store, SKU) grid of demand, then blank out a random 5% of
# quantities to mimic the missing records described in the project brief.
sales_data = []

for day in dates:
    # Calendar-driven conditions depend only on the day, so evaluate them once.
    is_summer = day.month in [6, 7, 8]
    is_winter = day.month in [11, 12, 1]
    is_holiday = (day.month, day.day) in us_holidays

    for store_id in STORES:
        for sku in SKUS:
            # Base demand of 20 plus whichever uplifts apply.
            summer_drinks = 30 if (is_summer and 'BEV' in sku) else 0
            holiday_traffic = 50 if is_holiday else 0
            winter_auto = 15 if ('AUTO' in sku and is_winter) else 0
            base_demand = 20 + summer_drinks + holiday_traffic + winter_auto

            # Integer noise in [-5, 14]; quantities are floored at zero.
            noise = np.random.randint(-5, 15)
            qty = max(0, base_demand + noise)

            # 5% chance the SAP system failed to record data that day.
            if random.random() < 0.05:
                qty = np.nan

            sales_data.append([day, store_id, sku, qty])

df_sales = pd.DataFrame(sales_data, columns=['Date', 'Store_ID', 'SKU_ID', 'Qty_Sold'])
df_sales.to_csv('historical_sales_sap.csv', index=False)
print(f"✔ historical_sales_sap.csv created with {len(df_sales)} records.")
print("DATA GENERATION COMPLETE.")

Generating Industrial Dataset (This may take a moment)...
✔ holiday_data.csv created.
✔ weather_data.csv created.
✔ historical_sales_sap.csv created with 31200 records.
DATA GENERATION COMPLETE.


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt

# ==========================================
# 1. DATA INGESTION
# ==========================================
print("--- Loading Data ---")
# All three extracts share a 'Date' column that is parsed to datetimes on load.
df_sales, df_weather, df_holidays = (
    pd.read_csv(csv_path, parse_dates=['Date'])
    for csv_path in ('historical_sales_sap.csv', 'weather_data.csv', 'holiday_data.csv')
)

# Build the master table (the "Data Lake" approach):
#   * sales LEFT JOIN weather on Date + Store_ID
#   * then LEFT JOIN the holiday flag on Date (only Is_Holiday is carried over)
#   * finally order rows by Store_ID, SKU_ID, Date for time-series processing
holiday_flags = df_holidays[['Date', 'Is_Holiday']]
df_master = (
    df_sales
    .merge(df_weather, on=['Date', 'Store_ID'], how='left')
    .merge(holiday_flags, on='Date', how='left')
    .sort_values(by=['Store_ID', 'SKU_ID', 'Date'])
    .reset_index(drop=True)
)

# ==========================================
# 2. DATA CLEANING (Linear Interpolation)
# ==========================================
print("--- Cleaning Data (Interpolation) ---")
# Project Requirement: Fix missing data using linear interpolation
# We group by Store and SKU to ensure we don't interpolate across different products.
# Within each series:
#   1. linear interpolation fills the gaps,
#   2. bfill() covers missing values at the very start of a series, which
#      interpolation cannot fill because there is no earlier point,
#   3. ffill() is a final safety net for anything still missing.
# .bfill()/.ffill() replace the deprecated fillna(method=...) form, which emits
# a FutureWarning on current pandas.
df_master['Qty_Sold'] = df_master.groupby(['Store_ID', 'SKU_ID'])['Qty_Sold'].transform(
    lambda x: x.interpolate(method='linear').bfill().ffill()
)

# ==========================================
# 3. FEATURE ENGINEERING
# ==========================================
print("--- Feature Engineering ---")

# Every history-based feature is computed inside one (Store_ID, SKU_ID) series
# so values never mix across stores or products.
sales_by_series = df_master.groupby(['Store_ID', 'SKU_ID'])['Qty_Sold']

# Feature 1: Lags (Past sales)
# We use 7-day lag because travel patterns are weekly
df_master['Lag_7'] = sales_by_series.shift(7)

# Feature 2: Rolling Means (Trends)
# shift(1) moves each window back one row so it covers only PRIOR days.
# Without it the window ends on the current row, i.e. it contains the very
# Qty_Sold value the model is asked to predict (target leakage).
df_master['Rolling_Mean_7'] = sales_by_series.transform(lambda x: x.shift(1).rolling(window=7).mean())
df_master['Rolling_Mean_30'] = sales_by_series.transform(lambda x: x.shift(1).rolling(window=30).mean())

# Feature 3: Rolling Standard Deviation (Volatility)
# Project Requirement: Identify unstable products
# Same prior-days-only window as the rolling means.
df_master['Rolling_Std_7'] = sales_by_series.transform(lambda x: x.shift(1).rolling(window=7).std())

# Feature 4: Date Parts
df_master['DayOfWeek'] = df_master['Date'].dt.dayofweek
df_master['Month'] = df_master['Date'].dt.month
# dayofweek is 0=Monday ... 6=Sunday, so >= 5 marks Saturday and Sunday.
df_master['Is_Weekend'] = (df_master['DayOfWeek'] >= 5).astype(int)

# Drop rows with NaNs created by lagging/rolling (the first 30 days of each series)
df_model_data = df_master.dropna().reset_index(drop=True)

print(f"Final Dataset Shape: {df_model_data.shape}")

# ==========================================
# 4. MODELING (Random Forest)
# ==========================================
print("--- Training Random Forest Regressor ---")

# Model inputs and the column being predicted.
features = [
    'Lag_7',
    'Rolling_Mean_7',
    'Rolling_Mean_30',
    'Rolling_Std_7',
    'Temperature_F',
    'Precipitation_In',
    'Is_Holiday',
    'DayOfWeek',
    'Month',
]
target = 'Qty_Sold'

# Chronological hold-out: fit on everything before 2019, score on 2019 onward.
# A time-based split keeps later observations out of the training set.
split_date = pd.to_datetime('2019-01-01')
before_split = df_model_data['Date'] < split_date
from_split_on = df_model_data['Date'] >= split_date

train = df_model_data[before_split]
test = df_model_data[from_split_on]

X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

# 100 trees, depth capped at 20, fixed seed, all CPU cores.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Score the hold-out period.
predictions = rf_model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
print(f"Model Performance - MAE: {mae:.2f}")
print("Note: An MAE of ~5-8 is acceptable given the high volatility of retail travel stops.")

# Report the three features the forest leaned on most (explains the 'Why').
importances = rf_model.feature_importances_
indices = np.argsort(importances)[::-1]
print("\nTop 3 Important Features:")
for rank, feature_pos in enumerate(indices[:3], start=1):
    print(f"{rank}. {features[feature_pos]} ({importances[feature_pos]:.4f})")

# ==========================================
# 5. FORECASTING (Next 14 Days)
# ==========================================
print("--- Generating 14-Day Forecast for SAP ---")

# We need to simulate the "Next 14 Days" based on the last available data point.
# pd.Timedelta (rather than datetime.timedelta) is used so this cell depends
# only on names it imports itself.
last_date = df_model_data['Date'].max()
forecast_horizon = 14
future_dates = [last_date + pd.Timedelta(days=x) for x in range(1, forecast_horizon + 1)]

# (month, day) pairs flagged as holidays in the loaded holiday table. Deriving
# them from df_holidays avoids relying on a dict defined in another cell.
# NOTE(review): this treats every historical holiday as a fixed-date holiday,
# which matches how the holiday table in this notebook is generated.
holiday_dates = df_holidays.loc[df_holidays['Is_Holiday'] == 1, 'Date']
holiday_month_days = {(ts.month, ts.day) for ts in holiday_dates}

# Create a placeholder dataframe for future predictions
future_rows = []

# For simplicity in this demo, we will take the LAST known values of the rolling features
# and propagate them (In production, we would re-calculate rolling recursively)
last_known_data = df_model_data.groupby(['Store_ID', 'SKU_ID']).tail(1)

for d in future_dates:
    temp_df = last_known_data.copy()
    temp_df['Date'] = d
    temp_df['DayOfWeek'] = d.dayofweek
    temp_df['Month'] = d.month
    # Keep the weekend flag consistent with the new date (0=Monday ... 6=Sunday).
    temp_df['Is_Weekend'] = 1 if d.dayofweek >= 5 else 0

    # Simulate future weather (fetching from API in real life)
    temp_df['Temperature_F'] = 75 # Summer assumption
    temp_df['Precipitation_In'] = 0.0

    # Simulate future holiday
    temp_df['Is_Holiday'] = 1 if (d.month, d.day) in holiday_month_days else 0

    future_rows.append(temp_df)

# ignore_index gives each forecast row a unique label; without it the index of
# last_known_data would be repeated once per forecast day.
df_future = pd.concat(future_rows, ignore_index=True)

# Predict
df_future['Predicted_Demand'] = rf_model.predict(df_future[features])

# Round up because you can't sell 1.5 units
df_future['Predicted_Demand'] = np.ceil(df_future['Predicted_Demand']).astype(int)

# ==========================================
# 6. OUTPUT GENERATION (SAP INTEGRATION)
# ==========================================
def _safety_buffer(rolling_std):
    """Return the extra units to order for one row given its 7-day rolling std.

    Rows whose volatility exceeds 5 get a buffer of 1.5x that volatility,
    truncated to an integer; every other row (including a missing value)
    gets no buffer.
    """
    if rolling_std > 5:
        return int(rolling_std * 1.5)
    return 0


# Columns in the layout expected for SAP ingestion.
sap_columns = ['Date', 'Store_ID', 'SKU_ID', 'Predicted_Demand']
sap_output = df_future[sap_columns].copy()

# Safety stock (Project Requirement: reduce stockouts): pad volatile items.
sap_output['Safety_Buffer'] = df_future['Rolling_Std_7'].map(_safety_buffer)
sap_output['Final_Order_Qty'] = sap_output['Predicted_Demand'] + sap_output['Safety_Buffer']

sap_output.to_csv('final_sap_orders.csv', index=False)
print("✔ final_sap_orders.csv created.")
print("System Run Complete.")

--- Loading Data ---
--- Cleaning Data (Interpolation) ---
--- Feature Engineering ---


  lambda x: x.interpolate(method='linear').fillna(method='bfill').fillna(method='ffill')


Final Dataset Shape: (30475, 14)
--- Training Random Forest Regressor ---
Model Performance - MAE: 4.48
Note: An MAE of ~5-8 is acceptable given the high volatility of retail travel stops.

Top 3 Important Features:
1. Rolling_Mean_7 (0.6108)
2. Is_Holiday (0.1933)
3. Rolling_Std_7 (0.0648)
--- Generating 14-Day Forecast for SAP ---
✔ final_sap_orders.csv created.
System Run Complete.


In [None]:
%%writefile Dockerfile
# Use an official Python runtime as a parent image (Version from 2019)
# NOTE(review): Python 3.7 is past end-of-life; consider moving to a supported
# tag once the pinned dependencies have been validated against it.
FROM python:3.7-slim

# Set the working directory to /app
WORKDIR /app

# Copy only the dependency manifest first so the (slow) pip layer below stays
# cached until requirements.txt itself changes
COPY requirements.txt /app/requirements.txt

# Install any needed packages specified in requirements.txt
# --no-cache-dir keeps pip's download cache out of the image layer.
# The former --trusted-host flag was dropped: it relaxes HTTPS verification
# for the named host and is not needed to install from the default index.
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the current directory contents into the container at /app
COPY . /app

# Make port 80 available to the world outside this container
EXPOSE 80

# Run model_pipeline.py when the container launches
CMD ["python", "model_pipeline.py"]

Writing Dockerfile
