In [45]:
import os
import warnings
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
warnings.filterwarnings('ignore')

#### **Part I**: Demand simulation

#### Generate temperature, rainfall, and CPI data

In [48]:
def gen_macro_df(days, cpi_low, cpi_high, temp_low, temp_high):
    """Simulate a daily macro table (temperature, rainfall, CPI) ending today.

    Parameters
    ----------
    days : int
        Length of the look-back window in days. Both endpoints are kept, so
        the result has ``days + 1`` rows.
    cpi_low, cpi_high : float
        Endpoints of the linear CPI trend, spread over the month-ends that
        fall inside the window.
    temp_low, temp_high : float
        Endpoints of the linear temperature trend over the daily rows.

    Returns
    -------
    pandas.DataFrame
        One row per day with columns ``Date`` ('YYYY-MM-DD' string),
        ``Temperature``, ``Rainfall`` and ``CPI``.

    Notes
    -----
    Reseeds NumPy's global generator with 0, so the noise is reproducible;
    the dates themselves depend on when the function is called.
    """
    # Daily calendar from (today - days) through today, inclusive.
    end_date = datetime.today()
    start_date = end_date - timedelta(days=days)
    dates = pd.date_range(start=start_date, end=end_date, freq='D')

    np.random.seed(0)  # For reproducibility
    num_days = len(dates)

    # Temperature: linear trend temp_low -> temp_high plus N(0, 5) noise.
    temperature_base = np.linspace(temp_low, temp_high, num_days)
    temperature_noise = np.random.normal(0, 5, num_days)
    temperatures = np.round(temperature_base + temperature_noise, 2)

    # Rainfall: N(0, 100) draws with the negative ones clipped to zero.
    rainfalls = np.round(np.random.normal(0, 100, num_days), 2)
    rainfalls = np.where(rainfalls < 0, 0, rainfalls)

    # CPI: one value per month-end inside the window (linear trend
    # cpi_low -> cpi_high plus N(0, 1) noise). The offset object is used
    # instead of the 'M' string alias, which newer pandas versions deprecate.
    months = pd.date_range(start=start_date, end=end_date, freq=pd.offsets.MonthEnd())
    cpi_base = np.linspace(cpi_low, cpi_high, len(months))
    cpi_noise = np.random.normal(0, 1, len(months))
    cpi_values = np.round(cpi_base + cpi_noise, 2)

    df = pd.DataFrame({'Date': dates, 'Temperature': temperatures, 'Rainfall': rainfalls})

    # Spread the month-end values over the daily calendar: each day takes the
    # most recent month-end value, and the days before the first month-end are
    # back-filled from it. `.bfill()` replaces the deprecated
    # `fillna(method='bfill')` spelling.
    cpi_series = pd.Series(cpi_values, index=months)
    cpi_series = cpi_series.reindex(dates, method='ffill').bfill()
    df['CPI'] = cpi_series.values

    # Store dates as 'YYYY-MM-DD' strings.
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

    return df

df_macro = gen_macro_df(days=180, cpi_low=100, cpi_high=105, temp_low=40, temp_high=80)

#### generate customer data

In [49]:
def gen_cust_df(n_customers, income_mean, income_std):
    """Build a synthetic customer table with one row per customer.

    Parameters
    ----------
    n_customers : int
        Number of customers; IDs run from 1 to ``n_customers``.
    income_mean, income_std : float
        Mean and standard deviation of the normal distribution that incomes
        are drawn from.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID``, ``Female`` (0/1), ``Age`` (18-70 inclusive)
        and ``Income`` (rounded to 2 decimals).

    Notes
    -----
    Reseeds NumPy's global generator with 0 and then draws Female, Age and
    Income in that order.
    """
    np.random.seed(0)  # For reproducibility
    is_female = np.random.choice([0, 1], size=n_customers)
    ages = np.random.randint(18, 71, size=n_customers)  # upper bound is exclusive
    incomes = np.round(np.random.normal(income_mean, income_std, size=n_customers), 2)

    return pd.DataFrame({
        'CustomerID': np.arange(1, n_customers + 1),
        'Female': is_female,
        'Age': ages,
        'Income': incomes,
    })

df_customer = gen_cust_df(n_customers=100, income_mean=50000, income_std=30000)

#### generate order data

In [50]:
# Size of the simulated panel: every customer x product x day combination gets a row.
n_customers = 100
n_products = 4
n_days = 180

# One random quantity in [0, 9] per (customer, product, day) cell.
# No seed is set here, so the draw continues from the current global RNG state.
order_quantities = np.random.randint(0, 10, size=(n_customers, n_products, n_days))

# 1-based customer and product IDs.
customers = np.arange(1, n_customers + 1)
products = np.arange(1, n_products + 1)

# Daily calendar of exactly n_days dates ending today.
end_date = datetime.today()
start_date = end_date - timedelta(days=n_days - 1)
dates = pd.date_range(start=start_date, end=end_date, freq='D')

# 'ij' indexing keeps the grid axes in (customer, product, date) order,
# matching the shape of order_quantities.
customers_grid, products_grid, dates_grid = np.meshgrid(customers, products, dates, indexing='ij')

df_orders = pd.DataFrame({
    'CustomerID': customers_grid.ravel(),
    'ProductID': products_grid.ravel(),
    'Date': dates_grid.ravel(),
    'OrderQuantity': order_quantities.ravel(),
})

# Keep the random quantity only on the earliest date; every later day starts at 0.
is_first_day = df_orders['Date'].eq(df_orders['Date'].min())
df_orders['OrderQuantity'] = df_orders['OrderQuantity'].where(is_first_day, 0)

df_orders = df_orders.sort_values(by=['CustomerID', 'ProductID', 'Date'])

# Previous day's quantity within each customer/product series (NaN on the first day).
series_keys = ['CustomerID', 'ProductID']
df_orders['OrderQuantity_lag1'] = df_orders.groupby(series_keys)['OrderQuantity'].shift()

# Dates become 'YYYY-MM-DD' strings; Day is the 1-based dense rank of those strings.
df_orders['Date'] = df_orders['Date'].dt.strftime('%Y-%m-%d')
df_orders['Day'] = df_orders['Date'].rank(method='dense').astype(int)

df_orders.head(2)

Unnamed: 0,CustomerID,ProductID,Date,OrderQuantity,OrderQuantity_lag1,Day
0,1,1,2023-12-20,6,,1
1,1,1,2023-12-21,0,6.0,2


In [112]:
# Attach the daily macro series (by Date) and the customer attributes (by
# CustomerID) to every order row. 'intercept' is a constant regressor that
# pairs with the 'intercept' column of the coefficient table; 'next_product'
# is a placeholder column filled with a sentinel value.
df = (
    df_orders
    .merge(df_macro, on='Date', how='left')
    .merge(df_customer, on='CustomerID', how='left')
    .assign(intercept=1, next_product=-100000)
)
df.head(2)

Unnamed: 0,CustomerID,ProductID,Date,OrderQuantity,OrderQuantity_lag1,Day,Temperature,Rainfall,CPI,Female,Age,Income,intercept,next_product
0,1,1,2023-12-20,6,,1,42.22,0.0,100.93,0,60,59662.22,1,-100000
1,1,1,2023-12-21,0,6.0,2,45.34,0.0,100.93,0,60,59662.22,1,-100000


In [147]:
# Ground-truth coefficients of the simulated demand model.
# Each key is a regressor and each list holds one value per product, in the
# order of the ProductID index below (products 1, 2, 3, 4). The keys have to
# exist as columns of the merged order frame, because the simulation selects
# `coefficients.columns` from it before taking the dot product.
data = {
    'intercept': [0.017925, -0.928484, -0.881833, -0.747056],
    'OrderQuantity_lag1': [0.792873, 0.732268, 0.71704, 0.708182],
    'Female': [0.324909, 0.281898, 0.416095, 0.561980],
    'Age': [0.12468, 0.184523, 0.161964, 0.108463],
    'Income': [0.000046, 0.000025, 0.000036, 0.000026],
    'Temperature': [-0.067693, -0.115606, -0.033221, 0.031307],
    'Rainfall': [0.013351, -0.002803, 0.006147, 0.016702],
    'CPI': [0.004972, 0.002614, -0.002624, -0.003080]
}

# Row labels: one row of coefficients per product.
index = pd.Index([1, 2, 3, 4], name='ProductID')

# Coefficient table: rows = ProductID, columns = regressors.
coefficients = pd.DataFrame(data, index=index)
coefficients


Unnamed: 0_level_0,intercept,OrderQuantity_lag1,Female,Age,Income,Temperature,Rainfall,CPI
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.017925,0.792873,0.324909,0.12468,4.6e-05,-0.067693,0.013351,0.004972
2,-0.928484,0.732268,0.281898,0.184523,2.5e-05,-0.115606,-0.002803,0.002614
3,-0.881833,0.71704,0.416095,0.161964,3.6e-05,-0.033221,0.006147,-0.002624
4,-0.747056,0.708182,0.56198,0.108463,2.6e-05,0.031307,0.016702,-0.00308


In [148]:
def gen_next_product_quantity(df, n_days, prod, coefficients):
    """Roll the demand recursion forward one day at a time, in place.

    For each day from 2 to ``n_days`` the quantity is the dot product of that
    day's regressors with the coefficient row of ``prod``, plus one N(0, 4)
    draw from NumPy's global generator. The result is truncated to an int and
    negative values become 0. The value is then written to the following
    day's ``OrderQuantity_lag1`` so the next iteration sees it.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``Day``, ``OrderQuantity``, ``OrderQuantity_lag1`` and every
        column named in ``coefficients.columns``. Only the first row matching
        a given day drives the calculation, and its value is written to all
        rows of that day.
    n_days : int
        Last day (inclusive) to simulate.
    prod : hashable
        Label of the row of ``coefficients`` to use.
    coefficients : pandas.DataFrame
        Rows indexed by product, columns named after the regressors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place.
    """
    feature_cols = coefficients.columns
    is_prod = coefficients.index == prod

    for day in range(2, n_days + 1):
        today = df['Day'] == day
        tomorrow = df['Day'] == day + 1

        beta = coefficients[is_prod].values[0]
        signal = df.loc[today, feature_cols].dot(beta).values[0]
        demand = signal + np.random.normal(0, 4)

        # Quantities are non-negative integers: truncate, floor at zero.
        df.loc[today, 'OrderQuantity'] = int(demand) if demand >= 0 else 0

        # Feed today's realised quantity into tomorrow's lag regressor.
        df.loc[tomorrow, 'OrderQuantity_lag1'] = df.loc[today, 'OrderQuantity'].values[0]

    return df

# Simulate every customer/product series independently and stack the results.
products = df['ProductID'].unique()
custs = df['CustomerID'].unique()

# Simulation horizon comes from the data instead of a hard-coded 180.
sim_days = int(df['Day'].max())

df_res = []
for cust in custs:
    for prod in products:
        # The helper writes into the frame it is given. Pass an explicit copy
        # of the filtered rows so the writes land on an independent frame and
        # not on a slice of `df` (chained assignment).
        df_tmp = df[(df['CustomerID'] == cust) & (df['ProductID'] == prod)].copy()
        df_tmp = gen_next_product_quantity(df=df_tmp, n_days=sim_days, prod=prod, coefficients=coefficients)
        df_res.append(df_tmp)
stage1_data = pd.concat(df_res)
stage1_data.head()


Unnamed: 0,CustomerID,ProductID,Date,OrderQuantity,OrderQuantity_lag1,Day,Temperature,Rainfall,CPI,Female,Age,Income,intercept,next_product
0,1,1,2023-12-20,6,,1,42.22,0.0,100.93,0,60,59662.22,1,-100000
1,1,1,2023-12-21,10,6.0,2,45.34,0.0,100.93,0,60,59662.22,1,-100000
2,1,1,2023-12-22,14,10.0,3,51.87,62.52,100.93,0,60,59662.22,1,-100000
3,1,1,2023-12-23,21,14.0,4,50.23,0.0,100.93,0,60,59662.22,1,-100000
4,1,1,2023-12-24,24,21.0,5,36.22,0.0,100.93,0,60,59662.22,1,-100000


In [149]:
# Distribution of the simulated order quantities, one summary row per product.
stage1_data.groupby('ProductID')['OrderQuantity'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,18000.0,21.0565,12.840549,0.0,11.0,21.0,30.0,70.0
2,18000.0,8.825778,9.285053,0.0,0.0,6.0,15.0,52.0
3,18000.0,20.144722,11.173542,0.0,11.0,20.0,28.0,57.0
4,18000.0,24.638111,8.904106,0.0,18.0,25.0,31.0,58.0


In [150]:
# Remove the placeholder column before exporting. errors='ignore' keeps the
# cell re-runnable: a second run finds the column already gone instead of
# raising KeyError.
stage1_data = stage1_data.drop(columns=['next_product'], errors='ignore')

# to_csv does not create missing folders, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
stage1_data.to_csv('./data/stage1_data.csv', index=False)

In [151]:
# Spot-check the first rows of the frame that was just written to disk.
stage1_data.head()

Unnamed: 0,CustomerID,ProductID,Date,OrderQuantity,OrderQuantity_lag1,Day,Temperature,Rainfall,CPI,Female,Age,Income,intercept
0,1,1,2023-12-20,6,,1,42.22,0.0,100.93,0,60,59662.22,1
1,1,1,2023-12-21,10,6.0,2,45.34,0.0,100.93,0,60,59662.22,1
2,1,1,2023-12-22,14,10.0,3,51.87,62.52,100.93,0,60,59662.22,1
3,1,1,2023-12-23,21,14.0,4,50.23,0.0,100.93,0,60,59662.22,1
4,1,1,2023-12-24,24,21.0,5,36.22,0.0,100.93,0,60,59662.22,1


In [176]:
np.random.seed(0)  # reproducible inventory noise

# Collapse the customer-level panel to one row per (Date, ProductID):
# total quantity ordered, plus the mean of the macro and customer attributes.
stage1_data_store = stage1_data.groupby(['Date', 'ProductID']).agg({'OrderQuantity': 'sum',
                                                                    'Temperature': 'mean',
                                                                    'Rainfall': 'mean',
                                                                    'CPI': 'mean',
                                                                    'Female': 'mean',
                                                                    'Age': 'mean',
                                                                    'Income': 'mean'
}).reset_index()

# Inventory = demand plus integer-truncated N(0, 100) noise. The noise vector
# is sized from the aggregated frame itself rather than from n_days*n_products
# defined in an earlier cell, so the lengths always agree.
inventory_noise = np.random.normal(0, 100, len(stage1_data_store)).astype(int)
stage1_data_store['Inventory'] = stage1_data_store['OrderQuantity'] + inventory_noise

# 1 - demand/inventory: negative when demand exceeds inventory, positive when
# inventory exceeds demand.
# NOTE(review): nothing prevents Inventory from being zero or negative when the
# noise is strongly negative; the ratio is not meaningful there - confirm
# whether a floor is wanted.
stage1_data_store['Low_Stock_Risk'] = 1 - (stage1_data_store['OrderQuantity'] / stage1_data_store['Inventory'])

In [187]:
# Order rows by date, then by risk score within each date.
stage1_data_store = stage1_data_store.sort_values(by=['Date', 'Low_Stock_Risk'])

# Decile label over the whole column (not per date): '1' holds the lowest
# Low_Stock_Risk values, '10' the highest.
stage1_data_store['Low_Stock_Priority'] = pd.qcut(stage1_data_store['Low_Stock_Risk'], 10, labels=[f'{i}' for i in range(1, 11)])

stage1_data_store = stage1_data_store[['Date', 'ProductID', 'Low_Stock_Priority', 'Low_Stock_Risk', 'OrderQuantity', 'Inventory', 'Temperature', 'Rainfall', 'CPI',
       'Female', 'Age', 'Income']]

# to_csv does not create missing folders, so make sure ./data exists first.
os.makedirs('./data', exist_ok=True)
stage1_data_store.to_csv('./data/stage1_data_store.csv', index=False)

In [188]:
# Display the store-level table (the rendered output abbreviates the middle rows).
stage1_data_store

Unnamed: 0,Date,ProductID,Low_Stock_Priority,Low_Stock_Risk,OrderQuantity,Inventory,Temperature,Rainfall,CPI,Female,Age,Income
1,2023-12-20,2,10,0.076190,485,525,42.22,0.00,100.93,0.56,44.08,49946.1506
2,2023-12-20,3,10,0.184061,430,527,42.22,0.00,100.93,0.56,44.08,49946.1506
0,2023-12-20,1,10,0.287113,437,613,42.22,0.00,100.93,0.56,44.08,49946.1506
3,2023-12-20,4,10,0.331852,451,675,42.22,0.00,100.93,0.56,44.08,49946.1506
5,2023-12-21,2,1,-0.183712,625,528,45.34,0.00,100.93,0.56,44.08,49946.1506
...,...,...,...,...,...,...,...,...,...,...,...,...
714,2024-06-15,3,10,0.070513,1740,1872,79.87,69.85,104.61,0.56,44.08,49946.1506
717,2024-06-16,2,2,-0.094262,534,488,78.23,0.38,104.61,0.56,44.08,49946.1506
716,2024-06-16,1,3,-0.053156,1585,1505,78.23,0.38,104.61,0.56,44.08,49946.1506
719,2024-06-16,4,4,-0.022222,2530,2475,78.23,0.38,104.61,0.56,44.08,49946.1506
