In [306]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

In [307]:
n_customers = 100
n_products = 10
n_days = 180
product_names = ['Avocado','Banana','Beef','Cauliflower','Egg','Milk','Shrimp','Tomato','Yogurt','Zucchini']

end_date = datetime.today() + timedelta(days=14)
start_date = end_date - timedelta(days=n_days-1)  # Approximate 6 months as 180 days
dates = pd.date_range(start=start_date, end=end_date, freq='D')

#### **Part I** demand simulation

#### generate temp and CPI data

In [308]:
def gen_macro_df(days, cpi_low, cpi_high, temp_low, temp_high, mcpi_low, mcpi_high):    
    # Step 1: Generate dates spanning 6 months to today
    
    # Step 2: Generate temperature values with increasing trend and random noise
    np.random.seed(0)  # For reproducibility
    num_days = len(dates)
    temperature_base = np.linspace(temp_low, temp_high, num_days)  # Base temperature increasing linearly from 20 to 30
    temperature_noise = np.random.normal(0, 5, num_days)  # Adding random noise with mean 0 and std deviation 2
    temperatures = np.round(temperature_base + temperature_noise, 2)
    
    rainfalls = np.round(np.random.normal(0, 100, num_days), 2) 
    rainfalls = np.where(rainfalls<0, 0, rainfalls)

    # Step 3: Generate CPI values that vary each month with random noise
    months = pd.date_range(start=start_date, end=end_date, freq='M')
    cpi_base = np.linspace(cpi_low, cpi_high, len(months))  # Base CPI increasing linearly from 100 to 105
    cpi_noise = np.random.normal(0, 1, len(months))  # Adding random noise with mean 0 and std deviation 1
    cpi_values = np.round(cpi_base + cpi_noise, 2)

    # Step 4: Generate CMPI values that vary each month with random noise
    mcpi_base = np.linspace(mcpi_low, mcpi_high, len(months))  # Base MCPI increasing linearly from 100 to 105
    mcpi_noise = np.random.normal(0, 1, len(months))  # Adding random noise with mean 0 and std deviation 1
    mcpi_values = np.round(mcpi_base + mcpi_noise, 2)
    
    # Create a DataFrame with dates and temperatures
    df = pd.DataFrame({'Date': dates, 'Temperature': temperatures, 'Rainfall': rainfalls})
    
    # Assign CPI values to the first day of each month, and forward fill the rest of the days in the month
    cpi_series = pd.Series(cpi_values, index=months)
    mcpi_series = pd.Series(mcpi_values, index=months)
    
    # Ensure the CPI series starts from the first date in the DataFrame
    cpi_series = cpi_series.reindex(dates, method='ffill').fillna(method='bfill')
    mcpi_series = mcpi_series.reindex(dates, method='ffill').fillna(method='bfill')
    
    # Assign the CPI values to the DataFrame
    df['CPI'] = cpi_series.values
    df['MCPI'] = cpi_series.values
    
    # Fix the date format to 'YYYY-MM-DD'
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

    return df
    
df_macro = gen_macro_df(days=180, cpi_low=100, cpi_high=105, temp_low=40, temp_high=80, mcpi_low=50, mcpi_high=250)

#### generate customer data

In [309]:
def gen_cust_df(n_customers, income_mean, income_std):    
    # Number of customers    
    # Generate customer IDs
    customer_ids = np.arange(1, n_customers + 1)
    
    # Generate random values for female
    np.random.seed(0)  # For reproducibility
    female = np.random.choice([0, 1], size=n_customers)
    
    # Generate random values for age (between 18 and 70)
    age = np.random.randint(18, 71, size=n_customers)
    
    # Generate random values for income (normal distribution with mean 50k and std deviation 15k)
    income = np.random.normal(income_mean, income_std, size=n_customers)
    income = np.round(income, 2)  # Round to 2 decimal places
    
    # Create a DataFrame with the generated data
    data = pd.DataFrame({
        'CustomerID': customer_ids,
        'Female': female,
        'Age': age,
        'Income': income
    })
    
    return data

df_customer = gen_cust_df(n_customers=100, income_mean=50000, income_std=30000)

#### generate order data

In [310]:
# Create order quantities for date, customer, and product combination
order_quantities = np.random.randint(0, 10, size=(n_customers, n_products, n_days))

# Create a meshgrid for customer IDs, product IDs, and days
customers = np.arange(n_customers) + 1
products = np.arange(n_products) + 1
# days = np.arange(n_days)

customers_grid, products_grid, dates_grid = np.meshgrid(customers, products, dates, indexing='ij')

df_orders = pd.DataFrame({
    'CustomerID': customers_grid.flatten(),
    'ProductID': products_grid.flatten(),
    'Date': dates_grid.flatten(),
    'OrderQuantity': order_quantities.flatten()
})

df_orders['OrderQuantity'] = np.where(df_orders['Date']==df_orders['Date'].min(), df_orders['OrderQuantity'], 0)
df_orders.sort_values(by=['CustomerID', 'ProductID', 'Date'], inplace=True)
df_orders['OrderQuantity_lag1'] = df_orders.groupby(['CustomerID', 'ProductID'])['OrderQuantity'].shift(1)
df_orders['Date'] = df_orders['Date'].dt.strftime('%Y-%m-%d')
df_orders['Day'] = df_orders['Date'].rank(method='dense').astype(int)

product_map = {i+1: name for i, name in enumerate(product_names)}
df_orders['Product'] = df_orders['ProductID'].map(product_map)

df_orders.tail(2)

Unnamed: 0,CustomerID,ProductID,Date,OrderQuantity,OrderQuantity_lag1,Day,Product
179998,100,10,2024-07-02,0,0.0,179,Zucchini
179999,100,10,2024-07-03,0,0.0,180,Zucchini


In [311]:
df = df_orders.merge(df_macro, on='Date', how='left')
df = df.merge(df_customer, on='CustomerID', how='left')
df['intercept']=1
df.head(2)

Unnamed: 0,CustomerID,ProductID,Date,OrderQuantity,OrderQuantity_lag1,Day,Product,Temperature,Rainfall,CPI,MCPI,Female,Age,Income,intercept
0,1,1,2024-01-06,6,,1,Avocado,48.82,0.0,100.7,100.7,0,60,59662.22,1
1,1,1,2024-01-07,0,6.0,2,Avocado,42.22,0.0,100.7,100.7,0,60,59662.22,1


In [312]:
# define the coefficient for ground truth
data = {
    'intercept': np.random.uniform(-0.5, -0.2, n_products),
    'OrderQuantity_lag1': np.random.uniform(0.5, 0.7, n_products),
    'Female': np.random.uniform(0.013, 0.025, n_products),
    'Age': np.random.uniform(0.036, 0.08, n_products),
    'Income': np.random.uniform(0.00003, 0.00004, n_products),
    'Temperature': np.random.uniform(0.001, 0.002, n_products),
    'Rainfall': np.random.uniform(-0.2, -0.1, n_products),
    'CPI': np.random.uniform(0.005, 0.01, n_products),
    'MCPI': np.random.uniform(-0.005, 0.01, n_products)
}

# Index for the DataFrame
index = pd.Index([i+1 for i in range(n_products)], name='ProductID')

# Create the DataFrame
coefficients = pd.DataFrame(data, index=index)
coefficients


Unnamed: 0_level_0,intercept,OrderQuantity_lag1,Female,Age,Income,Temperature,Rainfall,CPI,MCPI
ProductID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,-0.259576,0.517365,0.021763,0.052175,3.3e-05,0.001801,-0.131333,0.005064,-0.001028
2,-0.477298,0.692852,0.017565,0.05003,3e-05,0.001519,-0.193946,0.008984,-0.001595
3,-0.344942,0.647552,0.019914,0.048881,3.6e-05,0.001492,-0.195463,0.005825,0.006756
4,-0.274048,0.574454,0.017123,0.041404,3.4e-05,0.001364,-0.163815,0.006688,-0.004201
5,-0.225346,0.506465,0.021357,0.037098,3.4e-05,0.001103,-0.154124,0.005443,-0.000274
6,-0.456719,0.54789,0.015271,0.038349,3.6e-05,0.001873,-0.121872,0.006925,0.008576
7,-0.344712,0.581481,0.023974,0.077235,3.2e-05,0.001294,-0.114542,0.006609,0.009866
8,-0.476151,0.626464,0.017608,0.043839,3.7e-05,0.001606,-0.165988,0.008383,0.000346
9,-0.356108,0.693245,0.020947,0.052811,3.5e-05,0.001761,-0.172299,0.009903,0.002367
10,-0.4908,0.542169,0.020624,0.041053,3.1e-05,0.001501,-0.132162,0.009238,0.000484


In [313]:
def gen_next_product_quantity(df, n_days, prod, coefficients):
    for day in range(2, n_days+1):
        rows_to_mul = df['Day'] == day
        next_product = df.loc[rows_to_mul,coefficients.columns].dot(coefficients[coefficients.index==prod].values[0]).values[0]\
        + np.random.normal(0,4)
        if next_product>=0:
            df.loc[rows_to_mul, 'OrderQuantity'] = int(next_product)
        else:
            df.loc[rows_to_mul, 'OrderQuantity'] = 0
        rows_to_update = df['Day'] == day+1
        df.loc[rows_to_update, 'OrderQuantity_lag1']=df.loc[rows_to_mul, 'OrderQuantity'].values[0]
    return df

products = df['ProductID'].unique()
custs = df['CustomerID'].unique()
df_res = []
for cust in custs:
    for prod in products:
        df_tmp = df[(df['CustomerID']==cust)&(df['ProductID']==prod)]
        df_tmp = gen_next_product_quantity(df=df_tmp, n_days=180, prod=prod, coefficients=coefficients)
        df_res.append(df_tmp)
stage1_data = pd.concat(df_res)
stage1_data.head()


KeyboardInterrupt: 

In [None]:
stage1_data.groupby(['Product'])['OrderQuantity'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Avocado,18000.0,3.870167,4.448579,0.0,0.0,2.0,7.0,26.0
Banana,18000.0,5.832222,6.019494,0.0,0.0,5.0,10.0,34.0
Beef,18000.0,6.413056,5.859997,0.0,0.0,6.0,11.0,30.0
Cauliflower,18000.0,7.676833,6.803583,0.0,1.0,7.0,12.0,35.0
Egg,18000.0,5.1165,5.315966,0.0,0.0,4.0,9.0,28.0
Milk,18000.0,4.673222,4.965663,0.0,0.0,3.0,8.0,26.0
Shrimp,18000.0,6.043889,5.827174,0.0,0.0,5.0,10.0,32.0
Tomato,18000.0,5.983389,5.829542,0.0,0.0,5.0,10.0,34.0
Yogurt,18000.0,6.454333,6.389198,0.0,0.0,5.0,11.0,35.0
Zucchini,18000.0,8.2015,7.031973,0.0,1.0,8.0,13.0,35.0


In [None]:
stage1_data.to_csv('./data/stage1_data.csv', index=False)

In [314]:
np.random.seed(0)
stage1_data_store = stage1_data.groupby(['Date', 'ProductID', 'Product']).agg({'OrderQuantity': 'sum',
                                                                    'Temperature': 'mean',
                                                                    'Rainfall': 'mean',
                                                                    'CPI': 'mean',
                                                                    'MCPI': 'mean',
                                                                    'Female': 'mean',
                                                                    'Age': 'mean',
                                                                    'Income': 'mean'
}).reset_index()
stage1_data_store['Inventory'] = stage1_data_store['OrderQuantity'] + np.random.normal(1, 100, n_days*n_products).astype(int)
stage1_data_store['Low_Stock_Risk'] = 1 - (stage1_data_store['OrderQuantity'] / stage1_data_store['Inventory'])

In [315]:
stage1_data_store.sort_values(by=['Date', 'Low_Stock_Risk'], inplace=True)
stage1_data_store['Low_Stock_Priority'] = pd.qcut(stage1_data_store['Low_Stock_Risk'], 10, labels=[f'{i}' for i in range(1, 11)])

stage1_data_store = stage1_data_store[['Date', 'ProductID', 'Product', 'Low_Stock_Priority', 'Low_Stock_Risk', 'OrderQuantity', 'Inventory', 'Temperature', 'Rainfall', 'CPI', 'MCPI',
       'Female', 'Age', 'Income']]
stage1_data_store.to_csv('./data/stage1_data_store.csv', index=False)

In [316]:
stage1_data_store.groupby(['Product']).OrderQuantity.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Avocado,180.0,387.016667,265.97176,0.0,105.5,424.5,614.75,824.0
Banana,180.0,583.222222,417.638888,0.0,165.0,598.5,903.25,1392.0
Beef,180.0,641.305556,374.30181,0.0,365.25,714.0,946.25,1220.0
Cauliflower,180.0,767.683333,446.119475,0.0,483.5,860.5,1140.25,1481.0
Egg,180.0,511.65,330.569941,0.0,235.5,529.5,796.25,1112.0
Milk,180.0,467.322222,332.669945,0.0,84.5,517.5,745.25,993.0
Shrimp,180.0,604.388889,399.23062,0.0,213.75,646.0,923.75,1247.0
Tomato,180.0,598.338889,412.363228,0.0,158.25,642.5,919.5,1275.0
Yogurt,180.0,645.433333,451.51795,0.0,235.75,680.0,1017.25,1488.0
Zucchini,180.0,820.15,482.413364,0.0,508.0,936.0,1207.75,1640.0
