In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [12]:
df = pd.read_csv('data.csv')
df.columns

Index(['Unnamed: 0', 'Date', 'ProductId', 'ProductPrice', 'LocationId',
       'SalesVolume', 'SalesValue', 'ItemName', 'ProductType',
       'ProductSubCategory', 'ProductCategory', 'Department', 'LocationType',
       'LocationPincode', 'LocationAddress', 'City', 'State', 'Country',
       'Month', 'Week', 'Quarter', 'WeekOfYear', 'Year', 'RowNumber',
       'SupplierId'],
      dtype='object')

In [23]:
def transform_and_group_by_grain(fileName:str, time:str="Month", product:str="", location:str="") -> tuple:
    df = pd.read_csv(fileName)
    
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
    
    group_columns = []

    if(time == 'Month'):
        group_columns.append('Year')
    elif(time == 'Quater'):
        group_columns.extend(['Year','Month'])
    elif(time == 'Week'):
        group_columns.extend(['Year','Month','Quarter'])
    
    group_columns.append(time)

    if product:
        group_columns.append(product)
    if location:
        group_columns.append(location)
    
    grouped_df = df.groupby(group_columns).agg({
        'SalesVolume': 'sum',  
    }).reset_index()
    grouped_df['Key'] = grouped_df.apply(lambda row: f"{row[product]}_{row[location]}", axis=1)  
    distinct_keys = grouped_df['Key'].unique().tolist()
    return (grouped_df,group_columns,distinct_keys)

In [14]:
def prepare_time_series_data(df, learning_time, skip_period, horizon,group_cols, distinct_keys):
    
    filtered_dfs = []
    for key in distinct_keys:
    
        filtered_df = df[df['Key'] == key].copy()
        filtered_df.sort_values(by=group_cols, inplace=True)
        filtered_df.reset_index(drop=True, inplace=True)
        
        for i in range(1, learning_time + 1):
            column_name_b = f"SalesVolume_lag_{i}"
            filtered_df[column_name_b] = filtered_df['SalesVolume'].shift(i)
    
        for i in range(1, horizon + 1):
            column_name_b = f"skip_period_{skip_period}_horizon_{i}_SalesVolume_lead"
            filtered_df[column_name_b] = filtered_df['SalesVolume'].shift(-(skip_period + i))
        
        filtered_df.dropna(inplace=True)
        filtered_dfs.append(filtered_df)
    concatenated_df = pd.concat(filtered_dfs, ignore_index=True)
    return concatenated_df


In [37]:
time_column = 'Month'  
product_column = 'ProductType'  
location_column = 'City'  
file_name = 'data.csv'
result_df,group_columns,distinct_keys = transform_and_group_by_grain(file_name,time_column, product_column, location_column)
print(len(distinct_keys))

168


In [38]:
result_df

Unnamed: 0,Year,Month,ProductType,City,SalesVolume,Key
0,2019,1,Apparel,Bangalore,966151,Apparel_Bangalore
1,2019,1,Apparel,Chennai,703495,Apparel_Chennai
2,2019,1,Apparel,Hyderabad,841488,Apparel_Hyderabad
3,2019,1,Apparel,Kolkata,792268,Apparel_Kolkata
4,2019,1,Apparel,Mumbai,839441,Apparel_Mumbai
...,...,...,...,...,...,...
8059,2022,12,Waste Disposal,Chennai,133960,Waste Disposal_Chennai
8060,2022,12,Waste Disposal,Hyderabad,160236,Waste Disposal_Hyderabad
8061,2022,12,Waste Disposal,Kolkata,150870,Waste Disposal_Kolkata
8062,2022,12,Waste Disposal,Mumbai,159847,Waste Disposal_Mumbai


In [39]:
learning_time = 3
skip_period = 3
horizon = 4
prepared_df = prepare_time_series_data(result_df, learning_time, skip_period, horizon,group_columns,distinct_keys)

In [40]:
model_dict = {}
horizon = 3
for key in distinct_keys:
    models = []
    for i in range(1,horizon+1):
        rf_model = RandomForestRegressor()
        models.append(rf_model)
    
    model_dict[key] = models

In [41]:
def train_model(df,time_group_col,models,horizon):
    X_lagged_list_train = df.filter(regex='SalesVolume_lag_').values
    X_additional_list_train = df[time_group_col].values
    X_train = np.column_stack((X_lagged_list_train, X_additional_list_train))

    y_columns = [f"skip_period_{skip_period}_horizon_{h}_SalesVolume_lead" for h in range(1, horizon+1)]
    Y_train = df[y_columns].values
    for i in range(horizon):
        y_horizon = Y_train[:, i]
        models[i].fit(X_train,y_horizon.ravel())

In [42]:
def test_model(df_test,time_group_col,models,horizon,predictions_df_dict,key):
    # df_test = df.iloc[split_index:]
    predictions_df = pd.DataFrame()
    X_lagged_list_test = df_test.filter(regex='SalesVolume_lag_').values
    X_additional_list_test = df_test[time_group_col].values
    X_test = np.column_stack((X_lagged_list_test, X_additional_list_test))

    y_columns = [f"skip_period_{skip_period}_horizon_{h}_SalesVolume_lead" for h in range(1, horizon+1)]
    Y_test = df_test[y_columns].values
    predictions_df[time_group_col] = df_test[time_group_col]

    for i in range(horizon):
        predictions = models[i].predict(X_test)
        predictions_df[f"Horizon_{i+1}_Predictions"] = predictions
        y_horizon = Y_test[:, i]
        predictions_df[f"Horizon_{i+1}_Actuals"] = y_horizon

    predictions_df_dict[key].append(predictions_df)

In [10]:
predictions_df_dict = {}
        
def train_test_split_fun1(minimum_test_cases,minimum_train_cases,minimum_set,distinct_keys):
    for key in distinct_keys:
        filtered_df = prepared_df[prepared_df['Key'] == key]
        increase = (len(filtered_df)-minimum_test_cases-minimum_train_cases)//minimum_set
        if(minimum_train_cases + increase * minimum_set + minimum_test_cases > len(filtered_df) or increase == 0):
            print("minimum_set not possible training testing through default mode")
            minimum_train_cases = int(len(filtered_df) * 0.5)
            minimum_test_cases = int(len(filtered_df) * 0.1)
            increase  = 2
        # print("increase",increase)
        print(filtered_df)
        predictions_df_dict[key] = []
        for i in range(minimum_train_cases,len(filtered_df)-minimum_test_cases+1,increase):
            df_train = filtered_df.iloc[:i]
            train_model(df_train,['Month', 'Year'],model_dict[key],3)
            # df_test = filtered_df.iloc[i:]
            # test_model(df_test,['Month', 'Year'],model_dict[key],3,predictions_df_dict,key)


train_test_split_fun1(15,20,4,distinct_keys)

minimum_set not possible training testing through default mode


In [15]:
predictions_df_dict['Apparel_Bangalore'][3]

Unnamed: 0,Month,Year,Horizon_1_Predictions,Horizon_1_Actuals,Horizon_2_Predictions,Horizon_2_Actuals,Horizon_3_Predictions,Horizon_3_Actuals
25,5,2021,1409990.56,1446892.0,1793760.76,1772994.0,2132555.27,2210892.0
26,6,2021,1777424.9,1772994.0,2092594.44,2210892.0,1807233.29,1805908.0
27,7,2021,2063789.21,2210892.0,1802100.71,1805908.0,1094568.74,1024227.0
28,8,2021,1780864.41,1805908.0,1110981.44,1024227.0,1050585.44,1010870.0
29,9,2021,1130391.15,1024227.0,1029186.7,1010870.0,1248703.49,1286658.0
30,10,2021,1009882.45,1010870.0,1226351.05,1286658.0,932908.77,930758.0
31,11,2021,1220906.89,1286658.0,941329.69,930758.0,821327.79,784162.0
32,12,2021,960800.99,930758.0,829919.6,784162.0,940687.12,972979.0
33,1,2022,810129.6,784162.0,972242.18,972979.0,1400238.53,1410579.0
34,2,2022,975242.05,972979.0,1406286.07,1410579.0,1711288.39,1801547.0


In [32]:
def metrics(horizon,prediction_df):
    predictions_col = []
    actuals_col = []
    for i in range(horizon):  # for Horizon 1 to Horizon 3
        predictions_col_ele = f'Horizon_{i+1}_Predictions'
        predictions_col.append(predictions_col_ele)
        actuals_col_ele = f'Horizon_{i+1}_Actuals'
        actuals_col.append(actuals_col_ele)

    actuals_col_sum = prediction_df[actuals_col].sum()
    predictions_col_sum = prediction_df[predictions_col].sum() 
    WMAPES = []
    for i in range(0, horizon):
        WMAPE = (abs(actuals_col_sum[i] - predictions_col_sum[i])/actuals_col_sum[i])*100
        WMAPES.append(WMAPE)
    return WMAPES
 

In [33]:
for key in distinct_keys:
    accuracy = [0]*horizon
    for df in predictions_df_dict[key]:
        temp = metrics(horizon,df)
        for i in range(horizon):
            accuracy[i] = accuracy[i] + temp[i] 
    accuracy = [i/horizon for i in accuracy] 
    model_dict[key] = [model_dict[key],accuracy]

In [34]:
model_dict['Apparel_Bangalore']

[[RandomForestRegressor(), RandomForestRegressor(), RandomForestRegressor()],
 [7.011474391191105, 14.094185717944042, 1.9612783446305404]]

In [None]:
start  = 4
end = 5
horizon_temp = end - start + 1
def train_test_split_fun2(start,end,time_column):
    for key in distinct_keys:
        filtered_df = prepared_df[prepared_df['Key'] == key]
        increase = (len(filtered_df)-minimum_test_cases-minimum_train_cases)//minimum_set
        if(minimum_train_cases + increase * minimum_set + minimum_test_cases > len(filtered_df) or increase == 0):
            print("minimum_set not possible training testing through default mode")
            minimum_train_cases = int(len(filtered_df) * 0.5)
            minimum_test_cases = int(len(filtered_df) * 0.1)
            increase  = 2
        # print("increase",increase)
        print(filtered_df)
        predictions_df_dict[key] = []
        for i in range(minimum_train_cases,len(filtered_df)-minimum_test_cases+1,increase):
            df_train = filtered_df.iloc[:i]
            train_model(df_train,['Month', 'Year'],model_dict[key],3)
            # df_test = filtered_df.iloc[i:]
            # test_model(df_test,['Month', 'Year'],model_dict[key],3,predictions_df_dict,key)
