In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from pmdarima import auto_arima
import warnings


In [3]:
# Load dataset
df = pd.read_csv('../raw_data/cleaned_merge_df_top10.csv')
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

In [4]:
df.head()

Unnamed: 0_level_0,id,item_id,dept_id,cat_id,store_id,state_id,sales,weekday,wday,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-29,FOODS_2_197_CA_1_validation,FOODS_2_197,FOODS_2,FOODS,CA_1,CA,38,Saturday,1,0,0,0,0,0,0,0,2.98
2011-01-29,FOODS_3_080_CA_1_validation,FOODS_3_080,FOODS_3,FOODS,CA_1,CA,33,Saturday,1,0,0,0,0,0,0,0,1.48
2011-01-29,FOODS_3_090_CA_1_validation,FOODS_3_090,FOODS_3,FOODS,CA_1,CA,107,Saturday,1,0,0,0,0,0,0,0,1.25
2011-01-29,FOODS_3_120_CA_1_validation,FOODS_3_120,FOODS_3,FOODS,CA_1,CA,0,Saturday,1,0,0,0,0,0,0,0,0.0
2011-01-29,FOODS_3_252_CA_1_validation,FOODS_3_252,FOODS_3,FOODS,CA_1,CA,19,Saturday,1,0,0,0,0,0,0,0,1.48


In [5]:

# Load dataset
df = pd.read_csv('../raw_data/cleaned_merge_df_top10.csv')
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Prepare results storage
results = {}

# Assuming there is a id column to iterate over and 'sales' as the target
for id in df['id'].unique():
    print(f"Processing {id}")
    series = df[df['id'] == id]['sales']

    # Initialize time-series cross-validator
    tscv = TimeSeriesSplit(n_splits=5)

    # Lists to store MAE scores for comparison
    mae_scores_arima = []
    mae_scores_rf = []

    # Perform cross-validation
    for train_idx, test_idx in tscv.split(series):
        train_data, test_data = series.iloc[train_idx], series.iloc[test_idx]

        # Auto-ARIMA model
        arima_model = auto_arima(train_data, seasonal=True, m=7, suppress_warnings=True, stepwise=True)
        arima_predictions = arima_model.predict(n_periods=len(test_data))
        mae_arima = mean_absolute_error(test_data, arima_predictions)
        mae_scores_arima.append(mae_arima)

        # Random Forest model with lag features
        X_train = pd.DataFrame({
            'lag1': train_data.shift(1),
            'lag2': train_data.shift(2),
            'lag3': train_data.shift(3)
        }).dropna()
        y_train = train_data.iloc[3:]
        rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
        rf_model.fit(X_train, y_train)

        X_test = pd.DataFrame({
            'lag1': test_data.shift(1),
            'lag2': test_data.shift(2),
            'lag3': test_data.shift(3)
        }).dropna()
        y_test_aligned = test_data.iloc[3:]  # Align y_test with the available X_test
        rf_predictions = rf_model.predict(X_test)
        mae_rf = mean_absolute_error(y_test_aligned, rf_predictions)
        mae_scores_rf.append(mae_rf)

        print(f"Fold completed. ARIMA MAE: {mae_arima}, RF MAE: {mae_rf}")

    # Store results
    results[id] = {
        'ARIMA_MAE': np.mean(mae_scores_arima),
        'RF_MAE': np.mean(mae_scores_rf)
    }

# Print overall results
for product, scores in results.items():
    print(f"Product ID {product}: ARIMA MAE = {scores['ARIMA_MAE']}, RF MAE = {scores['RF_MAE']}")

# Optionally, summarize all results
average_arima_mae = np.mean([res['ARIMA_MAE'] for res in results.values()])
average_rf_mae = np.mean([res['RF_MAE'] for res in results.values()])
print(f"Average ARIMA MAE across all products: {average_arima_mae}")
print(f"Average RF MAE across all products: {average_rf_mae}")


Processing FOODS_2_197_CA_1_validation
Fold completed. ARIMA MAE: 11.455723560225175, RF MAE: 10.341722499533716
Fold completed. ARIMA MAE: 12.588663154758894, RF MAE: 9.201569681576464
Fold completed. ARIMA MAE: 20.004147728214548, RF MAE: 9.267003976907906
Fold completed. ARIMA MAE: 6.966593783212147, RF MAE: 8.614751181525243
Fold completed. ARIMA MAE: 5.930905216998448, RF MAE: 7.761411238383719
Processing FOODS_3_080_CA_1_validation
Fold completed. ARIMA MAE: 5.770661692250511, RF MAE: 6.933755555555555
Fold completed. ARIMA MAE: 6.384335172725188, RF MAE: 6.4566975812547245
Fold completed. ARIMA MAE: 5.856336763401625, RF MAE: 6.854338926681784
Fold completed. ARIMA MAE: 4.288200259923211, RF MAE: 6.127151020408163
Fold completed. ARIMA MAE: 5.463257695939435, RF MAE: 6.954212849584278
Processing FOODS_3_090_CA_1_validation
Fold completed. ARIMA MAE: 39.58494074682603, RF MAE: 42.20307941957396
Fold completed. ARIMA MAE: 59.03203725879767, RF MAE: 36.44437696626981
Fold completed



Fold completed. ARIMA MAE: 120.95479299693288, RF MAE: 17.528479436783226
Fold completed. ARIMA MAE: 26.568387045797028, RF MAE: 19.351807883178054
Fold completed. ARIMA MAE: 21.07701740580339, RF MAE: 22.074143210602806
Fold completed. ARIMA MAE: 38.566311396061394, RF MAE: 19.009118752029888
Processing FOODS_3_252_CA_1_validation
Fold completed. ARIMA MAE: 10.584738709656099, RF MAE: 13.01434814814815
Fold completed. ARIMA MAE: 13.105612751051275, RF MAE: 14.025446560846559
Fold completed. ARIMA MAE: 38.45116147652414, RF MAE: 12.326678306878307
Fold completed. ARIMA MAE: 27.506876869128366, RF MAE: 12.326512698412698
Fold completed. ARIMA MAE: 12.18900183742526, RF MAE: 11.532054572940288
Processing FOODS_3_555_CA_1_validation
Fold completed. ARIMA MAE: 17.423916727705627, RF MAE: 6.173224111866969
Fold completed. ARIMA MAE: 5.429391771068531, RF MAE: 6.232328647014361
Fold completed. ARIMA MAE: 5.281615837480839, RF MAE: 6.104027538422776
Fold completed. ARIMA MAE: 5.42156453770688

Traceback:
Traceback (most recent call last):
  File "/Users/julietta/.pyenv/versions/3.10.6/envs/walmart/lib/python3.10/site-packages/pmdarima/arima/_auto_solvers.py", line 508, in _fit_candidate_model
    fit.fit(y, X=X, **fit_params)
  File "/Users/julietta/.pyenv/versions/3.10.6/envs/walmart/lib/python3.10/site-packages/pmdarima/arima/arima.py", line 603, in fit
    self._fit(y, X, **fit_args)
  File "/Users/julietta/.pyenv/versions/3.10.6/envs/walmart/lib/python3.10/site-packages/pmdarima/arima/arima.py", line 524, in _fit
    fit, self.arima_res_ = _fit_wrapper()
  File "/Users/julietta/.pyenv/versions/3.10.6/envs/walmart/lib/python3.10/site-packages/pmdarima/arima/arima.py", line 510, in _fit_wrapper
    fitted = arima.fit(
  File "/Users/julietta/.pyenv/versions/3.10.6/envs/walmart/lib/python3.10/site-packages/statsmodels/tsa/statespace/mlemodel.py", line 703, in fit
    mlefit = super().fit(start_params, method=method,
  File "/Users/julietta/.pyenv/versions/3.10.6/envs/walmar

Fold completed. ARIMA MAE: 4.787165930330738, RF MAE: 5.527988107835727
Processing FOODS_3_586_CA_1_validation
Fold completed. ARIMA MAE: 15.332695430333656, RF MAE: 13.492632275132275
Fold completed. ARIMA MAE: 13.246389778301284, RF MAE: 11.855555555555556
Fold completed. ARIMA MAE: 11.178444201409338, RF MAE: 11.327628571428571
Fold completed. ARIMA MAE: 26.926922600832157, RF MAE: 11.835697959183674
Fold completed. ARIMA MAE: 11.36623325324494, RF MAE: 11.479730687830687
Processing FOODS_3_587_CA_1_validation
Fold completed. ARIMA MAE: 21.283304525090234, RF MAE: 11.40168253968254
Fold completed. ARIMA MAE: 23.438018660117756, RF MAE: 12.080049886621312
Fold completed. ARIMA MAE: 24.228789512960457, RF MAE: 6.66794962639945
Fold completed. ARIMA MAE: 19.283835717804358, RF MAE: 7.684779807447977
Fold completed. ARIMA MAE: 17.47381967415282, RF MAE: 9.683838246409675
Processing FOODS_3_714_CA_1_validation
Fold completed. ARIMA MAE: 14.476563094307803, RF MAE: 12.74560574452003
Fold 

In [9]:
from dask.distributed import Client
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error
import pickle
import warnings

In [12]:
# Start a Dask client
client = Client()

# Define the specifics of the custom time-series cross-validation
num_splits = 5
days_per_split = 28

# Dictionary to store results for each product id
product_results = {}
auto_arima_models = {}

# Function to perform ARIMA modeling and calculate MAE
def perform_auto_arima(train_data, test_data):
    y_train = train_data["sales"]
    y_test = test_data["sales"]

    # Fit ARIMA model on the training data
    model = auto_arima(y_train, start_p=0, start_q=0, max_p=5, max_q=5, d=1,
                       seasonal=True, trace=False, error_action='ignore', 
                       suppress_warnings=True, stepwise=True)
    
    # Predict on the test data
    predictions = model.predict(n_periods=len(y_test))

    # Calculate MAE
    mae = mean_absolute_error(y_test, predictions)
    
    return model, mae

# Iterate over each unique product series identified by id
for product_id in df['id'].unique():
    print(f"Analyzing product: {id}")
    product_data = df[df['id'] == id]

    # Check if enough data is available
    if len(product_data) < days_per_split * num_splits:
        print(f"Not enough data for product {id} to perform {num_splits} splits with {days_per_split} days each.")
        continue
    
    mae_scores = []
    
    # Create data slices for 5-fold validation with each test slice being exactly 28 days
    for i in range(1, num_splits + 1):
        train_data = product_data.iloc[:-days_per_split * i]
        test_data = product_data.iloc[-days_per_split * i: -days_per_split * (i - 1) if i > 1 else None]

        # Perform Auto ARIMA in parallel
        future = client.submit(perform_auto_arima, train_data, test_data)
        model, mae = client.gather(future)

        # Store results
        mae_scores.append(mae)

    # Calculate average MAE and store the model
    average_mae = np.mean(mae_scores)
    product_results[id] = average_mae
    auto_arima_models[id] = model
    print(f'Average Mean Absolute Error for {id}: {average_mae}')

    # Save the model
    #filename = f'/path_to_save_models/{id}_model.pkl'
    #with open(filename, 'wb') as f:
        #pickle.dump(model, f)

# Print average MAE across all products
average_mae_across_products = np.mean(list(product_results.values()))
print(f'Average Mean Absolute Error across all products: {average_mae_across_products}')

# Handle possible convergence warnings
warnings.filterwarnings("ignore")


Perhaps you already have a cluster running?
Hosting the HTTP server on port 59884 instead


Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5194842680202996
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_808_CA_1_validation: 3.5

In [13]:
from dask.distributed import Client
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import pickle
import warnings


In [16]:

# Define the specifics of the custom time-series cross-validation
num_splits = 5
days_per_split = 28

# Dictionary to store results for each product id
product_results = {}
random_forest_models = {}

# Function to perform Random Forest modeling and calculate MAE
def perform_random_forest(train_data, test_data):
    # Prepare features by creating lags
    y_train = train_data["sales"]
    y_test = test_data["sales"]
    X_train = pd.DataFrame({
        'lag1': y_train.shift(1),
        'lag2': y_train.shift(2),
        'lag3': y_train.shift(3)
    }).dropna()
    y_train = y_train.iloc[3:]  # Align target with available features
    
    X_test = pd.DataFrame({
        'lag1': y_test.shift(1),
        'lag2': y_test.shift(2),
        'lag3': y_test.shift(3)
    }).dropna()
    y_test = y_test.iloc[3:]  # Align target with available features

    # Fit Random Forest model on the training data
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predict on the test data
    predictions = model.predict(X_test)

    # Calculate MAE
    mae = mean_absolute_error(y_test, predictions)
    
    return model, mae

# Iterate over each unique product series identified by id
for product_id in df['id'].unique():
    print(f"Analyzing product: {id}")
    product_data = df[df['id'] == id]

    # Check if enough data is available
    if len(product_data) < days_per_split * num_splits:
        print(f"Not enough data for product {id} to perform {num_splits} splits with {days_per_split} days each.")
        continue
    
    mae_scores = []
    
# Create data slices for 5-fold validation with each test slice being exactly 28 days
    for i in range(1, num_splits + 1):
        train_data = product_data.iloc[:-days_per_split * i]
        test_data = product_data.iloc[-days_per_split * i: -days_per_split * (i - 1) if i > 1 else None]

        # Perform Random Forest modeling in parallel
        future = client.submit(perform_random_forest, train_data, test_data)
        model, mae = client.gather(future)

        # Store results
        mae_scores.append(mae)

    # Calculate average MAE and store the model
    average_mae = np.mean(mae_scores)
    product_results[product_id] = average_mae
    random_forest_models[product_id] = model
    print(f'Average Mean Absolute Error for {product_id}: {average_mae}')

# Print average MAE across all products
average_mae_across_products = np.mean(list(product_results.values()))
print(f'Average Mean Absolute Error across all products: {average_mae_across_products}')

# Handle possible convergence warnings
warnings.filterwarnings("ignore")


Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_2_197_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_080_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_090_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_120_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_252_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_555_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_586_CA_1_validation: 4.685929990104027
Analyzing product: FOODS_3_808_CA_1_validation
Average Mean Absolute Error for FOODS_3_587_CA_1_validation: 4.68592999