In [1]:
import numpy as np
import pandas as pd

from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error as mse
import sys
from datetime import datetime
import itertools

sys.path.append("../dora/models")
from utils import read_data, process_time, merge_data, dataset_builder, cumulative_sale_by_category

from tqdm import tqdm

In [2]:
# Name of the dataset column the notebook aggregates and forecasts by; it is
# also interpolated into the output CSV file name and used as its index label.
CATEGORY = "category2"

In [3]:
# Load the three raw tables via the project helper imported from utils.
infos, items, orders = read_data("../main/datasets/")
# The return value is discarded -- presumably process_time mutates `orders`
# in place; TODO confirm against utils.process_time.
process_time(orders)
# Build the modelling frame from orders and items (one row per
# group_backwards/itemID pair, judging by the frame displayed in the next cell).
df = dataset_builder(orders, items)


In [4]:
# Inspect the assembled frame (pandas truncates the rendering of long frames).
display(df)

Unnamed: 0,group_backwards,itemID,orderSum,brand,manufacturer,customerRating,category1,category2,category3,recommendedRetailPrice
0,13,1,0.0,0,1,4.38,1,1,1,8.84
1,13,2,0.0,0,2,3.00,1,2,1,16.92
2,13,3,1.0,0,3,5.00,1,3,1,15.89
3,13,4,0.0,0,2,4.44,1,2,1,40.17
4,13,5,2.0,0,2,2.33,1,1,1,17.04
...,...,...,...,...,...,...,...,...,...,...
136014,1,10459,0.0,180,253,0.00,8,44,8,56.57
136015,1,10460,0.0,0,253,0.00,8,44,8,163.81
136016,1,10461,0.0,0,253,0.00,8,44,8,128.01
136017,1,10462,0.0,180,253,0.00,8,44,8,166.97


In [5]:
# Total orderSum per (category, group_backwards) pair. Rows are ordered by
# category ascending and, within a category, by group_backwards descending
# (13 -> 1) -- presumably oldest period first; confirm against dataset_builder.
df2 = (
    df.groupby([CATEGORY, 'group_backwards'], as_index=False)['orderSum']
    .sum()
    .sort_values(by=[CATEGORY, 'group_backwards'], ascending=[True, False])
)
print(df2)

     category2  group_backwards  orderSum
12           1               13    4332.0
11           1               12    6196.0
10           1               11    7530.0
9            1               10    8329.0
8            1                9    6014.0
..         ...              ...       ...
667         52                5       0.0
666         52                4       0.0
665         52                3       0.0
664         52                2      81.0
663         52                1       5.0

[676 rows x 3 columns]


## Training ARIMA for each CATEGORY

In [6]:
from sklearn.metrics import mean_squared_error


def get_arrima_best_param(cat):
    """Grid-search ARIMA orders for one category and keep the best forecast.

    The ``orderSum`` series for ``cat`` is read from the module-level ``df2``
    frame. Rows with ``group_backwards > 1`` form the training series (in the
    row order of ``df2``); the row with ``group_backwards == 1`` is the
    held-out target. Every ``(p, d, q)`` order with each term in ``{0, 1, 2}``
    is fitted, and the order whose one-step forecast has the smallest squared
    error against the target wins.

    Note that the target is used to *select* the order, so the returned error
    is an optimistic (in-sample-selected) figure, not an unbiased estimate.

    Parameters
    ----------
    cat
        Value of the ``CATEGORY`` column identifying the series to model.

    Returns
    -------
    tuple
        ``(best_forecast, best_squared_error, best_order)``. When no order
        can be fitted the result is ``(0, np.inf, None)``.

    Raises
    ------
    ValueError
        If ``df2`` contains no ``group_backwards == 1`` row for ``cat``.
    """
    aux = df2.query(f'{CATEGORY} == {cat}')

    ts = aux.query('group_backwards > 1')["orderSum"].reset_index(drop=True)

    holdout = aux.query('group_backwards == 1')["orderSum"]
    if holdout.empty:
        raise ValueError(
            f"no group_backwards == 1 row found for {CATEGORY} == {cat}"
        )
    # Extract the scalar explicitly: calling int() directly on a one-element
    # Series is deprecated in recent pandas releases.
    result = int(holdout.iloc[0])

    bestMRSE = np.inf
    bestParam = None
    bestpredict = 0

    # Generate all different combinations of (p, d, q) triplets, in the same
    # order as itertools.product(range(3), range(3), range(3)).
    for param in itertools.product(range(3), repeat=3):
        try:
            model_fit = ARIMA(ts, order=param).fit(disp=0)
            # The first element of the forecast() result is the array of
            # point forecasts; with steps=1 it holds a single value.
            predict = model_fit.forecast(1)[0]
            currentScore = mean_squared_error(predict, [result])
        except Exception:
            # Best effort: orders that cannot be estimated on this series are
            # skipped. Unlike a bare ``except:``, this lets KeyboardInterrupt
            # and SystemExit propagate so the search can be aborted.
            continue

        if currentScore < bestMRSE:
            bestMRSE = currentScore
            bestParam = param
            bestpredict = predict[0]

    return bestpredict, bestMRSE, bestParam

    

In [7]:
import warnings
# NOTE: this filter is global -- it silences every warning for the rest of the
# kernel session, not only the ones emitted while fitting the models below.
warnings.filterwarnings("ignore")

# Parallel lists with one entry per category, filled in the order of
# df2[CATEGORY].unique().
predict = []
# Squared error of each category's best forecast (despite the name, the
# scoring function takes no square root).
mrse = []
best_params = []
for cat in tqdm(df2[CATEGORY].unique()):
    p1,er1, param = get_arrima_best_param(cat)
    
    predict.append(p1)
    mrse.append(er1)
    best_params.append(param)

100%|██████████| 52/52 [00:54<00:00,  1.04s/it]


## Putting results in a CSV (use script 12 to compare scores)

In [8]:
# The forecasts in `predict` were produced by iterating df2[CATEGORY].unique(),
# so the index must be taken from df2 as well. df[CATEGORY].unique() lists the
# categories in order of first appearance in df, which need not match the
# sorted order of df2 and would attach forecasts to the wrong categories.
result = pd.DataFrame(index=df2[CATEGORY].unique(), data={'pred': predict})
# A negative sales forecast is not meaningful; floor it at zero.
result[result < 0] = 0
result.to_csv(f"pred/arrima_{CATEGORY}.csv", header=["orderSum_cat_arrima"],
            index_label=CATEGORY, sep="|")