In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
import sys
import xgboost as xgb
import lightgbm as lgb
from datetime import datetime
import seaborn as sns
import itertools

# Make the project's helper module importable; the path is relative to the
# directory the notebook kernel is started from.
sys.path.append("../dora/models")
from utils import read_data, process_time, merge_data, dataset_builder, cumulative_sale_by_category

# NOTE(review): not referenced by any cell visible in this notebook.
NUMBER_OF_LAGS = 4

# NOTE(review): this is the legacy ARIMA implementation (fit(disp=...) and a
# tuple-returning forecast()); newer statsmodels releases ship the replacement
# under statsmodels.tsa.arima.model with a different API — confirm the pinned version.
from statsmodels.tsa.arima_model import ARIMA
from tqdm import tqdm

In [3]:
# Read the three raw tables and build the modelling frame.
infos, items, orders = read_data("../main/datasets/")
process_time(orders)  # return value is discarded, so this presumably edits `orders` in place

# Order rows per item from the largest group_backwards value down to the smallest.
df = dataset_builder(orders, items).sort_values(
    by=['itemID', 'group_backwards'],
    ascending=[True, False],
)

In [4]:
# Keep only itemIDs that have more than MIN_NONZERO_PERIODS periods with
# non-zero sales (presumably so each series is long enough for the ARIMA fits).
MIN_NONZERO_PERIODS = 11

not_zero = df.query('orderSum != 0')

# number of non-zero-sales rows per item
id_count = not_zero.groupby('itemID')['group_backwards'].count()
valid_id = id_count[id_count > MIN_NONZERO_PERIODS].index

# all rows (including zero-sales periods) of the retained items
df2 = df[df.itemID.isin(valid_id)]

## Training ARIMA for each itemID

In [7]:
from sklearn.metrics import mean_squared_error


def get_arrima_best_param(id):
    """Grid-search the ARIMA (p, d, q) order for one item on two hold-out periods.

    For every order in {0, 1, 2}^3 two models are fitted on the item's
    ``orderSum`` series taken from the module-level ``df2``:

    * model A on the rows with ``group_backwards > 2``; its 2-step forecast is
      scored against the actual value at ``group_backwards == 2``;
    * model B on the rows with ``group_backwards > 1``; its 1-step forecast is
      scored against the actual value at ``group_backwards == 1``.

    The order with the smallest sum of the two squared errors wins. Orders for
    which fitting or forecasting raises are skipped.

    Parameters
    ----------
    id : scalar
        itemID to model; interpolated into a ``DataFrame.query`` expression.

    Returns
    -------
    tuple
        ``(pred_A_step1, pred_A_step2, pred_B_step1, err_A, err_B, order)``
        where ``err_A`` / ``err_B`` are the squared errors of ``pred_A_step1``
        and ``pred_B_step1``. If no order could be fitted the result is
        ``(0, 0, 0, 0, 0, None)``.

    Raises
    ------
    ValueError
        If the item does not have exactly one row for ``group_backwards == 2``
        or for ``group_backwards == 1``.
    """
    item_rows = df2.query(f'itemID == {id}')
    ts = item_rows.query('group_backwards > 2')["orderSum"]
    ts2 = item_rows.query('group_backwards > 1')["orderSum"]

    # .item() insists on exactly one matching row instead of relying on the
    # deprecated int(Series) conversion.
    result1 = int(item_rows.query('group_backwards == 2')["orderSum"].item())
    result2 = int(item_rows.query('group_backwards == 1')["orderSum"].item())

    # All (p, d, q) triplets with each component in 0..2, same order as
    # itertools.product(range(3), range(3), range(3)).
    pdq = list(itertools.product(range(3), repeat=3))

    # grid search state
    bestParam = None
    bestpredict = (0, 0, 0)
    bestMRSE = np.inf  # best summed squared error seen so far
    erro1 = 0
    erro2 = 0

    for param in pdq:
        try:
            # model A: trained on group_backwards > 2
            model_fit = ARIMA(ts, order=param).fit(disp=0)

            # forecast() of the legacy results object returns a tuple; element 0
            # holds the point forecasts
            predict = model_fit.forecast(steps=2)[0]
            predict_error = mean_squared_error([predict[0]], [result1])
            currentScore = predict_error

            # already worse than the best total: the second fit cannot help
            if currentScore > bestMRSE:
                continue

            # model B: trained on group_backwards > 1
            model_fit = ARIMA(ts2, order=param).fit(disp=0)

            predict2 = model_fit.forecast(1)[0]
            predict_error2 = mean_squared_error(predict2, [result2])
            currentScore += predict_error2

            if currentScore < bestMRSE:
                bestMRSE = currentScore
                erro1 = predict_error
                erro2 = predict_error2
                bestParam = param
                bestpredict = (predict[0], predict[1], predict2[0])
        except Exception:
            # Best effort: an order that cannot be fitted or forecast is simply
            # skipped. Unlike a bare `except:`, this lets KeyboardInterrupt and
            # SystemExit through so the long grid search can be interrupted.
            continue

    return bestpredict[0], bestpredict[1], bestpredict[2], erro1, erro2, bestParam

In [8]:
import warnings
warnings.filterwarnings("ignore")

# Run the grid search once per item and keep each returned 6-tuple.
# The comprehension variable stays local, so the builtin `id` is no longer
# rebound at notebook scope as it was by the original `for id in ...` loop.
search_results = [
    get_arrima_best_param(item_id) for item_id in tqdm(df2.itemID.unique())
]

# Split the tuples into parallel lists, ordered like df2.itemID.unique().
predict1_week2 = [res[0] for res in search_results]
predict1_week1 = [res[1] for res in search_results]
predict2_week1 = [res[2] for res in search_results]
mrse1 = [res[3] for res in search_results]
mrse2 = [res[4] for res in search_results]
best_params = [res[5] for res in search_results]

100%|██████████| 232/232 [03:07<00:00,  1.24it/s]


In [10]:
# Count the items whose first error exceeds the second, then the reverse.
error_pairs = list(zip(mrse1, mrse2))
print(sum(first > second for first, second in error_pairs))
print(sum(first < second for first, second in error_pairs))

107
124


## Results:

In [11]:
def _expand_to_all_items(predictions, item_ids, n_items):
    """Place per-item forecasts into a series covering itemIDs 1..n_items.

    Items without a forecast keep 0.0 and negative forecasts are clamped to 0.
    ``predictions`` and ``item_ids`` must have the same length and order.
    Assumes every value in ``item_ids`` lies in 1..n_items — TODO confirm
    against the items table.
    """
    full = pd.Series(0.0, index=np.arange(1, n_items + 1))
    # Assign by position, not by the predictions' own index.
    full.loc[item_ids] = np.asarray(predictions, dtype=float)
    return full.clip(lower=0)


# The forecast lists were filled while iterating df2.itemID.unique(), so this
# array gives the itemID belonging to each position.
predicted_ids = df2.itemID.unique()

# Previously the forecasts were never written into final1/final2/final3, which
# left all three series at zero.
predict1_week2 = pd.Series(predict1_week2)
final1 = _expand_to_all_items(predict1_week2, predicted_ids, len(items))

predict1_week1 = pd.Series(predict1_week1)
final2 = _expand_to_all_items(predict1_week1, predicted_ids, len(items))

predict2_week1 = pd.Series(predict2_week1)
final3 = _expand_to_all_items(predict2_week1, predicted_ids, len(items))

In [12]:
def baseline_score(prediction, target, simulatedPrice):
    """Return the total monetary score of a demand forecast.

    The forecast is truncated to integers. Each forecast unit earns its price,
    and each unit forecast above ``target`` is charged 1.6 times its price.

    ``prediction`` must provide ``.astype`` (NumPy array or pandas Series).
    When pandas objects are passed, the arithmetic follows pandas index
    alignment.
    """
    units = prediction.astype(int)
    surplus = np.maximum(units - target, 0)
    scored_units = units - surplus * 1.6
    return np.sum(scored_units * simulatedPrice)

In [13]:
# Actual orderSum of the two hold-out periods, in df's current row order.
# NOTE(review): scoring pairs these arrays with final1..3 by position, which
# presumably needs df to hold one row per item per period — confirm.
result1 = df.loc[df['group_backwards'] == 2, 'orderSum'].values
result2 = df.loc[df['group_backwards'] == 1, 'orderSum'].values

prices = infos['simulationPrice']

score_train1 = baseline_score(final1, result1, prices)
print(f"train1: {score_train1}")

score_eval = baseline_score(final2, result1, prices)
print(f"eval: {score_eval}")

score_train2 = baseline_score(final3, result2, prices)
print(f"train2 : {score_train2}")

train1: -20042.972000000045
eval: -88474.76200000006
train2 : 145713.642


In [15]:
# the results look quite unstable

### to csv

In [14]:
# Write final3 as a pipe-separated file with columns itemID|demandPrediction.
final3.to_csv(
    "pred/arrima.csv",
    sep="|",
    index_label="itemID",
    header=["demandPrediction"],
)