In [1]:
# Nativos
import os
import sys

#calculo
import numpy as np
import pandas as pd
from IPython.display import display

#grafico
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid")

# Suppress every warning category for the rest of the session (the filter below
# has no category argument, so it is not limited to FutureWarning).
import warnings
warnings.filterwarnings('ignore')

from extract import Extract

### OJO: A pesar de tener datos hasta el 03/02/2019, solo se manejará hasta el 27/01/2019 con el fin de poder comparar los resultados de ambos modelos.

In [2]:
# Load the sales workbook, parsing the 'Fecha' column as datetimes.
# NOTE(review): sort_index() runs while the frame still has its default positional
# index, so it does not order the rows by date — confirm whether a date sort was intended.
df = pd.read_excel(
    'https://s3.amazonaws.com/amp.ec/peru21_new.xlsx', parse_dates=['Fecha']
).sort_index()

# Index the rows by date while keeping 'Fecha' available as a regular column.
df.index = df['Fecha']
# Clear the index name inherited from the column. `del df.index.name` raises
# AttributeError on current pandas (the property has no deleter); assigning None
# has the same effect and works on every pandas version.
df.index.name = None
df.head()

KeyboardInterrupt: 

OBSERVACION, por casos de datos faltantes

In [None]:
# Spot-check a single canilla (code 1300000384): last rows of its
# 'Pauta', 'VentaNeta' and 'Proy Holt' columns.
canilla_mask = df['CodigoSapCanilla'] == 1300000384
df.loc[canilla_mask, ['Pauta', 'VentaNeta', 'Proy Holt']].tail()

Filtrado

In [None]:
# Keep rows that have a positive 'Pauta' or a non-null 'VentaNeta'.
has_pauta = df['Pauta'].gt(0)
has_venta = df['VentaNeta'].notna()
df = df.loc[has_pauta | has_venta]

No se descartan todos los valores donde la pauta es nula (AUSENCIA DE CANILLA), para poder analizar la variación del pronóstico considerando estos días de falta entre el 21 y el 27 de enero.

## Validacion de nulos Globales

In [None]:
# Per-column count of missing values, shown next to the frame's (rows, columns) shape.
(df.isna().sum(), df.shape)

## Configuracion

In [None]:
# Date handed to extractor.inspect_dict as `init` — presumably the first forecast day; confirm in Extract.
init_prediction = '2019-01-21'
# Date handed to extractor.cut_serie as `since`.
since_date = '2019-01-20'

In [None]:
# One entry per agency name, holding the largest 'CodigoSapAgencia' seen for that name.
agencias_detectadas = df.groupby('NombreAgencia')['CodigoSapAgencia'].max()
agencias_detectadas

## Creación de contenedor de canillas por agencia

In [None]:
# Wrap the filtered frame in the project's Extract helper (imported from the local
# `extract` module); every per-agency step below goes through this object.
extractor = Extract(df)

### CALLAO

In [None]:
# Build the container for agency code 1200000065 (the CALLAO section) and pass it
# to the extractor's valid_null step.
# NOTE(review): Extract is not shown here — presumably populate_data returns a
# per-canilla dict and valid_null reports missing values; confirm in extract.py.
dicc_callao = extractor.populate_data(1200000065)
extractor.valid_null(dicc_callao)

In [None]:
# Inspect the CALLAO container with n=3 and init_prediction as `init`.
# NOTE(review): n is presumably the number of entries shown — confirm in Extract.inspect_dict.
extractor.inspect_dict(dicc_callao, n=3, init=init_prediction)

In [None]:
# Replace the CALLAO container with the output of cut_serie, using since_date as `since`.
# NOTE(review): presumably trims each series relative to that date — confirm the
# direction and inclusivity in Extract.cut_serie.
dicc_callao = extractor.cut_serie(dicc_callao, since=since_date)

### CANTO GRANDE

In [None]:
# Build the container for agency code 1200001015 (the CANTO GRANDE section) and
# pass it to the extractor's valid_null step.
# NOTE(review): Extract is not shown here — presumably populate_data returns a
# per-canilla dict and valid_null reports missing values; confirm in extract.py.
dicc_canto_grande = extractor.populate_data(1200001015)
extractor.valid_null(dicc_canto_grande)

In [None]:
# Inspect the CANTO GRANDE container with n=3 and init_prediction as `init`.
# NOTE(review): n is presumably the number of entries shown — confirm in Extract.inspect_dict.
extractor.inspect_dict(dicc_canto_grande, n=3, init=init_prediction)

In [None]:
# Replace the CANTO GRANDE container with the output of cut_serie, using since_date as `since`.
# NOTE(review): presumably trims each series relative to that date — confirm the
# direction and inclusivity in Extract.cut_serie.
dicc_canto_grande = extractor.cut_serie(dicc_canto_grande, since=since_date)

### JESUS MARIA

In [None]:
# Build the container for agency code 1200000047 (the JESUS MARIA section) and
# pass it to the extractor's valid_null step.
# NOTE(review): Extract is not shown here — presumably populate_data returns a
# per-canilla dict and valid_null reports missing values; confirm in extract.py.
dicc_jesus_maria = extractor.populate_data(1200000047)
extractor.valid_null(dicc_jesus_maria)

In [None]:
# Inspect the JESUS MARIA container with n=3 and init_prediction as `init`.
# NOTE(review): n is presumably the number of entries shown — confirm in Extract.inspect_dict.
extractor.inspect_dict(dicc_jesus_maria, n=3, init=init_prediction)

In [None]:
# Replace the JESUS MARIA container with the output of cut_serie, using since_date as `since`.
# NOTE(review): presumably trims each series relative to that date — confirm the
# direction and inclusivity in Extract.cut_serie.
dicc_jesus_maria = extractor.cut_serie(dicc_jesus_maria, since=since_date)

### VISUALIZACION POR PERCENTILES

### CALLAO

In [None]:
# Percentile view of the CALLAO container.
# NOTE(review): what is drawn is defined in Extract.show_percentile, not shown here.
extractor.show_percentile(dicc_callao)

### CANTO GRANDE

In [None]:
# Percentile view of the CANTO GRANDE container.
# NOTE(review): what is drawn is defined in Extract.show_percentile, not shown here.
extractor.show_percentile(dicc_canto_grande)

### JESUS MARIA

In [None]:
# Percentile view of the JESUS MARIA container.
# NOTE(review): what is drawn is defined in Extract.show_percentile, not shown here.
extractor.show_percentile(dicc_jesus_maria)

A partir de los gráficos generales de los datos, se toma la decisión de usar los percentiles
5% y 95% como límites de corte.

In [None]:
%%time
# Upper and lower cut fractions (0.95 and 0.05) handed to the extractor's noise control.
cut_up = 95/100
cut_down = 5/100

# Each container is replaced by the output of noise_control with the same bounds.
# NOTE(review): presumably values outside the two percentiles are clipped or replaced —
# confirm in Extract.noise_control. Re-running this cell applies it to already-treated data.
dicc_callao = extractor.noise_control(dicc_callao, cut_down, cut_up)
dicc_canto_grande = extractor.noise_control(dicc_canto_grande, cut_down, cut_up)
dicc_jesus_maria = extractor.noise_control(dicc_jesus_maria, cut_down, cut_up)

In [None]:
# Plot the extractor's noise-difference view for CALLAO with n=5.
# NOTE(review): presumably compares series before/after noise control for 5 entries — confirm in Extract.
extractor.plot_diff_noise(dicc_callao, n=5)

In [None]:
# Plot the extractor's noise-difference view for CANTO GRANDE with n=5.
# NOTE(review): presumably compares series before/after noise control for 5 entries — confirm in Extract.
extractor.plot_diff_noise(dicc_canto_grande, n=5)

In [None]:
# Plot the extractor's noise-difference view for JESUS MARIA with n=5.
# NOTE(review): presumably compares series before/after noise control for 5 entries — confirm in Extract.
extractor.plot_diff_noise(dicc_jesus_maria, n=5)

# MODELADO DE DATOS

In [None]:
from statsmodels.tsa.arima_model import ARIMA
from itertools import product

In [None]:
# Candidate non-seasonal orders: p and q each take the values 2, 3, 4.
ps, qs = range(2, 5), range(2, 5)
# Candidate seasonal orders: P and Q each take the values 0, 1.
Ps, Qs = range(0, 2), range(0, 2)
# Differencing orders are fixed rather than searched.
d, D = 1, 1
# Seasonal period of 7 observations (weekly).
s = 7

# Every (p, q, P, Q) combination, materialised as a list for the grid search.
parameters = product(ps, qs, Ps, Qs)
parameters_list = list(parameters)
len(parameters_list)

In [None]:
from tqdm import tqdm_notebook
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

def optimizeSARIMA(data, parameters_list, d, D, s):
    """Grid-search SARIMA orders and rank the candidates by AIC.

    One ``SARIMAX`` model is fitted per candidate in ``parameters_list``;
    candidates whose fit raises are skipped.

    Parameters
    ----------
    data
        Observations, passed unchanged to ``SARIMAX`` as the series to fit.
    parameters_list
        Iterable of ``(p, q, P, Q)`` tuples to try.
    d
        Differencing order used in ``order=(p, d, q)``.
    D
        Seasonal differencing order used in ``seasonal_order=(P, D, Q, s)``.
    s
        Seasonal period used in ``seasonal_order=(P, D, Q, s)``.

    Returns
    -------
    pandas.DataFrame
        Columns ``parameters`` and ``aic``, sorted by ``aic`` ascending with a
        fresh 0..n-1 index, so row 0 is the best candidate. When no candidate
        could be fitted the frame is empty but still has both columns.
    """
    results = []

    for param in tqdm_notebook(parameters_list):
        try:
            model = sm.tsa.statespace.SARIMAX(
                data,
                order=(param[0], d, param[1]),
                seasonal_order=(param[2], D, param[3], s)
            ).fit(disp=-1)
        except Exception:
            # Some order combinations cannot be fitted; skip them. Catching
            # Exception (not a bare except) lets KeyboardInterrupt stop the search.
            continue
        results.append([param, model.aic])

    # Naming the columns in the constructor keeps this valid when `results` is empty;
    # assigning `.columns` afterwards raises a length-mismatch ValueError in that case.
    result_table = pd.DataFrame(results, columns=['parameters', 'aic'])

    return result_table.sort_values(
        by='aic', ascending=True
    ).reset_index(drop=True)

## CALLAO

In [None]:
%%time
from tqdm import tqdm_notebook

# For every CALLAO canilla: pick the best SARIMA orders by AIC, refit with them
# and store a 7-step forecast under the entry's 'result' key.
for canilla, registro in dicc_callao.items():
    try:
        # Replace non-positive observations with 0.1 before fitting.
        serie = registro.get('data_cut').apply(lambda val: 0.1 if val <= 0 else val)
        registro['data_cut'] = serie

        # Row 0 of the AIC-sorted table holds the best (p, q, P, Q).
        ranking = optimizeSARIMA(serie, parameters_list, d, D, s)
        p, q, P, Q = ranking.parameters[0]

        print("Canilla : ", canilla, " --> ", p, ' 1 ', q, P,' 1 ', Q, s)
        model = sm.tsa.statespace.SARIMAX(
            serie, order=(p, d, q), seasonal_order=(P, D, Q, s)
        ).fit()

        # Predict the 7 positions that follow the last observation.
        n_obs = serie.shape[0]
        forecast = model.predict(start=n_obs, end=n_obs + 6)
        print(forecast)
        registro['result'] = forecast
    except Exception as err:
        # A failure for one canilla is reported and the loop moves on to the next.
        print("ERROR: ", str(err))