In [2]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.tsa.stattools import pacf, acf
from datetime import datetime

In [130]:
def create_corr_plot(series, plot_pacf=False):
    corr_array = pacf(series.dropna(), alpha=0.05) if plot_pacf else acf(series.dropna(), alpha=0.05)
    lower_y = corr_array[1][:,0] - corr_array[0]
    upper_y = corr_array[1][:,1] - corr_array[0]

    fig = go.Figure()
    [fig.add_scatter(x=(x,x), y=(0,corr_array[0][x]), mode='lines',line_color='#3f3f3f') 
     for x in range(len(corr_array[0]))]
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=corr_array[0], mode='markers', marker_color='#1f77b4',
                   marker_size=12)
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=lower_y, mode='lines',fillcolor='rgba(32, 146, 230,0.3)',
            fill='tonexty', line_color='rgba(255,255,255,0)')
    fig.update_traces(showlegend=False)
    fig.update_xaxes(range=[-1,42])
    fig.update_yaxes(zerolinecolor='#000000')
    
    title='Partial Autocorrelation (PACF)' if plot_pacf else 'Autocorrelation (ACF)'
    fig.update_layout(title=title)
    fig.show()

In [3]:
df_gmc = pd.read_csv('../data/gmc/sigesguarda_cleaned.csv')
df_gmc['OCORRENCIA_DATA_SEM_HORARIO'] = pd.to_datetime(df_gmc['OCORRENCIA_DATA'], format='%Y-%m-%d %H:%M:%S.%f').dt.date

df_gmc.sample(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,ATENDIMENTO_ANO,ATENDIMENTO_BAIRRO_NOME,EQUIPAMENTO_URBANO_NOME,FLAG_EQUIPAMENTO_URBANO,FLAG_FLAGRANTE,LOGRADOURO_NOME,NATUREZA1_DEFESA_CIVIL,NATUREZA1_DESCRICAO,NATUREZA2_DEFESA_CIVIL,NATUREZA2_DESCRICAO,...,OPERACAO_DESCRICAO,ORIGEM_CHAMADO_DESCRICAO,REGIONAL_FATO_NOME,SECRETARIA_NOME,SECRETARIA_SIGLA,SERVICO_NOME,SITUACAO_EQUIPE_DESCRICAO,NUMERO_PROTOCOLO_156,OCORRENCIA_DATA_SEM_HORARIO,FERIADO
349141,2021,SITIO CERCADO,,NÃO,NÃO,IZAAC FERREIRA DA CRUZ,0,Trânsito,,,...,NORMAL,CCO MURALHA DIGITAL,BAIRRO NOVO,SECRETARIA MUNICIPAL DE DEFESA SOCIAL E TRANSITO,SMDT,FISCALIZAÇÃO DE TRÂNSITO,Solicitação,,2021-05-28,0
74178,2012,ALTO BOQUEIRAO,CENTRO DE EDUCAÇÃO INTEGRAL MAESTRO BENTO MOSS...,SIM,NÃO,CAPITÃO ROBERTO LOPES QUINTAS,0,Apoio,,,...,,153,BOQUEIRÃO,SECRETARIA MUNICIPAL DA EDUCAÇÃO,SME,NORMAL,,,2012-04-02,0
219530,2018,AUGUSTA,,NÃO,NÃO,JAN FREDERIK DE JAGER,0,Alarmes,,,...,NORMAL,FONE,CIC,FUNDAÇÃO DE AÇÃO SOCIAL,FAS,NORMAL,Deslocamento,,2018-07-14,0
61378,2011,XAXIM,CENTRO DE EDUCAÇÃO INTEGRAL DAVID CARNEIRO,SIM,NÃO,JOÃO BATISTA ZAGONEL PASSOS,0,Furto,,,...,,153,BOQUEIRÃO,SECRETARIA MUNICIPAL DE DEFESA SOCIAL E TRANSITO,SMDT,NORMAL,,,2011-08-07,0
73008,2012,CENTRO,,NÃO,NÃO,PEDRO IVO,0,Material abandonado,,,...,,À VIATURA,MATRIZ,SECRETARIA MUNICIPAL DE DEFESA SOCIAL E TRANSITO,SMDT,NORMAL,,,2012-03-13,0


In [166]:
df_series = df_gmc[['ATENDIMENTO_ANO','ATENDIMENTO_BAIRRO_NOME','OCORRENCIA_ANO','OCORRENCIA_DATA','OCORRENCIA_DIA_SEMANA','OCORRENCIA_MES','NATUREZA1_DESCRICAO']].copy()

df_series['OCORRENCIA_DATA'] = pd.to_datetime(df_series['OCORRENCIA_DATA'], format='%Y-%m-%d %H:%M:%S.%f')
df_series['OCORRENCIA_DIA'] = df_series['OCORRENCIA_DATA'].apply(lambda x: x.day)
df_series['OCORRENCIA_HORA'] = df_series['OCORRENCIA_DATA'].apply(lambda x: x.hour)
df_series['OCORRENCIA_MINUTO'] = df_series['OCORRENCIA_DATA'].apply(lambda x: x.minute)

df_series = df_series[['ATENDIMENTO_ANO','OCORRENCIA_DIA','OCORRENCIA_MES','OCORRENCIA_ANO','OCORRENCIA_DIA_SEMANA','OCORRENCIA_HORA','OCORRENCIA_MINUTO','ATENDIMENTO_BAIRRO_NOME','NATUREZA1_DESCRICAO']]
df_series.tail()

Unnamed: 0,ATENDIMENTO_ANO,OCORRENCIA_DIA,OCORRENCIA_MES,OCORRENCIA_ANO,OCORRENCIA_DIA_SEMANA,OCORRENCIA_HORA,OCORRENCIA_MINUTO,ATENDIMENTO_BAIRRO_NOME,NATUREZA1_DESCRICAO
389959,2022,28,2,2022,SEGUNDA,22,0,BATEL,Atitude Suspeita (Abordagem)
389960,2022,28,2,2022,SEGUNDA,22,19,CENTRO,Apoio
389961,2022,28,2,2022,SEGUNDA,22,58,CAJURU,Apoio
389962,2022,28,2,2022,SEGUNDA,22,40,SITIO CERCADO,Furto
389963,2022,28,2,2022,SEGUNDA,23,29,CAPAO RASO,Apoio


In [167]:
df_alltime = df_gmc.groupby(['OCORRENCIA_DATA_SEM_HORARIO']).size().reset_index(name='OCORRENCIAS_ATENDIDAS')
df_alltime.head()

Unnamed: 0,OCORRENCIA_DATA_SEM_HORARIO,OCORRENCIAS_ATENDIDAS
0,2009-01-01,30
1,2009-01-02,81
2,2009-01-03,96
3,2009-01-04,93
4,2009-01-05,59


In [168]:
px.line(df_alltime,x='OCORRENCIA_DATA_SEM_HORARIO',y='OCORRENCIAS_ATENDIDAS')

# ARIMA Model

In [146]:
def adfuller_test(series,diff=False):
    if diff:
        series = series.diff()[1:]
    result = adfuller(np.array(series))
    print('ADF Statistic: {}'.format(result[0]))
    print('p-value: {}'.format(result[1]))

adfuller_test(df_alltime['OCORRENCIAS_ATENDIDAS'].squeeze())

ADF Statistic: -2.940694796243668
p-value: 0.04081628718101317


In [147]:
create_corr_plot(df_alltime['OCORRENCIAS_ATENDIDAS'].squeeze(), plot_pacf=True)

In [148]:
create_corr_plot(df_alltime['OCORRENCIAS_ATENDIDAS'].squeeze())

In [149]:
from statsmodels.tsa.arima.model import ARIMA

In [150]:

series = np.array(df_alltime['OCORRENCIAS_ATENDIDAS'].squeeze())
arima_model = ARIMA(series, order=(2,2,7))
model = arima_model.fit()
print(model.summary())


Non-invertible starting MA parameters found. Using zeros as starting parameters.


Maximum Likelihood optimization failed to converge. Check mle_retvals



                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 4807
Model:                 ARIMA(2, 2, 7)   Log Likelihood              -21525.735
Date:                Mon, 15 Aug 2022   AIC                          43071.471
Time:                        11:01:04   BIC                          43136.245
Sample:                             0   HQIC                         43094.219
                               - 4807                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -1.8002      0.002   -732.282      0.000      -1.805      -1.795
ar.L2         -0.9962      0.003   -385.008      0.000      -1.001      -0.991
ma.L1          0.2230      0.006     37.837      0.0

In [151]:
df_result = pd.DataFrame([series,model.predict(start=0,end=10000)]).transpose().reset_index()
px.line(df_result,x='index',y=df_result.columns)

## Regression Model

In [94]:
from statsmodels.regression.linear_model import OLS
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
import statsmodels.formula.api as smf
import math

In [5]:
def evaluate_ols(results, X, y):
    print(f'params:\n{results.params}\n')
    print(f'rsquared:\n{results.rsquared}\n')
    print(f'rsquared_adj:\n{results.rsquared_adj}\n')
    print(f'mse:\n{mean_squared_error(y, results.predict(X))}\n')
    print(f'rmse:\n{math.sqrt(mean_squared_error(y, results.predict(X)))}\n')
    print(f'mae:\n{mean_absolute_error(y, results.predict(X))}')

In [176]:
df_gmc_ols_model = df_gmc
df_gmc_ols_model = df_gmc_ols_model.groupby(['OCORRENCIA_DATA_SEM_HORARIO','FERIADO']).size().reset_index(name='OCORRENCIAS_ATENDIDAS')
df_gmc_ols_model['x0_final_de_semana'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO'].apply(lambda x: 1 if x.weekday() == 5 or x.weekday() == 6 else 0)
df_gmc_ols_model['x1_time'] = np.arange(df_gmc_ols_model.shape[0])
df_gmc_ols_model['x2_lag1'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(1).fillna(0)
df_gmc_ols_model['x3_lag2'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(2).fillna(0)
df_gmc_ols_model['x4_lag3'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(3).fillna(0)
df_gmc_ols_model['x5_lag4'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(4).fillna(0)
df_gmc_ols_model['x6_lag5'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(5).fillna(0)
df_gmc_ols_model['x7_lag6'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(6).fillna(0)
df_gmc_ols_model['x8_lag7'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(7).fillna(0)
df_gmc_ols_model['x9_window_mean'] = df_gmc_ols_model['OCORRENCIAS_ATENDIDAS'].shift(1).rolling(window=7).mean().fillna(0)
df_gmc_ols_model['x10_feriado'] = df_gmc_ols_model['FERIADO']
df_gmc_ols_model['x11_pandemia'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO'].apply(lambda x: 1 if x >= datetime.strptime('11/04/2020','%d/%m/%Y').date() else 0)
df_gmc_ols_model['x12_dia_semana'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO'].apply(lambda x: x.weekday())
df_gmc_ols_model

Unnamed: 0,OCORRENCIA_DATA_SEM_HORARIO,FERIADO,OCORRENCIAS_ATENDIDAS,x0_final_de_semana,x1_time,x2_lag1,x3_lag2,x4_lag3,x5_lag4,x6_lag5,x7_lag6,x8_lag7,x9_window_mean,x10_feriado,x11_pandemia,x12_dia_semana
0,2009-01-01,1,30,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1,0,3
1,2009-01-02,0,81,0,1,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,4
2,2009-01-03,0,96,1,2,81.0,30.0,0.0,0.0,0.0,0.0,0.0,0.000000,0,0,5
3,2009-01-04,0,93,1,3,96.0,81.0,30.0,0.0,0.0,0.0,0.0,0.000000,0,0,6
4,2009-01-05,0,59,0,4,93.0,96.0,81.0,30.0,0.0,0.0,0.0,0.000000,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4802,2022-02-24,0,166,0,4802,150.0,173.0,150.0,135.0,166.0,143.0,183.0,157.142857,0,1,3
4803,2022-02-25,0,143,0,4803,166.0,150.0,173.0,150.0,135.0,166.0,143.0,154.714286,0,1,4
4804,2022-02-26,0,142,1,4804,143.0,166.0,150.0,173.0,150.0,135.0,166.0,154.714286,0,1,5
4805,2022-02-27,0,143,1,4805,142.0,143.0,166.0,150.0,173.0,150.0,135.0,151.285714,0,1,6


### Model 1

In [21]:
X_model1, y_model1 = df_gmc_ols_model[['x1_time']], df_gmc_ols_model['OCORRENCIAS_ATENDIDAS']
print(X_model1, y_model1)

      x1_time
0           0
1           1
2           2
3           3
4           4
...       ...
4802     4802
4803     4803
4804     4804
4805     4805
4806     4806

[4807 rows x 1 columns] 0        30
1        81
2        96
3        93
4        59
       ... 
4802    166
4803    143
4804    141
4805    143
4806    131
Name: OCORRENCIAS_ATENDIDAS, Length: 4807, dtype: int64


In [22]:
model1 = OLS(y_model1, X_model1)
results1 = model1.fit()

evaluate_ols(results1, X_model1, y_model1)
results1.summary()

params:
x1_time    0.029766
dtype: float64

rsquared:
0.8036031169836728

rsquared_adj:
0.8035622520475479

mse:
1667.3619799182852

rmse:
40.83334397178714

mae:
30.70310163152059


0,1,2,3
Dep. Variable:,OCORRENCIAS_ATENDIDAS,R-squared (uncentered):,0.804
Model:,OLS,Adj. R-squared (uncentered):,0.804
Method:,Least Squares,F-statistic:,19660.0
Date:,"Thu, 18 Aug 2022",Prob (F-statistic):,0.0
Time:,13:21:11,Log-Likelihood:,-24652.0
No. Observations:,4807,AIC:,49310.0
Df Residuals:,4806,BIC:,49310.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1_time,0.0298,0.000,140.231,0.000,0.029,0.030

0,1,2,3
Omnibus:,1710.425,Durbin-Watson:,0.385
Prob(Omnibus):,0.0,Jarque-Bera (JB):,15203.937
Skew:,1.444,Prob(JB):,0.0
Kurtosis:,11.22,Cond. No.,1.0


In [23]:
df_results1 = pd.DataFrame([y_model1, results1.predict(X_model1)]).transpose().reset_index()
px.line(df_results1,x='index',y=df_results1.columns)

### Model 2

In [24]:
X_model2, y_model2 = df_gmc_ols_model[['x1_time','x0_final_de_semana']], df_gmc_ols_model['OCORRENCIAS_ATENDIDAS']
print(X_model2, y_model2)

      x1_time  x0_final_de_semana
0           0                   0
1           1                   0
2           2                   1
3           3                   1
4           4                   0
...       ...                 ...
4802     4802                   0
4803     4803                   0
4804     4804                   1
4805     4805                   1
4806     4806                   0

[4807 rows x 2 columns] 0        30
1        81
2        96
3        93
4        59
       ... 
4802    166
4803    143
4804    141
4805    143
4806    131
Name: OCORRENCIAS_ATENDIDAS, Length: 4807, dtype: int64


In [25]:
model2 = OLS(y_model2, X_model2)
results2 = model2.fit()

evaluate_ols(results2, X_model2, y_model2)
results2.summary()

params:
x1_time                0.027669
x0_final_de_semana    23.509353
dtype: float64

rsquared:
0.8182207489471467

rsquared_adj:
0.8181450864076867

mse:
1543.261824162206

rmse:
39.284371245601044

mae:
29.507128154505576


0,1,2,3
Dep. Variable:,OCORRENCIAS_ATENDIDAS,R-squared (uncentered):,0.818
Model:,OLS,Adj. R-squared (uncentered):,0.818
Method:,Least Squares,F-statistic:,10810.0
Date:,"Thu, 18 Aug 2022",Prob (F-statistic):,0.0
Time:,13:21:11,Log-Likelihood:,-24467.0
No. Observations:,4807,AIC:,48940.0
Df Residuals:,4805,BIC:,48950.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1_time,0.0277,0.000,120.074,0.000,0.027,0.028
x0_final_de_semana,23.5094,1.196,19.657,0.000,21.165,25.854

0,1,2,3
Omnibus:,1713.676,Durbin-Watson:,0.41
Prob(Omnibus):,0.0,Jarque-Bera (JB):,18842.634
Skew:,1.381,Prob(JB):,0.0
Kurtosis:,12.298,Cond. No.,5860.0


In [26]:
df_results2 = pd.DataFrame([y_model2,results2.predict(X_model2)]).transpose().reset_index()
px.line(df_results2,x='index',y=df_results2.columns)

### Model 3

In [27]:
X_model3, y_model3 = df_gmc_ols_model[['x0_final_de_semana','x1_time','x10_feriado']], df_gmc_ols_model['OCORRENCIAS_ATENDIDAS']
print(X_model3, y_model3)

      x0_final_de_semana  x1_time  x10_feriado
0                      0        0            1
1                      0        1            0
2                      1        2            0
3                      1        3            0
4                      0        4            0
...                  ...      ...          ...
4802                   0     4802            0
4803                   0     4803            0
4804                   1     4804            0
4805                   1     4805            0
4806                   0     4806            1

[4807 rows x 3 columns] 0        30
1        81
2        96
3        93
4        59
       ... 
4802    166
4803    143
4804    141
4805    143
4806    131
Name: OCORRENCIAS_ATENDIDAS, Length: 4807, dtype: int64


In [28]:
model3 = OLS(y_model3, X_model3)
results3 = model3.fit()

evaluate_ols(results3, X_model3, y_model3)
results3.summary()

params:
x0_final_de_semana    23.536811
x1_time                0.027603
x10_feriado            6.188158
dtype: float64

rsquared:
0.8183653484144037

rsquared_adj:
0.8182519212797749

mse:
1542.0342096995023

rmse:
39.26874341890127

mae:
29.502175775983993


0,1,2,3
Dep. Variable:,OCORRENCIAS_ATENDIDAS,R-squared (uncentered):,0.818
Model:,OLS,Adj. R-squared (uncentered):,0.818
Method:,Least Squares,F-statistic:,7215.0
Date:,"Thu, 18 Aug 2022",Prob (F-statistic):,0.0
Time:,13:21:12,Log-Likelihood:,-24465.0
No. Observations:,4807,AIC:,48940.0
Df Residuals:,4804,BIC:,48950.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x0_final_de_semana,23.5368,1.196,19.684,0.000,21.193,25.881
x1_time,0.0276,0.000,118.567,0.000,0.027,0.028
x10_feriado,6.1882,3.164,1.956,0.051,-0.015,12.392

0,1,2,3
Omnibus:,1718.669,Durbin-Watson:,0.41
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19014.097
Skew:,1.384,Prob(JB):,0.0
Kurtosis:,12.342,Cond. No.,15500.0


In [146]:
df_results3 = pd.DataFrame([y_model3, results3.predict(X_model3)]).transpose().reset_index()
df_results3['Data'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO']
px.line(df_results3,x='Data',y=df_results3.columns[1:3])

## Model 4

In [30]:
X_model4, y_model4 = df_gmc_ols_model[['x0_final_de_semana','x1_time','x10_feriado','x11_pandemia']], df_gmc_ols_model['OCORRENCIAS_ATENDIDAS']
print(X_model4, y_model4)

      x0_final_de_semana  x1_time  x10_feriado  x11_pandemia
0                      0        0            1             0
1                      0        1            0             0
2                      1        2            0             0
3                      1        3            0             0
4                      0        4            0             0
...                  ...      ...          ...           ...
4802                   0     4802            0             1
4803                   0     4803            0             1
4804                   1     4804            0             1
4805                   1     4805            0             1
4806                   0     4806            1             1

[4807 rows x 4 columns] 0        30
1        81
2        96
3        93
4        59
       ... 
4802    166
4803    143
4804    141
4805    143
4806    131
Name: OCORRENCIAS_ATENDIDAS, Length: 4807, dtype: int64


In [31]:
model4 = OLS(y_model4, X_model4)
results4 = model4.fit()

evaluate_ols(results4, X_model4, y_model4)

params:
x0_final_de_semana    27.224263
x1_time                0.022924
x10_feriado            9.490609
x11_pandemia          51.961276
dtype: float64

rsquared:
0.8466574786346737

rsquared_adj:
0.8465297730162141

mse:
1301.8408749801652

rmse:
36.08103206644961

mae:
27.03284092746062


In [32]:
df_results4 = pd.DataFrame([y_model4, results4.predict(X_model4)]).transpose().reset_index()
df_results4['Data'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO']
px.line(df_results4,x='Data',y=df_results4.columns[1:3])

## Model 5

In [33]:
X_model5, y_model5 = df_gmc_ols_model[['x0_final_de_semana','x1_time','x2_lag1','x3_lag2','x4_lag3','x5_lag4','x6_lag5','x7_lag6','x8_lag7','x10_feriado','x11_pandemia']], df_gmc_ols_model['OCORRENCIAS_ATENDIDAS']
print(X_model5, y_model5)

      x0_final_de_semana  x1_time  x2_lag1  x3_lag2  x4_lag3  x5_lag4  \
0                      0        0      0.0      0.0      0.0      0.0   
1                      0        1     30.0      0.0      0.0      0.0   
2                      1        2     81.0     30.0      0.0      0.0   
3                      1        3     96.0     81.0     30.0      0.0   
4                      0        4     93.0     96.0     81.0     30.0   
...                  ...      ...      ...      ...      ...      ...   
4802                   0     4802    150.0    173.0    150.0    135.0   
4803                   0     4803    166.0    150.0    173.0    150.0   
4804                   1     4804    143.0    166.0    150.0    173.0   
4805                   1     4805    141.0    143.0    166.0    150.0   
4806                   0     4806    143.0    141.0    143.0    166.0   

      x6_lag5  x7_lag6  x8_lag7  x10_feriado  x11_pandemia  
0         0.0      0.0      0.0            1             0  
1

In [34]:
model5 = OLS(y_model5, X_model5)
results5 = model5.fit()

evaluate_ols(results5, X_model5, y_model5)
results5.summary()

params:
x0_final_de_semana    8.672257
x1_time               0.001253
x2_lag1               0.367369
x3_lag2               0.106463
x4_lag3              -0.018537
x5_lag4               0.100958
x6_lag5              -0.000656
x7_lag6               0.193666
x8_lag7               0.167917
x10_feriado           1.208655
x11_pandemia          5.447578
dtype: float64

rsquared:
0.950666492210406

rsquared_adj:
0.9505533419631822

mse:
418.8295351791976

rmse:
20.465325191142153

mae:
13.233730546364932


0,1,2,3
Dep. Variable:,OCORRENCIAS_ATENDIDAS,R-squared (uncentered):,0.951
Model:,OLS,Adj. R-squared (uncentered):,0.951
Method:,Least Squares,F-statistic:,8402.0
Date:,"Thu, 18 Aug 2022",Prob (F-statistic):,0.0
Time:,13:21:17,Log-Likelihood:,-21332.0
No. Observations:,4807,AIC:,42690.0
Df Residuals:,4796,BIC:,42760.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x0_final_de_semana,8.6723,0.700,12.381,0.000,7.299,10.046
x1_time,0.0013,0.000,4.694,0.000,0.001,0.002
x2_lag1,0.3674,0.014,26.103,0.000,0.340,0.395
x3_lag2,0.1065,0.015,7.227,0.000,0.078,0.135
x4_lag3,-0.0185,0.015,-1.252,0.210,-0.048,0.010
x5_lag4,0.1010,0.015,6.846,0.000,0.072,0.130
x6_lag5,-0.0007,0.015,-0.043,0.965,-0.030,0.029
x7_lag6,0.1937,0.015,13.144,0.000,0.165,0.223
x8_lag7,0.1679,0.014,11.597,0.000,0.140,0.196

0,1,2,3
Omnibus:,5054.59,Durbin-Watson:,1.953
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2254713.117
Skew:,4.531,Prob(JB):,0.0
Kurtosis:,108.712,Cond. No.,15600.0


In [35]:
df_results5 = pd.DataFrame([y_model5, results5.predict(X_model5)]).transpose().reset_index()
df_results5['Data'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO']
px.line(df_results5,x='Data',y=df_results5.columns[1:3])

## Model 6

In [54]:
X_model6, y_model6 = df_gmc_ols_model[['x0_final_de_semana','x1_time','x2_lag1','x3_lag2','x4_lag3','x5_lag4','x6_lag5','x7_lag6','x8_lag7','x9_window_mean','x10_feriado','x11_pandemia']], df_gmc_ols_model['OCORRENCIAS_ATENDIDAS']
print(X_model6, y_model6)

      x0_final_de_semana  x1_time  x2_lag1  x3_lag2  x4_lag3  x5_lag4  \
0                      0        0      0.0      0.0      0.0      0.0   
1                      0        1     30.0      0.0      0.0      0.0   
2                      1        2     81.0     30.0      0.0      0.0   
3                      1        3     96.0     81.0     30.0      0.0   
4                      0        4     93.0     96.0     81.0     30.0   
...                  ...      ...      ...      ...      ...      ...   
4802                   0     4802    150.0    173.0    150.0    135.0   
4803                   0     4803    166.0    150.0    173.0    150.0   
4804                   1     4804    143.0    166.0    150.0    173.0   
4805                   1     4805    141.0    143.0    166.0    150.0   
4806                   0     4806    143.0    141.0    143.0    166.0   

      x6_lag5  x7_lag6  x8_lag7  x9_window_mean  x10_feriado  x11_pandemia  
0         0.0      0.0      0.0        0.00000

In [85]:
model6 = OLS(y_model6, X_model6)
results6 = model6.fit()

evaluate_ols(results6, X_model6, y_model6)
results6.summary()

params:
x0_final_de_semana    8.644637
x1_time               0.001277
x2_lag1               0.444248
x3_lag2               0.183312
x4_lag3               0.058254
x5_lag4               0.178308
x6_lag5               0.077128
x7_lag6               0.272212
x8_lag7               0.247424
x9_window_mean       -0.544488
x10_feriado           1.211790
x11_pandemia          5.479671
dtype: float64

rsquared:
0.9507290923856859

rsquared_adj:
0.9506057866731996

mse:
418.298074849508

rmse:
20.452336659890673

mae:
13.225927407467498


0,1,2,3
Dep. Variable:,OCORRENCIAS_ATENDIDAS,R-squared (uncentered):,0.951
Model:,OLS,Adj. R-squared (uncentered):,0.951
Method:,Least Squares,F-statistic:,7710.0
Date:,"Thu, 18 Aug 2022",Prob (F-statistic):,0.0
Time:,14:06:31,Log-Likelihood:,-21329.0
No. Observations:,4807,AIC:,42680.0
Df Residuals:,4795,BIC:,42760.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x0_final_de_semana,8.6446,0.700,12.346,0.000,7.272,10.017
x1_time,0.0013,0.000,4.780,0.000,0.001,0.002
x2_lag1,0.4442,0.034,12.999,0.000,0.377,0.511
x3_lag2,0.1833,0.034,5.322,0.000,0.116,0.251
x4_lag3,0.0583,0.034,1.691,0.091,-0.009,0.126
x5_lag4,0.1783,0.035,5.149,0.000,0.110,0.246
x6_lag5,0.0771,0.035,2.207,0.027,0.009,0.146
x7_lag6,0.2722,0.035,7.763,0.000,0.203,0.341
x8_lag7,0.2474,0.035,7.006,0.000,0.178,0.317

0,1,2,3
Omnibus:,5059.575,Durbin-Watson:,1.954
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2266378.383
Skew:,4.538,Prob(JB):,0.0
Kurtosis:,108.986,Cond. No.,15600.0


In [177]:
df_results6 = pd.DataFrame([y_model6, results6.predict(X_model6)]).transpose().reset_index()
df_results6['Data'] = df_gmc_ols_model['OCORRENCIA_DATA_SEM_HORARIO']
px.line(df_results6,x='Data',y=df_results6.columns[1:3])

## Model 7

In [65]:
df_gmc_ols_model.columns

Index(['OCORRENCIA_DATA_SEM_HORARIO', 'x0_final_de_semana',
       'OCORRENCIAS_ATENDIDAS', 'x1_time', 'x2_lag1', 'x3_lag2', 'x4_lag3',
       'x5_lag4', 'x6_lag5', 'x7_lag6', 'x8_lag7', 'x9_window_mean',
       'x10_feriado', 'x11_pandemia', 'x12_dia_semana'],
      dtype='object')

In [80]:
model7 = smf.ols(formula='OCORRENCIAS_ATENDIDAS ~ x0_final_de_semana + x1_time + x2_lag1 + x3_lag2 + x4_lag3 + x5_lag4 + x6_lag5 + x7_lag6 + x8_lag7 + x10_feriado + x11_pandemia + C(x12_dia_semana) - 1', data=df_gmc_ols_model)
results7 = model7.fit()
results7.summary()

0,1,2,3
Dep. Variable:,OCORRENCIAS_ATENDIDAS,R-squared:,0.791
Model:,OLS,Adj. R-squared:,0.791
Method:,Least Squares,F-statistic:,1136.0
Date:,"Thu, 18 Aug 2022",Prob (F-statistic):,0.0
Time:,13:59:27,Log-Likelihood:,-21234.0
No. Observations:,4807,AIC:,42500.0
Df Residuals:,4790,BIC:,42610.0
Df Model:,16,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
C(x12_dia_semana)[0],-2.0814,1.257,-1.656,0.098,-4.546,0.383
C(x12_dia_semana)[1],5.9482,1.257,4.733,0.000,3.484,8.412
C(x12_dia_semana)[2],7.9106,1.256,6.300,0.000,5.449,10.372
C(x12_dia_semana)[3],9.4980,1.258,7.551,0.000,7.032,11.964
C(x12_dia_semana)[4],10.0359,1.263,7.949,0.000,7.561,12.511
C(x12_dia_semana)[5],7.1188,0.675,10.553,0.000,5.796,8.441
C(x12_dia_semana)[6],2.1694,0.675,3.212,0.001,0.845,3.494
x0_final_de_semana,9.2882,0.757,12.268,0.000,7.804,10.773
x1_time,0.0008,0.000,2.816,0.005,0.000,0.001

0,1,2,3
Omnibus:,5286.191,Durbin-Watson:,2.013
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2561195.462
Skew:,4.907,Prob(JB):,0.0
Kurtosis:,115.654,Cond. No.,8.41e+18


## Outliers baseado em resíduos

In [89]:
df_results6.drop(columns=['index'],inplace=True)
df_results6.rename(columns={'Unnamed 0':'OCORRENCIAS_PREVISTAS'},inplace=True)
df_results6['OCORRENCIAS_PREVISTAS'] = df_results6['OCORRENCIAS_PREVISTAS'].astype(int)
df_results6

Unnamed: 0,OCORRENCIAS_ATENDIDAS,OCORRENCIAS_PREVISTAS,Data
0,30.0,1,2009-01-01
1,81.0,13,2009-01-02
2,96.0,50,2009-01-03
3,93.0,67,2009-01-04
4,59.0,68,2009-01-05
...,...,...,...
4802,166.0,154,2022-02-24
4803,143.0,156,2022-02-25
4804,141.0,158,2022-02-26
4805,143.0,150,2022-02-27


In [95]:
df_results6['ERRO'] = df_results6['OCORRENCIAS_PREVISTAS'] - df_results6['OCORRENCIAS_ATENDIDAS']
df_results6['Z-SCORE'] = (df_results6['ERRO'] - df_results6['ERRO'].mean())/df_results6['ERRO'].std()
df_results6['OUTLIER'] = df_results6['Z-SCORE'].apply(lambda x: 1 if abs(x) >= 2.17 else 0)
df_results6

Unnamed: 0,OCORRENCIAS_ATENDIDAS,OCORRENCIAS_PREVISTAS,Data,ERRO,Z-SCORE,OUTLIER
0,30.0,1,2009-01-01,-29.0,-1.370493,0
1,81.0,13,2009-01-02,-68.0,-3.277779,1
2,96.0,50,2009-01-03,-46.0,-2.201874,1
3,93.0,67,2009-01-04,-26.0,-1.223779,0
4,59.0,68,2009-01-05,9.0,0.487888,0
...,...,...,...,...,...,...
4802,166.0,154,2022-02-24,-12.0,-0.539112,0
4803,143.0,156,2022-02-25,13.0,0.683507,0
4804,141.0,158,2022-02-26,17.0,0.879126,0
4805,143.0,150,2022-02-27,7.0,0.390078,0


In [145]:
import plotly.express as px
import plotly.graph_objects as go

fig1 = px.line(df_results6, x='Data', y=['OCORRENCIAS_ATENDIDAS','OCORRENCIAS_PREVISTAS'])
fig1.data[0].line.color = 'rgba(135, 148, 168, .85)'
fig1.data[1].line.color = 'rgba(75, 83, 94, 1)'

fig2 = px.scatter(df_results6[df_results6['OUTLIER'] == 1], x='Data', y='OCORRENCIAS_ATENDIDAS', color_discrete_sequence=['rgba(75, 83, 94, 1)'])
fig2['data'][0]['showlegend'] = True
fig2['data'][0]['name'] = 'Outlier'

fig3 = go.Figure(data=fig1.data + fig2.data)
fig3.show()