In [89]:
import sklearn as skl
import pandas as pd
import numpy as np

%matplotlib inline

In [90]:
df_156 = pd.read_hdf('../../data/processed/solicitacoes156.h5', key='data', format="table")
df_plu = pd.read_hdf('../../data/processed/pluviometrico.h5', key='data', format="table")

In [91]:
filtered_df = df_156[df_156['SERVICO_DESCRICAO'] == 'QUEDA DE ARVORE']
group_df = filtered_df.groupby('DATA_DEMANDA')[['SERVICO_CODIGO']]
count_df = group_df.count().reset_index()
count_df.columns = ['DATA', 'QUEDAS']
df = df_plu.merge(count_df, on='DATA')

In [92]:
month_df = pd.get_dummies(df['DATA'].dt.month)
month_df.columns = ['m' + str(i) for i in range(1, 13)]

day_df = pd.get_dummies(df['DATA'].dt.day)
day_df.columns = ['d' + str(i) for i in range(1, 32)]

df = pd.concat([day_df, month_df, df[['INDICE', 'QUEDAS']]], axis=1)

In [93]:
from sklearn.neighbors import KNeighborsRegressor
skl.utils.shuffle(df)

split = int(df.shape[0] * 0.7)
train, test = df.iloc[:split,:], df.iloc[split:,:]

clf = KNeighborsRegressor(
    weights='distance',
    n_neighbors=5)

In [100]:
clf.fit(train.iloc[:,:-1], train.iloc[:,-1])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=None, n_neighbors=5, p=2,
          weights='distance')

In [101]:
clf.score(test.iloc[:,:-1], test.iloc[:,-1])

-0.013196116581627448

In [102]:
skl.metrics.mean_absolute_error(clf.predict(test.iloc[:,:-1]), test.iloc[:,-1])

1.89753285953115

In [97]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [98]:
formula = 'QUEDAS ~ d1 + d2 + d3 + d4 + d5 + d6 + d7 + d8 + d9 + d10 + d11 + d12 + d13 + d14 + d15 + d16 + d17 + d18 + d19 + d20 + d21 + d22 + d23 + d24 + d25 + d26 + d27 + d28 + d29 + d30 + d31 + m1 + m2 + m3 + m4 + m5 + m6 + m7 + m8 + m9 + m10 + m11 + m12 + INDICE'

In [99]:
model = smf.ols(formula,data=df)
model.fit().summary()

0,1,2,3
Dep. Variable:,QUEDAS,R-squared:,0.068
Model:,OLS,Adj. R-squared:,0.035
Method:,Least Squares,F-statistic:,2.051
Date:,"Thu, 29 Nov 2018",Prob (F-statistic):,0.000109
Time:,03:52:52,Log-Likelihood:,-3809.5
No. Observations:,1229,AIC:,7705.0
Df Residuals:,1186,BIC:,7925.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.6596,0.161,10.340,0.000,1.345,1.974
d1,0.1556,0.999,0.156,0.876,-1.805,2.116
d2,-0.1384,0.797,-0.174,0.862,-1.702,1.425
d3,-0.2523,0.874,-0.289,0.773,-1.966,1.462
d4,0.6704,0.863,0.777,0.437,-1.023,2.364
d5,0.2124,0.842,0.252,0.801,-1.440,1.865
d6,-0.3905,0.814,-0.480,0.632,-1.988,1.207
d7,0.0504,0.875,0.058,0.954,-1.667,1.767
d8,-0.0476,0.885,-0.054,0.957,-1.784,1.689

0,1,2,3
Omnibus:,2854.308,Durbin-Watson:,2.056
Prob(Omnibus):,0.0,Jarque-Bera (JB):,17207065.874
Skew:,21.418,Prob(JB):,0.0
Kurtosis:,581.088,Cond. No.,1.2e+17
