In [1]:
# !pip install plotly
# !pip install matplotlib==3.5.0

In [2]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm, trange

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
import plotly.express as px
from difflib import SequenceMatcher
import seaborn as sns

In [3]:
def millions(x, pos):
    """Format a tick value in millions with one decimal place.

    The two arguments are the tick value and the tick position; ``pos`` is
    accepted but not used.
    """
    scaled = x * 1e-6
    return f'{scaled:1.1f}M'


In [4]:
# NOTE(review): the path ends in .csv but is loaded with read_pickle, so the
# file is expected to hold a pickled object rather than CSV text — confirm and
# consider renaming it. Unpickling can execute arbitrary code; only load files
# that come from a trusted source.
all_data = pd.read_pickle('data/morbilidad_global.csv')

In [5]:
# Keep records from 2018 onwards. The explicit copy makes the column
# assignments below operate on an independent frame rather than on a slice of
# the loaded one (avoids pandas' SettingWithCopyWarning).
# To also cap the range, add: & (all_data['YEAR'] < 2022)
all_data = all_data[all_data['YEAR'] >= 2018].copy()

# Flag the rows whose province is LIMA.
all_data['IS_LIMA'] = all_data['PROVINCE'] == 'LIMA'

# Translate the sector label: 'PRIVADO' becomes 'PRIVATE' and every other value
# becomes 'PUBLIC'. 'PRIVATE' is accepted as well so that running this cell a
# second time does not relabel already-translated rows as 'PUBLIC'.
all_data['SECTOR'] = np.where(
    all_data['SECTOR'].isin(['PRIVADO', 'PRIVATE']), 'PRIVATE', 'PUBLIC'
)

all_data.head()

Unnamed: 0,DATE,YEAR,MONTH,STATE,PROVINCE,DISTRICT,SECTOR,CATEGORY,CATEGORY2,CO_IPRESS,...,DIAGNOSIS_ID,QTY_PEOPLE_SERVED,DISEASE,DISEASE_GROUP,COD_IPRESS,x,y,SECTOR_R,TYPE,IS_LIMA
0,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R10.2,10,PELVIC AND PERINEAL PAIN,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
1,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R10.4,11,UNIDENTIFIED,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
2,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R11.X,3,UNIDENTIFIED,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
3,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R16.0,1,"HEPATOMEGALY, NOT ELSEWHERE CLASSIFIED","Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
4,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R42.X,1,UNIDENTIFIED,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True


In [6]:
# Sorted unique disease groups; each one gets a fixed colour below.
diagnostics_tag = np.unique(all_data['DISEASE_GROUP'])

# Build the palette from two qualitative colormaps: 8 colours from 'tab20c'
# (samples 8..15 of 20) followed by all 20 samples of 'tab20b'.
# plt.get_cmap is used because cm.get_cmap was deprecated in Matplotlib 3.7 and
# removed in 3.9; it accepts the same (name, lut) arguments.
cmap = plt.get_cmap('tab20b', 256)
colorsb = cmap(np.linspace(0, 1, 20))
cmap = plt.get_cmap('tab20c', 256)
colorsc = cmap(np.linspace(0, 1, 20))[8:-4]

colors = list(colorsc) + list(colorsb)

# Map each disease group to a hex colour. The index wraps around so that having
# more groups than palette entries reuses colours instead of raising IndexError.
dict_colors = {
    group: matplotlib.colors.rgb2hex(colors[position % len(colors)])
    for position, group in enumerate(diagnostics_tag)
}

### Lima and Peru

In [7]:
# Keep only the rows flagged as belonging to LIMA province.
lima_mask = all_data['IS_LIMA'].eq(True)
all_data = all_data.loc[lima_mask]

In [8]:
# Preview the first rows of the frame after the Lima filter.
all_data.head()

Unnamed: 0,DATE,YEAR,MONTH,STATE,PROVINCE,DISTRICT,SECTOR,CATEGORY,CATEGORY2,CO_IPRESS,...,DIAGNOSIS_ID,QTY_PEOPLE_SERVED,DISEASE,DISEASE_GROUP,COD_IPRESS,x,y,SECTOR_R,TYPE,IS_LIMA
0,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R10.2,10,PELVIC AND PERINEAL PAIN,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
1,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R10.4,11,UNIDENTIFIED,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
2,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R11.X,3,UNIDENTIFIED,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
3,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R16.0,1,"HEPATOMEGALY, NOT ELSEWHERE CLASSIFIED","Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True
4,2018-01-01,2018,1,LIMA,LIMA,LIMA,PRIVATE,II-2,II,15610,...,R42.X,1,UNIDENTIFIED,"Abnormal clinical and laboratory symptoms, sig...",15610,-12.058367,-77.038362,PRIVADO,Outpatient Consultation,True


### Forecast

In [17]:
import plotly.io as pio 
import plotly.graph_objects as go

# NOTE(review): star import — the forecasting cells below call setup,
# plot_model, compare_models, create_model, tune_model, finalize_model and
# predict_model, which presumably all come from here. The recorded run of this
# cell failed with ModuleNotFoundError, so the installed pycaret apparently
# lacked the time_series module — confirm the required pycaret version.
from pycaret.time_series import *

ModuleNotFoundError: No module named 'pycaret.time_series'

In [18]:
#!pip install pycaret-ts-alpha
#!pip install pycaret[full]



In [19]:
import pycaret as pyct

In [None]:
# Value of the CATEGORY2 column used to select the records that feed the forecast.
CATEGORIA_STUDIO = 'I'

In [None]:
# Restrict to the category under study and total the people served per date
# and sector (long format: one row per DATE/SECTOR pair).
category_rows = all_data[all_data['CATEGORY2'] == CATEGORIA_STUDIO]
monthly_by_sector = (
    category_rows
    .groupby(['DATE', 'SECTOR'], as_index=False)
    .agg({'QTY_PEOPLE_SERVED': 'sum'})
)

# April 2018 is overwritten with a median value (presumably because that month
# is anomalous in the source data — TODO confirm). The median is taken per
# sector: a single median pooled over both sectors' rows would impute each
# sector with a value drawn from the mixed PRIVATE/PUBLIC distribution.
is_april_2018 = (
    (monthly_by_sector['DATE'].dt.year == 2018)
    & (monthly_by_sector['DATE'].dt.month == 4)
)
sector_median = (
    monthly_by_sector
    .groupby('SECTOR')['QTY_PEOPLE_SERVED']
    .transform('median')
)
monthly_by_sector['QTY_PEOPLE_SERVED'] = (
    monthly_by_sector['QTY_PEOPLE_SERVED'].mask(is_april_2018, sector_median)
)

# Wide format: one row per date, one column per sector; date/sector pairs with
# no records become 0.
data_temp = (
    monthly_by_sector
    .pivot_table(index=['DATE'],
                 columns=['SECTOR'],
                 values='QTY_PEOPLE_SERVED')
    .fillna(0)
    .reset_index()
)

In [None]:
def _zero_outside_band(values, lower, upper):
    """Return ``values`` with entries below ``lower`` or above ``upper`` replaced by 0."""
    out_of_band = (values < lower) | (values > upper)
    return values.mask(out_of_band, 0)


# One-standard-deviation band around the mean of the PRIVATE column.
private_mean, private_std = data_temp['PRIVATE'].mean(), data_temp['PRIVATE'].std()
private_max = private_mean + private_std
private_min = private_mean - private_std
data_temp['PRIVATE_PURE'] = _zero_outside_band(data_temp['PRIVATE'],
                                               private_min, private_max)

# Same band and replacement for the PUBLIC column.
public_mean, public_std = data_temp['PUBLIC'].mean(), data_temp['PUBLIC'].std()
public_max = public_mean + public_std
public_min = public_mean - public_std
data_temp['PUBLIC_PURE'] = _zero_outside_band(data_temp['PUBLIC'],
                                              public_min, public_max)



In [None]:
# Split the table on the '2020-03' cutoff: rows dated after it are
# post-pandemic, all remaining rows are pre-pandemic.
after_cutoff = data_temp['DATE'] > '2020-03'
post_pandemic_df = data_temp[after_cutoff]
pre_pandemic_df = data_temp[~after_cutoff]

post_pandemic_df

#### PRIVATE

In [None]:
# Pre-pandemic PRIVATE values as a date-indexed series, plotted for inspection.
pre_private = pre_pandemic_df[['DATE', 'PRIVATE']]
index = pre_private['DATE'].values
data_to = pre_private['PRIVATE'].values
df_series_pre = pd.Series(data_to, index=index)
df_series_pre.plot()

In [None]:
# Post-pandemic PRIVATE values as a date-indexed series, plotted for inspection.
post_private = post_pandemic_df[['DATE', 'PRIVATE']]
index = post_private['DATE'].values
data_to = post_private['PRIVATE'].values
df_series = pd.Series(data_to, index=index)
df_series.plot()

In [None]:

# Initialise the PyCaret time-series experiment on df_series with a forecast
# horizon of one step and seven cross-validation folds. A fixed session_id
# seeds the experiment so that reruns are repeatable.
setup(data=df_series, fh=1, fold=7, session_id=42)

# Diagnostic plots of the series, rendered as a static PNG.
plot_model(plot='diagnostics',
           fig_kwargs={'renderer': 'png'})

In [None]:
# STL decomposition plot of the series registered by the preceding setup call,
# rendered as a static PNG.
plot_model(plot = 'decomp_stl', 
           fig_kwargs={'renderer': 'png'})

In [None]:
# Compare the available forecasters, sorted by MAE, and keep what
# compare_models returns. errors='raise' makes a failing model raise instead
# of being skipped.
best_baseline_models = compare_models( sort='MAE', errors = 'raise', verbose =False) # Mean absolute error
best_baseline_models

In [None]:
# Optional override: uncomment to force a specific estimator instead of the
# compare_models winner.
#best_baseline_models = 'auto_arima'
# Build the selected model and show its configuration.
the_model = create_model(best_baseline_models)
print(the_model)

# Tune the model's hyperparameters and show the tuned configuration.
# NOTE(review): tuned_model is not used by any later cell visible here.
tuned_model= tune_model(the_model)
print(tuned_model)

In [None]:
# Finalize the (untuned) model and request a one-step forecast.
# NOTE(review): tuned_model from the previous cell is ignored here — confirm
# that finalizing the_model rather than tuned_model is intended.
final_best = finalize_model(the_model)
predict_model(final_best, fh = 1)

# Forecast plot over a 10-step horizon, rendered as a static PNG.
# NOTE(review): the plot uses the_model and fh=10, while the prediction above
# uses final_best and fh=1 — confirm the mismatch is intended.
abc = plot_model(the_model, 
           plot = 'forecast', 
           data_kwargs = {'fh' : 10}, 
           fig_kwargs={'renderer': 'png'})

In [None]:
# Mean of the pre-pandemic series, shown as the cell output.
df_series_pre.mean()

In [None]:
# Start a new experiment on the pre-pandemic series. A fixed session_id seeds
# the experiment so that reruns are repeatable.
setup(data=df_series_pre, session_id=42)

# Compare the available forecasters sorted by mean absolute error (MAE);
# errors='raise' makes a failing model raise instead of being skipped.
best_baseline_models = compare_models(sort='MAE', errors='raise', verbose=False)
print(best_baseline_models)

In [None]:
# Build the model selected by compare_models and show its configuration.
the_model = create_model(best_baseline_models)
print(the_model)

# Tune the model's hyperparameters and show the tuned configuration.
# NOTE(review): tuned_model is not used by any later cell visible here.
tuned_model= tune_model(the_model)
print(tuned_model)

In [None]:
# Finalize the (untuned) model and request a five-step forecast.
# NOTE(review): tuned_model from the previous cell is ignored here — confirm
# that finalizing the_model rather than tuned_model is intended.
final_best = finalize_model(the_model)
predict_model(final_best, fh = 5)

# Forecast plot over the same five-step horizon, rendered as a static PNG.
plot_model(the_model, 
           plot = 'forecast', 
           data_kwargs = {'fh' : 5}, 
           fig_kwargs={'renderer': 'png'})

#### PUBLIC

In [None]:
# Post-pandemic PUBLIC values as a date-indexed series.
post_public = post_pandemic_df[['DATE', 'PUBLIC']]
index = post_public['DATE'].values
data_to = post_public['PUBLIC'].values
df_series = pd.Series(data_to, index=index)

# Pre-pandemic PUBLIC values; `index` and `data_to` are left holding the
# pre-pandemic arrays after this cell.
pre_public = pre_pandemic_df[['DATE', 'PUBLIC']]
index = pre_public['DATE'].values
data_to = pre_public['PUBLIC'].values
df_series_pre = pd.Series(data_to, index=index)

df_series.plot()

In [None]:
# Print the mean of the pre-pandemic series.
pre_mean = df_series_pre.mean()
print("Media: " + str(pre_mean))

In [None]:
# Re-import of the PyCaret time-series API, kept as in the original cell.
from pycaret.time_series import *

# Start a new experiment on the series held in df_series. A fixed session_id
# seeds the experiment so that reruns are repeatable.
setup(data=df_series, session_id=42)

# Diagnostic plots of the series, rendered as a static PNG.
plot_model(plot='diagnostics',
           fig_kwargs={'renderer': 'png'})

In [None]:
# Compare the available forecasters, sorted by MAE, and keep what
# compare_models returns. errors='raise' makes a failing model raise instead
# of being skipped.
best_baseline_models = compare_models(sort='MAE', errors = 'raise', verbose =False) # Mean absolute error
best_baseline_models

In [None]:
# Optional override: uncomment to force a specific estimator instead of the
# compare_models winner.
#best_baseline_models = 'auto_arima'
# Build the selected model and show its configuration.
the_model = create_model(best_baseline_models)
print(the_model)

# Tune the model's hyperparameters and show the tuned configuration.
# NOTE(review): tuned_model is not used by any later cell visible here.
tuned_model= tune_model(the_model)
print(tuned_model)

In [None]:
# Finalize the model and request a five-step forecast. Only the_model is
# finalized: a preceding finalize_model(best_baseline_models) call had its
# result overwritten on the very next line and only cost an extra refit.
final_best = finalize_model(the_model)
predict_model(final_best, fh=5)

# Forecast plot over the same five-step horizon, rendered as a static PNG.
plot_model(the_model,
           plot='forecast',
           data_kwargs={'fh': 5},
           fig_kwargs={'renderer': 'png'})

In [None]:
# Observed values followed by the five forecast steps.
# df_series is the series the current experiment (and therefore final_best) was
# set up on. The leftover `data_to` array holds the pre-pandemic values at this
# point, so using it would join a history and a forecast that do not line up.
forecast = predict_model(final_best, fh=5)
x = list(df_series.values) + list(forecast['y_pred'])
x

In [None]:
# Show up to the first 50 rows of the post-pandemic table.
post_pandemic_df.head(50)