In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm, trange

import matplotlib.pyplot as plt
import plotly.express as px
from difflib import SequenceMatcher

In [2]:
def save_new_agg(data, name, year=2020):
    """Count distinct CO_IPRESS codes per district and sector for Lima/Callao and save them.

    Rows are kept when they belong to Lima province of Lima state or to Callao
    state, and when their ``YEAR`` equals ``year``. Every ``SECTOR`` value other
    than ``'PRIVADO'`` is relabelled ``'PUBLICO'``. For each
    (``UBIGEO``, ``DISTRICT``) pair the number of distinct non-null ``CO_IPRESS``
    codes (presumably health-facility identifiers) is reported in one column per
    sector, plus a ``TOTAL`` column with the row-wise sum.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``STATE``, ``PROVINCE``, ``YEAR``, ``SECTOR``,
        ``UBIGEO``, ``DISTRICT`` and ``CO_IPRESS``.
    name : str or path-like
        Destination of the CSV file (written without the index).
    year : int, default 2020
        Value of ``YEAR`` to keep. The default reproduces the previously
        hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per (``UBIGEO``, ``DISTRICT``) with the sector counts and
        ``TOTAL``; sectors absent from a district are filled with 0.
    """
    group_keys = ['UBIGEO', 'DISTRICT', 'SECTOR']

    in_lima_callao = (
        ((data['PROVINCE'] == 'LIMA') & (data['STATE'] == 'LIMA'))
        | (data['STATE'] == 'CALLAO')
    )
    selected = data[in_lima_callao & (data['YEAR'] == year)].reset_index(drop=True)

    # Collapse every non-private sector label into a single 'PUBLICO' bucket.
    selected = selected.assign(
        SECTOR=np.where(selected['SECTOR'] == 'PRIVADO', 'PRIVADO', 'PUBLICO')
    )

    # Rows without a facility code cannot contribute to a distinct count; dropping
    # them first also keeps districts that only have such rows out of the result.
    counts = (
        selected.dropna(subset=['CO_IPRESS'])
        .groupby(group_keys)['CO_IPRESS']
        .nunique()
        .unstack('SECTOR')
    )

    # Take the sector labels from the pivoted result so TOTAL never references a
    # sector whose rows were all discarded by the grouping.
    sector_columns = list(counts.columns)

    data_temp = counts.reset_index().fillna(0)
    data_temp['TOTAL'] = data_temp[sector_columns].sum(axis=1)
    data_temp.to_csv(name, index=False)
    return data_temp

In [3]:
# Load the full morbidity table used by every cell below.
# NOTE(review): the path ends in .csv but is read with read_pickle, so the file is
# presumably a pickled DataFrame — confirm. Only unpickle files from a trusted
# source, because unpickling can execute arbitrary code.
all_data = pd.read_pickle('data/morbilidad_global.csv')

## Guardando resumen de datos de todo el Perú desde el 2018

In [4]:
# Preview the first three rows of the loaded table.
all_data.head(3)

Unnamed: 0,DATE,YEAR,MONTH,STATE,PROVINCE,DISTRICT,SECTOR,CATEGORY,CATEGORY2,CO_IPRESS,...,AGE,DIAGNOSIS_ID,QTY_PEOPLE_SERVED,DISEASE,DISEASE_GROUP,COD_IPRESS,x,y,SECTOR_R,TYPE
0,2021-12-01,2021,12,JUNIN,TARMA,TARMA,GOBIERNO REGIONAL,I-1,0,590,...,9,S70.0,1,CONTUSION OF HIP,"Injuries, poisoning and some other consequence...",590,-11.449408,-75.664042,PUBLICO,Outpatient Consultation
1,2021-12-01,2021,12,JUNIN,TARMA,TARMA,GOBIERNO REGIONAL,I-1,0,590,...,9,N39.0,1,"URINARY TRACT INFECTION, SITE NOT SPECIFIED",Diseases of the genitourinary system,590,-11.449408,-75.664042,PUBLICO,Outpatient Consultation
2,2021-12-01,2021,12,JUNIN,TARMA,TARMA,GOBIERNO REGIONAL,I-1,0,590,...,9,M54.5,1,LOW BACK PAIN,Diseases of the musculoskeletal system and con...,590,-11.449408,-75.664042,PUBLICO,Outpatient Consultation


In [5]:
# List the column names available in the loaded table.
all_data.columns

Index(['DATE', 'YEAR', 'MONTH', 'STATE', 'PROVINCE', 'DISTRICT', 'SECTOR',
       'CATEGORY', 'CATEGORY2', 'CO_IPRESS', 'NAME', 'SEX', 'AGE',
       'DIAGNOSIS_ID', 'QTY_PEOPLE_SERVED', 'DISEASE', 'DISEASE_GROUP',
       'COD_IPRESS', 'x', 'y', 'SECTOR_R', 'TYPE'],
      dtype='object')

In [7]:
# National summary for 2019: for each CATEGORY, count the distinct CO_IPRESS codes
# in every SECTOR_R value and write the table to results/qty_cat_peru.csv.
data_temp = all_data.loc[all_data['YEAR'] == 2019].reset_index(drop=True)
# SECTOR_R is used as stored (it is not re-derived from SECTOR in this cell);
# rows where it is missing are discarded.
data_temp = data_temp.loc[data_temp['SECTOR_R'].notna()]
columns_ = data_temp['SECTOR_R'].unique()

data_temp = (
    data_temp
    # First pass: one row per (CATEGORY, SECTOR_R, CO_IPRESS) combination.
    .groupby(['CATEGORY', 'SECTOR_R', 'CO_IPRESS'])
    .agg({'QTY_PEOPLE_SERVED': "sum"})
    .reset_index()
    # Second pass: number of those rows per (CATEGORY, SECTOR_R).
    .groupby(['CATEGORY', 'SECTOR_R'])
    .agg({'CO_IPRESS': "count"})
    .reset_index()
    .pivot(index=['CATEGORY'], columns='SECTOR_R')
)
# The pivot leaves a two-level column index; keep only the sector level.
data_temp.columns = data_temp.columns.droplevel()
data_temp = data_temp.reset_index().fillna(0)
data_temp['TOTAL'] = data_temp[columns_].sum(axis=1)
data_temp.to_csv('results/qty_cat_peru.csv', index=False)

In [8]:
# Preview the first rows of the summary frame built in the preceding code cell.
data_temp.head()

SECTOR_R,CATEGORY,PRIVADO,PUBLICO,TOTAL
0,0,19,56,75
1,I-1,1,1736,1737
2,I-2,17,994,1011
3,I-3,44,564,608
4,I-4,9,159,168


## Guardando resumen de datos de Lima desde el 2018

In [9]:
# Lima/Callao summary for 2019: for each CATEGORY, count the distinct CO_IPRESS
# codes per sector and write the table to results/qty_cat_lim_callao.csv.
data_temp = all_data
data_temp = data_temp[data_temp['YEAR']==2019].reset_index(drop=True)
# Rows without a stored SECTOR_R are dropped here, even though SECTOR_R is
# recomputed from SECTOR a few lines below.
data_temp = data_temp[data_temp['SECTOR_R'].notna()]
# Keep Lima province of Lima state, plus all of Callao state. The explicit .copy()
# makes the filtered frame independent, so the column assignment that follows does
# not write into a slice of another frame (no SettingWithCopyWarning).
data_temp = data_temp[((data_temp['PROVINCE']=='LIMA') & (data_temp['STATE']=='LIMA')) |
                      (data_temp['STATE']=='CALLAO')].copy()
# Every sector other than 'PRIVADO' is relabelled 'PUBLICO'.
# NOTE(review): the national summary cell uses the stored SECTOR_R instead of
# recomputing it — confirm that the difference is intended.
data_temp['SECTOR_R'] = data_temp['SECTOR'].apply(lambda x: x if x == 'PRIVADO' else 'PUBLICO')
columns_ = data_temp['SECTOR_R'].unique()
# First pass: one row per (CATEGORY, SECTOR_R, CO_IPRESS) combination.
data_temp = data_temp.groupby(['CATEGORY','SECTOR_R','CO_IPRESS']).agg({ 'QTY_PEOPLE_SERVED': "sum"}).reset_index()
# Second pass: number of those rows per (CATEGORY, SECTOR_R).
data_temp = data_temp.groupby(['CATEGORY','SECTOR_R']).agg({ 'CO_IPRESS': "count"}).reset_index()
data_temp = data_temp.pivot(index =['CATEGORY'], columns ='SECTOR_R')
# The pivot leaves a two-level column index; keep only the sector level.
data_temp.columns = data_temp.columns.droplevel()
# Sectors absent from a category become 0 instead of NaN.
data_temp = data_temp.reset_index().fillna(0)
data_temp['TOTAL'] = data_temp[columns_].sum(axis=1)
data_temp.to_csv('results/qty_cat_lim_callao.csv', index = False)

## Categoría, sexo, diagnósticos

In [10]:
# Preview the first rows of the summary frame built in the preceding code cell.
data_temp.head()

SECTOR_R,CATEGORY,PRIVADO,PUBLICO,TOTAL
0,0,11.0,4.0,15.0
1,I-1,1.0,0.0,1.0
2,I-2,11.0,48.0,59.0
3,I-3,23.0,36.0,59.0
4,I-4,1.0,7.0,8.0


In [12]:
# Start again from the full table and keep only the 2019 rows.
data_temp = all_data
data_temp = data_temp[data_temp['YEAR']==2019].reset_index(drop=True)
# Keep Lima province of Lima state, plus all of Callao state.
data_temp = data_temp[((data_temp['PROVINCE']=='LIMA') & (data_temp['STATE']=='LIMA')) |
                      (data_temp['STATE']=='CALLAO')]
