In [2]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm, trange

In [3]:
# ICD-10 lookup table: one row per diagnosis code ('DIAGNOSIS_ID') with its
# upper-cased short description ('DISEASE'). It is left-joined onto the
# morbidity tables on 'DIAGNOSIS_ID' further down.
#
# An earlier variant built the same two columns from
# 'web_resources/ICD10/CIE10-VOLUMEN-2018-EXCEL.xlsx' (columns 'CIE10_X' and
# 'DESCRIPCION CIE'). Its result was overwritten immediately by the CSV-based
# table below, so that dead Excel load has been removed.
CO_IPRESS_df = (
    pd.read_csv('web_resources/ICD10/diagnosis.csv', encoding='utf_8')
    # .str.upper() propagates missing descriptions instead of raising on them.
    .assign(DISEASE=lambda frame: frame['ShortDescription'].str.upper())
    [['CodeWithSeparator', 'DISEASE']]
    .rename(columns={'CodeWithSeparator': 'DIAGNOSIS_ID'})
)

In [4]:
# Lookup keyed by the codes in 'CodeWithSeparator' that are exactly three
# characters long. Each value is the list of the remaining CSV columns in
# file order, with the upper-cased 'ShortDescription' appended last as
# 'DESCRIPCION'. Consumers index these lists by position.
# (A previous variant, now disabled, built the dict from the CIE10 2018
# Excel volume instead of diagnosis.csv.)
CO_IPRESS_dict = (
    pd.read_csv('web_resources/ICD10/diagnosis.csv', encoding='utf_8')
    .assign(DESCRIPCION=lambda frame: frame['ShortDescription'].apply(lambda text: text.upper()))
    .loc[lambda frame: frame['CodeWithSeparator'].str.len() == 3]
    .set_index('CodeWithSeparator')
    .T
    .to_dict('list')
)

# Lookup keyed by the 'letra' column of the wiki code sheet; each value is the
# list of the other columns of that row, again accessed by position.
CO_IPRESS_wikidict = (
    pd.read_excel('web_resources/ICD10/wiki_codes_diseases.xlsx')
    .set_index('letra')
    .T
    .to_dict('list')
)


In [4]:
def diagnosis_grouped(val):
    """Map a diagnosis code to an entry of ``CO_IPRESS_dict``.

    The first three characters of ``str(val)`` are used as the key and
    element 6 of the matching list is returned.
    NOTE(review): which CSV column sits at position 6 depends on the column
    order of diagnosis.csv -- confirm.

    Parameters
    ----------
    val : any
        Diagnosis code; it is converted with ``str`` before slicing.

    Returns
    -------
    The looked-up entry, or ``'UNIDENTIFIED'`` when the key is absent or the
    stored list has fewer than seven elements.
    """
    key = str(val)[:3]
    try:
        return CO_IPRESS_dict[key][6]
    except (KeyError, IndexError):
        # Only "not found" is a valid miss; anything else (for example the
        # lookup dict not being defined yet) must surface instead of being
        # silently mapped to 'UNIDENTIFIED'.
        return 'UNIDENTIFIED'

def diagnosis_grouped2(val):
    """Map a diagnosis code to an entry of ``CO_IPRESS_wikidict``.

    The first two characters of ``str(val)`` are used as the key and
    element 3 of the matching list is returned.
    NOTE(review): presumably element 3 is the disease-group name, since the
    result is stored in a 'DISEASE_GROUP' column -- confirm against the sheet.

    Parameters
    ----------
    val : any
        Diagnosis code; it is converted with ``str`` before slicing.

    Returns
    -------
    The looked-up entry, or ``'Unidentified'`` when the key is absent or the
    stored list has fewer than four elements.
    """
    key = str(val)[:2]
    try:
        return CO_IPRESS_wikidict[key][3]
    except (KeyError, IndexError):
        # Restrict the fallback to genuine lookup misses so that unrelated
        # failures (e.g. the lookup dict not being loaded) are not hidden.
        return 'Unidentified'

def to_CO_IPRESS(val):
    """Return element 6 of ``CO_IPRESS_dict[val]``.

    Unlike ``diagnosis_grouped``, ``val`` is used as the key unchanged
    (no string conversion, no truncation).

    Parameters
    ----------
    val : any
        Key to look up.

    Returns
    -------
    The looked-up entry, or ``'UNIDENTIFIED'`` when the key is absent, is
    unhashable, or the stored list has fewer than seven elements.
    """
    try:
        return CO_IPRESS_dict[val][6]
    except (KeyError, IndexError, TypeError):
        # TypeError covers unhashable keys, which the raw (un-stringified)
        # input can be; other errors are allowed to propagate.
        return 'UNIDENTIFIED'

def to_number(val):
    """Convert ``val`` to ``int``, falling back to 0.

    Parameters
    ----------
    val : any
        Value accepted by ``int()`` (e.g. a digit string or a number).

    Returns
    -------
    int
        ``int(val)`` when the conversion succeeds, otherwise 0. Strings that
        ``int()`` rejects (such as ``'12.0'`` or ``'abc'``), ``None``, NaN and
        infinities all yield 0.
    """
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable string or NaN; TypeError: None / unsupported
        # type; OverflowError: +/- infinity. Nothing else is swallowed.
        return 0

def category_label(val):
    """Collapse a facility category code into its level label.

    Returns 'I' for 'I-3'/'I-4', 'II' for 'II-1'/'II-2'/'II-E',
    'III' for 'III-1'/'III-2'/'III-E', and '0' for any other value.
    """
    level_codes = (
        ('I', ('I-3', 'I-4')),
        ('II', ('II-1', 'II-2', 'II-E')),
        ('III', ('III-1', 'III-2', 'III-E')),
    )
    for level, codes in level_codes:
        if val in codes:
            return level
    # Anything not listed above is reported as level '0'.
    return '0'

def category_sex(val):
    """Normalise a raw sex code to 'M', 'F' or '0'.

    '1', '01' and 'NE_0001' map to 'M'; '2', '02' and 'NE_0002' map to 'F';
    every other value (including non-string input) maps to '0'.
    """
    code_groups = (
        ('M', ('1', '01', 'NE_0001')),
        ('F', ('2', 'NE_0002', '02')),
    )
    for label, codes in code_groups:
        if val in codes:
            return label
    return '0'


### Reading resources

In [5]:
# Morbilidad B2: load every file of the B2 folder, normalise the columns and
# save the result as data/morbilidad_b2.pkl.
files = glob.glob("web_resources/morbidity/morbilidad_b2/*")

# Column dtypes forced while parsing. The served-people count is read as text
# and converted afterwards with to_number.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': str,
         'RAZON_SOC': str,
         'SEXO': str,
         'EDAD': str,
         'DIAGNOSTICO': str,
         'NU_TOTAL_ATENDIDOS': str,}

# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Not valid UTF-8: retry as Latin-1. Other read errors propagate.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

object_df = pd.concat(frames)

# Spanish source headers -> English names used by the rest of the notebook.
# In this dataset the 'DIAGNOSTICO' column becomes the code column 'DIAGNOSIS_ID'.
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'UBIGEO': 'UBIGEO',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'SECTOR': 'SECTOR',
                                      'CATEGORIA': 'CATEGORY',
                                      'CO_IPRESS': 'CO_IPRESS',
                                      'RAZON_SOC': 'NAME',
                                      'SEXO': 'SEX',
                                      'EDAD': 'AGE',
                                      'DIAGNOSTICO': 'DIAGNOSIS_ID',
                                      'NU_TOTAL_ATENDIDOS': 'QTY_PEOPLE_SERVED'})

# Unparsable counts become 0.
object_df['QTY_PEOPLE_SERVED'] = object_df['QTY_PEOPLE_SERVED'].apply(to_number)
# Attach the ICD-10 description; codes without a match get 'UNIDENTIFIED' below.
object_df = pd.merge(object_df, CO_IPRESS_df, how='left', on='DIAGNOSIS_ID')
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)
object_df['SEX'] = object_df['SEX'].apply(category_sex)
object_df["DISEASE"] = object_df["DISEASE"].fillna('UNIDENTIFIED')
object_df['DISEASE_GROUP'] = object_df['DIAGNOSIS_ID'].apply(diagnosis_grouped2)
# Build a 'YEAR-MONTH' string and let pandas parse it into a timestamp.
object_df['DATE'] = object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
object_df['DATE'] = pd.to_datetime(object_df['DATE'])

print('Guardando archivo...')
object_df.to_pickle('data/morbilidad_b2.pkl')


100%|██████████| 48/48 [03:59<00:00,  4.99s/it]


Guardando archivo...


In [6]:
# Morbilidad C2: load every file of the C2 folder, normalise the columns and
# save the result as data/morbilidad_c2.pkl.
files = glob.glob("web_resources/morbidity/morbilidad_c2/*")

# Column dtypes forced while parsing. The served-people count is read as text
# and converted afterwards with to_number. 'ID_DIGNOSTICO' is spelled as in
# the rename map below.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': str,
         'RAZON_SOC': str,
         'SEXO': str,
         'EDAD': str,
         'ID_DIGNOSTICO': str,
         'DIAGNOSTICO': str,
         'NU_TOTAL_ATENDIDOS': str,}

# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Not valid UTF-8: retry as Latin-1. Other read errors propagate.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

object_df = pd.concat(frames)

# Spanish source headers -> English names used by the rest of the notebook.
# Here the code comes from 'ID_DIGNOSTICO' and 'DIAGNOSTICO' is kept as 'DIAGNOSIS'.
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'UBIGEO': 'UBIGEO',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'SECTOR': 'SECTOR',
                                      'CATEGORIA': 'CATEGORY',
                                      'CO_IPRESS': 'CO_IPRESS',
                                      'RAZON_SOC': 'NAME',
                                      'SEXO': 'SEX',
                                      'EDAD': 'AGE',
                                      'ID_DIGNOSTICO': 'DIAGNOSIS_ID',
                                      'DIAGNOSTICO': 'DIAGNOSIS',
                                      'NU_TOTAL_ATENDIDOS': 'QTY_PEOPLE_SERVED'})

# Unparsable counts become 0.
object_df['QTY_PEOPLE_SERVED'] = object_df['QTY_PEOPLE_SERVED'].apply(to_number)
# NOTE: a district-name correction step (to_correct_district) used to be
# applied here but is disabled; that helper is not defined in this notebook.
# Attach the ICD-10 description; codes without a match get 'UNIDENTIFIED' below.
object_df = pd.merge(object_df, CO_IPRESS_df, how='left', on='DIAGNOSIS_ID')
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)
object_df['SEX'] = object_df['SEX'].apply(category_sex)
object_df["DISEASE"] = object_df["DISEASE"].fillna('UNIDENTIFIED')
object_df['DISEASE_GROUP'] = object_df['DIAGNOSIS_ID'].apply(diagnosis_grouped2)
# Build a 'YEAR-MONTH' string and let pandas parse it into a timestamp.
object_df['DATE'] = object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
object_df['DATE'] = pd.to_datetime(object_df['DATE'])
print('Guardando archivo...')

object_df.to_pickle('data/morbilidad_c2.pkl')

100%|██████████| 49/49 [01:24<00:00,  1.73s/it]


Guardando archivo...


In [7]:
# Morbilidad D2: load every file of the D2 folder, normalise the columns and
# save the result as data/morbilidad_d2.pkl.
files = glob.glob("web_resources/morbidity/morbilidad_d2/*")

# Column dtypes forced while parsing. The served-people count is read as text
# and converted afterwards with to_number.
# NOTE(review): the key below is spelled 'DIGNOSTICO' while the rename map
# further down uses 'DIAGNOSTICO'; at most one of them can match the real CSV
# header -- confirm against the files and align the two spellings.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': str,
         'RAZON_SOC': str,
         'SEXO': str,
         'EDAD': str,
         'ID_DIGNOSTICO': str,
         'DIGNOSTICO': str,
         'NU_TOTAL_ATENDIDOS': str,}

# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies all previously loaded rows on every iteration.
frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Not valid UTF-8: retry as Latin-1. Other read errors propagate.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

object_df = pd.concat(frames)

# Spanish source headers -> English names used by the rest of the notebook.
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'UBIGEO': 'UBIGEO',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'SECTOR': 'SECTOR',
                                      'CATEGORIA': 'CATEGORY',
                                      'CO_IPRESS': 'CO_IPRESS',
                                      'RAZON_SOC': 'NAME',
                                      'SEXO': 'SEX',
                                      'EDAD': 'AGE',
                                      'ID_DIGNOSTICO': 'DIAGNOSIS_ID',
                                      'DIAGNOSTICO': 'DIAGNOSIS',
                                      'NU_TOTAL_ATENDIDOS': 'QTY_PEOPLE_SERVED'})

# Unparsable counts become 0.
object_df['QTY_PEOPLE_SERVED'] = object_df['QTY_PEOPLE_SERVED'].apply(to_number)
# Attach the ICD-10 description; codes without a match get 'UNIDENTIFIED' below.
object_df = pd.merge(object_df, CO_IPRESS_df, how='left', on='DIAGNOSIS_ID')
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)
object_df['SEX'] = object_df['SEX'].apply(category_sex)
object_df["DISEASE"] = object_df["DISEASE"].fillna('UNIDENTIFIED')
object_df['DISEASE_GROUP'] = object_df['DIAGNOSIS_ID'].apply(diagnosis_grouped2)
# Build a 'YEAR-MONTH' string and let pandas parse it into a timestamp.
object_df['DATE'] = object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
object_df['DATE'] = pd.to_datetime(object_df['DATE'])
print('Guardando archivo...')

object_df.to_pickle('data/morbilidad_d2.pkl')

100%|██████████| 8/8 [00:14<00:00,  1.75s/it]


Guardando archivo...


### Combine all morbidity types into one table

In [3]:
# Facility table: code, coordinates and a public/private flag per facility.
# The facility code is read as text so it can be joined with the string
# 'CO_IPRESS' column of the morbidity tables.
types = {'COD_IPRESS': str,}

geo_temp = pd.read_csv('web_resources/geografico/RENIPRESS_2022_v4.csv', encoding='latin', dtype=types)
# .copy() makes the column subset an independent frame, so the column
# assignment below cannot trigger SettingWithCopyWarning.
geo_temp = geo_temp[['COD_IPRESS','NORTE','ESTE','INSTITUCION']].copy()
# Everything that is not exactly 'PRIVADO' (including missing values) is
# labelled 'PUBLICO'.
geo_temp['SECTOR_R'] = geo_temp['INSTITUCION'].apply(lambda x: x if x == 'PRIVADO' else 'PUBLICO')
del geo_temp['INSTITUCION']
# NOTE(review): 'NORTE' becomes 'x' and 'ESTE' becomes 'y'; confirm that this
# axis naming is what downstream consumers expect.
geo_temp = geo_temp.rename(columns={"NORTE": 'x', "ESTE":'y'})
geo_temp.to_csv('data/coordinates_hospital.csv')
geo_temp.head()

Unnamed: 0,COD_IPRESS,x,y,SECTOR_R
0,23013,,,PRIVADO
1,3978,-12.742162,-74.4427,PUBLICO
2,4114,-12.291517,-74.911738,PUBLICO
3,5895,-11.788095,-76.619818,PUBLICO
4,23192,,,PRIVADO


In [4]:
# Display the column names of the facility table built above.
geo_temp.columns

Index(['COD_IPRESS', 'x', 'y', 'SECTOR_R'], dtype='object')

In [5]:
# Pickle written by each morbidity cell, paired with the care-type label that
# is stored in the 'TYPE' column of the combined table.
paths = [
    ['morbilidad_b2.pkl', 'Outpatient Consultation'],
    ['morbilidad_c2.pkl', 'Emergency'],
    ['morbilidad_d2.pkl', 'Hospitalization']]

# Columns kept from every source table (others are dropped by .filter).
columns = [
    'DATE',
    'YEAR',
    'MONTH',
    'STATE',
    'PROVINCE',
    'DISTRICT',
    'SECTOR',
    'CATEGORY',
    'CATEGORY2',
    'CO_IPRESS',
    'NAME',
    'SEX',
    'AGE',
    'DIAGNOSIS_ID',
    'QTY_PEOPLE_SERVED',
    'DISEASE',
    'DISEASE_GROUP'
]

# Build the per-type frames first and concatenate once, instead of growing
# all_data inside the loop (which re-copies the accumulated rows each time).
frames = []
for filename, care_type in tqdm(paths):
    # These pickles are produced by the cells above; pd.read_pickle must not
    # be pointed at files from an untrusted source.
    data = pd.read_pickle('data/' + filename)
    data = data.filter(items=columns)
    # Left join keeps every morbidity row, with empty coordinates when the
    # facility code has no match in geo_temp.
    data = data.merge(geo_temp, left_on='CO_IPRESS', right_on='COD_IPRESS', how='left')
    data['TYPE'] = care_type
    frames.append(data)

all_data = pd.concat(frames)


100%|██████████| 3/3 [02:34<00:00, 51.46s/it]


In [6]:
# The facility-coordinate merge is already applied per source file in the
# loading loop, so the global merge stays disabled:
# all_data = all_data.merge(geo_temp, left_on='CO_IPRESS', right_on='COD_IPRESS', how='left')
# Persist the combined table in both pickle and CSV form.
all_data.to_pickle('data/morbilidad_global.pkl')
all_data.to_csv('data/morbilidad_global.csv')  

In [5]:
# Reload the combined table and keep only rows from 2018 onwards whose
# province and state are both 'LIMA'; then add an English sector flag
# ('PRIVATE' for 'PRIVADO', 'PUBLIC' for everything else) and save the subset.
all_data = (
    pd.read_pickle('data/morbilidad_global.pkl')
    .loc[lambda frame: (frame['YEAR'] >= 2018)
                       & (frame['PROVINCE'] == 'LIMA')
                       & (frame['STATE'] == 'LIMA')]
    .reset_index(drop=True)
    .assign(SECTOR2=lambda frame: frame['SECTOR'].map(
        lambda sector: 'PRIVATE' if sector == 'PRIVADO' else 'PUBLIC'))
)

all_data.to_pickle('data/morbilidad_global_lima.pkl')

In [6]:
# Preview the first rows of the Lima subset.
all_data.head()

Unnamed: 0,DATE,YEAR,MONTH,STATE,PROVINCE,DISTRICT,SECTOR,CATEGORY,CATEGORY2,CO_IPRESS,...,DIAGNOSIS_ID,QTY_PEOPLE_SERVED,DISEASE,DISEASE_GROUP,COD_IPRESS,x,y,SECTOR_R,TYPE,SECTOR2
0,2021-12-01,2021,12,LIMA,LIMA,LOS OLIVOS,PRIVADO,II-E,II,23151,...,N39.0,5,"URINARY TRACT INFECTION, SITE NOT SPECIFIED",Diseases of the genitourinary system,23151,-11.99116,-77.073203,PRIVADO,Outpatient Consultation,PRIVATE
1,2021-12-01,2021,12,LIMA,LIMA,LOS OLIVOS,PRIVADO,II-E,II,23151,...,N39.0,4,"URINARY TRACT INFECTION, SITE NOT SPECIFIED",Diseases of the genitourinary system,23151,-11.99116,-77.073203,PRIVADO,Outpatient Consultation,PRIVATE
2,2021-12-01,2021,12,LIMA,LIMA,LOS OLIVOS,PRIVADO,II-E,II,23151,...,N77.1,8,"VAGINITIS, VULVITIS AND VULVOVAGINITIS IN DIS ...",Diseases of the genitourinary system,23151,-11.99116,-77.073203,PRIVADO,Outpatient Consultation,PRIVATE
3,2021-12-01,2021,12,LIMA,LIMA,LOS OLIVOS,PRIVADO,II-E,II,23151,...,N39.0,4,"URINARY TRACT INFECTION, SITE NOT SPECIFIED",Diseases of the genitourinary system,23151,-11.99116,-77.073203,PRIVADO,Outpatient Consultation,PRIVATE
4,2021-12-01,2021,12,LIMA,LIMA,LOS OLIVOS,PRIVADO,II-E,II,23151,...,N39.0,5,"URINARY TRACT INFECTION, SITE NOT SPECIFIED",Diseases of the genitourinary system,23151,-11.99116,-77.073203,PRIVADO,Outpatient Consultation,PRIVATE
