In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm, trange

In [2]:
# ICD-10 diagnosis catalogue: keeps the code column (renamed DIAGNOSIS_ID)
# and an upper-cased copy of the short description (DISEASE).
CO_IPRESS_df = pd.read_csv('web_resources/ICD10/diagnosis.csv', encoding='utf_8')
# .str.upper() leaves a missing description as NaN instead of raising
# AttributeError the way calling .upper() on a float NaN would.
CO_IPRESS_df['DISEASE'] = CO_IPRESS_df['ShortDescription'].str.upper()
CO_IPRESS_df = (
    CO_IPRESS_df[['CodeWithSeparator', 'DISEASE']]
    .rename(columns={'CodeWithSeparator': 'DIAGNOSIS_ID'})
)
# NOTE(review): this frame is left-merged on DIAGNOSIS_ID further down; if the
# catalogue can contain repeated codes the merge will duplicate rows -- confirm.

# Diagnosis groups: dict keyed by the 'letra' column, each value being the list
# of the remaining columns of that spreadsheet row.
CO_IPRESS_wikidict = pd.read_excel('web_resources/ICD10/wiki_codes_diseases.xlsx')
CO_IPRESS_wikidict = CO_IPRESS_wikidict.set_index('letra').T.to_dict('list')

def diagnosis_grouped(val, lookup=None):
    """Map a diagnosis code to its disease-group label.

    The first two characters of ``str(val)`` are used as the key into the
    group table; the label is the element at index 3 of the matching entry.

    Parameters
    ----------
    val : any
        Diagnosis code; it is converted with ``str`` before slicing, so
        missing values simply fail the lookup.
    lookup : dict, optional
        Mapping from two-character prefix to a sequence whose index 3 holds
        the group label. Defaults to the module-level ``CO_IPRESS_wikidict``.

    Returns
    -------
    The group label, or ``'Unidentified'`` when the prefix is not in the
    table or the entry has fewer than four elements.
    """
    if lookup is None:
        lookup = CO_IPRESS_wikidict
    prefix = str(val)[:2]
    try:
        return lookup[prefix][3]
    except (KeyError, IndexError):
        # Only lookup misses are treated as "unknown"; anything else
        # (e.g. KeyboardInterrupt, a missing table) should surface.
        return 'Unidentified'

In [3]:
def to_number(val):
    """Convert ``val`` to ``int``, falling back to 0 when it cannot be parsed.

    Parameters
    ----------
    val : any
        Value passed straight to ``int()`` (strings such as ``'12'``, numbers,
        ``None``, NaN, ...).

    Returns
    -------
    int
        ``int(val)`` on success; 0 when ``int()`` raises ``TypeError``,
        ``ValueError`` or ``OverflowError`` (e.g. ``None``, ``'abc'``,
        ``'3.5'``, NaN, infinity).
    """
    try:
        return int(val)
    except (TypeError, ValueError, OverflowError):
        # Deliberately not a bare except: a Ctrl-C during a long .apply()
        # must interrupt the run instead of being recorded as 0.
        return 0

def category_label(val):
    """Collapse a facility category code into its level label.

    Returns 'I' for 'I-3'/'I-4', 'II' for 'II-1'/'II-2'/'II-E',
    'III' for 'III-1'/'III-2'/'III-E', and '0' for any other value.
    """
    levels = (
        ('I', ('I-3', 'I-4')),
        ('II', ('II-1', 'II-2', 'II-E')),
        ('III', ('III-1', 'III-2', 'III-E')),
    )
    for level, codes in levels:
        if val in codes:
            return level
    # Anything not listed above (including missing values) maps to '0'.
    return '0'

def category_sex(val):
    """Normalise a raw sex code to 'M', 'F' or '0'.

    '1', '01' and 'NE_0001' map to 'M'; '2', '02' and 'NE_0002' map to 'F';
    every other value (including non-string input) maps to '0'.
    """
    male_codes = ('1', '01', 'NE_0001')
    female_codes = ('2', 'NE_0002', '02')
    if val in male_codes:
        return 'M'
    return 'F' if val in female_codes else '0'


### Reading resources

In [4]:
# Morbilidad B2
# Reads every file under morbilidad_b2/, normalises column names, enriches the
# rows with disease / category / sector labels and pickles the result.
files = glob.glob("web_resources/morbidity/morbilidad_b2/*")

# dtypes forced at read time; the count column is read as text and converted
# with to_number() afterwards so unparsable values become 0 instead of failing.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': str,
         'RAZON_SOC': str,
         'SEXO': str,
         'EDAD': str,
         'DIAGNOSTICO': str,
         'NU_TOTAL_ATENDIDOS': str,}

# Collect the per-file frames and concatenate once: concatenating inside the
# loop re-copies everything read so far on every iteration.
frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Only a decoding failure justifies retrying with the latin codec.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

object_df = pd.concat(frames) if frames else pd.DataFrame()

# Spanish source headers -> English names used by the rest of the notebook.
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'CATEGORIA': 'CATEGORY',
                                      'RAZON_SOC': 'NAME',
                                      'SEXO': 'SEX',
                                      'EDAD': 'AGE',
                                      'DIAGNOSTICO': 'DIAGNOSIS_ID',
                                      'NU_TOTAL_ATENDIDOS': 'QTY_PEOPLE_SERVED'})

object_df['QTY_PEOPLE_SERVED'] = object_df['QTY_PEOPLE_SERVED'].apply(to_number)
# Left merge keeps rows whose code is absent from the ICD-10 catalogue;
# their DISEASE is filled with 'UNIDENTIFIED' below.
object_df = pd.merge(object_df, CO_IPRESS_df, how='left', on='DIAGNOSIS_ID')
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)
object_df['SEX'] = object_df['SEX'].apply(category_sex)
object_df["DISEASE"] = object_df["DISEASE"].fillna('UNIDENTIFIED')
object_df['DISEASE_GROUP'] = object_df['DIAGNOSIS_ID'].apply(diagnosis_grouped)
# DATE is built from a 'YEAR-MONTH' string and parsed by pandas.
object_df['DATE'] = object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
object_df['DATE'] = pd.to_datetime(object_df['DATE'])
# Every sector other than 'PRIVADO' is labelled PUBLIC.
object_df['SECTOR_R'] = object_df['SECTOR'].apply(lambda x: 'PRIVATE' if x == 'PRIVADO' else 'PUBLIC')
# Drop rows with no attended people (this also removes counts that failed to parse).
object_df = object_df[object_df['QTY_PEOPLE_SERVED'] >= 1].reset_index(drop=True)
print('Guardando archivo...')
object_df.to_pickle('data_preprocessed/morbilidad_b2.pkl')


100%|██████████| 53/53 [06:53<00:00,  7.81s/it]


Guardando archivo...


In [5]:
# Morbilidad C2
# Reads every file under morbilidad_c2/, normalises column names, enriches the
# rows with disease / category / sector labels and pickles the result.
files = glob.glob("web_resources/morbidity/morbilidad_c2/*")

# dtypes forced at read time; the count column is read as text and converted
# with to_number() afterwards so unparsable values become 0 instead of failing.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': str,
         'RAZON_SOC': str,
         'SEXO': str,
         'EDAD': str,
         'ID_DIGNOSTICO': str,
         'DIAGNOSTICO': str,
         'NU_TOTAL_ATENDIDOS': str,}

# Collect the per-file frames and concatenate once: concatenating inside the
# loop re-copies everything read so far on every iteration.
frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Only a decoding failure justifies retrying with the latin codec.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

object_df = pd.concat(frames) if frames else pd.DataFrame()

# Spanish source headers -> English names used by the rest of the notebook.
# ('ID_DIGNOSTICO' is the spelling this cell expects in the source files.)
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'CATEGORIA': 'CATEGORY',
                                      'RAZON_SOC': 'NAME',
                                      'SEXO': 'SEX',
                                      'EDAD': 'AGE',
                                      'ID_DIGNOSTICO': 'DIAGNOSIS_ID',
                                      'DIAGNOSTICO': 'DIAGNOSIS',
                                      'NU_TOTAL_ATENDIDOS': 'QTY_PEOPLE_SERVED'})

object_df['QTY_PEOPLE_SERVED'] = object_df['QTY_PEOPLE_SERVED'].apply(to_number)
# TODO(review): a DISTRICT normalisation step (to_correct_district) was disabled
# here; that helper is not defined in the visible part of the notebook.
# Left merge keeps rows whose code is absent from the ICD-10 catalogue;
# their DISEASE is filled with 'UNIDENTIFIED' below.
object_df = pd.merge(object_df, CO_IPRESS_df, how='left', on='DIAGNOSIS_ID')
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)
object_df['SEX'] = object_df['SEX'].apply(category_sex)
object_df["DISEASE"] = object_df["DISEASE"].fillna('UNIDENTIFIED')
object_df['DISEASE_GROUP'] = object_df['DIAGNOSIS_ID'].apply(diagnosis_grouped)
# DATE is built from a 'YEAR-MONTH' string and parsed by pandas.
object_df['DATE'] = object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
object_df['DATE'] = pd.to_datetime(object_df['DATE'])
# Every sector other than 'PRIVADO' is labelled PUBLIC.
object_df['SECTOR_R'] = object_df['SECTOR'].apply(lambda x: 'PRIVATE' if x == 'PRIVADO' else 'PUBLIC')
# Drop rows with no attended people (this also removes counts that failed to parse).
object_df = object_df[object_df['QTY_PEOPLE_SERVED'] >= 1].reset_index(drop=True)
print('Guardando archivo...')

object_df.to_pickle('data_preprocessed/morbilidad_c2.pkl')

100%|██████████| 53/53 [01:31<00:00,  1.73s/it]


Guardando archivo...


In [6]:
# Morbilidad D2
# Reads every file under morbilidad_d2/, normalises column names, enriches the
# rows with disease / category / sector labels and pickles the result.
files = glob.glob("web_resources/morbidity/morbilidad_d2/*")

# dtypes forced at read time; the count column is read as text and converted
# with to_number() afterwards so unparsable values become 0 instead of failing.
# The description column was spelled 'DIGNOSTICO' here but 'DIAGNOSTICO' in the
# rename below, so one of the two never matched. Both spellings are listed in
# both places; entries for columns that are absent are ignored by pandas.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': str,
         'RAZON_SOC': str,
         'SEXO': str,
         'EDAD': str,
         'ID_DIGNOSTICO': str,
         'DIGNOSTICO': str,
         'DIAGNOSTICO': str,
         'NU_TOTAL_ATENDIDOS': str,}

# Collect the per-file frames and concatenate once: concatenating inside the
# loop re-copies everything read so far on every iteration.
frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Only a decoding failure justifies retrying with the latin codec.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

object_df = pd.concat(frames) if frames else pd.DataFrame()

# Spanish source headers -> English names used by the rest of the notebook.
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'CATEGORIA': 'CATEGORY',
                                      'RAZON_SOC': 'NAME',
                                      'SEXO': 'SEX',
                                      'EDAD': 'AGE',
                                      'ID_DIGNOSTICO': 'DIAGNOSIS_ID',
                                      'DIGNOSTICO': 'DIAGNOSIS',
                                      'DIAGNOSTICO': 'DIAGNOSIS',
                                      'NU_TOTAL_ATENDIDOS': 'QTY_PEOPLE_SERVED'})

object_df['QTY_PEOPLE_SERVED'] = object_df['QTY_PEOPLE_SERVED'].apply(to_number)
# Left merge keeps rows whose code is absent from the ICD-10 catalogue;
# their DISEASE is filled with 'UNIDENTIFIED' below.
object_df = pd.merge(object_df, CO_IPRESS_df, how='left', on='DIAGNOSIS_ID')
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)
object_df['SEX'] = object_df['SEX'].apply(category_sex)
object_df["DISEASE"] = object_df["DISEASE"].fillna('UNIDENTIFIED')
object_df['DISEASE_GROUP'] = object_df['DIAGNOSIS_ID'].apply(diagnosis_grouped)
# DATE is built from a 'YEAR-MONTH' string and parsed by pandas.
object_df['DATE'] = object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
object_df['DATE'] = pd.to_datetime(object_df['DATE'])
# Every sector other than 'PRIVADO' is labelled PUBLIC.
object_df['SECTOR_R'] = object_df['SECTOR'].apply(lambda x: 'PRIVATE' if x == 'PRIVADO' else 'PUBLIC')
# Drop rows with no attended people (this also removes counts that failed to parse).
object_df = object_df[object_df['QTY_PEOPLE_SERVED'] >= 1].reset_index(drop=True)
print('Guardando archivo...')

object_df.to_pickle('data_preprocessed/morbilidad_d2.pkl')

100%|██████████| 8/8 [00:19<00:00,  2.45s/it]


Guardando archivo...


#### Coordinates

In [4]:
# Facility coordinates: the facility code is read as text, NORTE/ESTE are
# renamed to X/Y, and the table is written to CSV (index included).
# NOTE(review): in the preview below X holds values such as -12.74, which look
# like latitudes -- confirm the intended X/Y convention.
types = {'COD_IPRESS': str}

geo_temp = (
    pd.read_csv('web_resources/geografico/RENIPRESS_2022_v4.csv',
                encoding='latin', dtype=types)
    [['COD_IPRESS', 'NORTE', 'ESTE', 'INSTITUCION']]
    .drop(columns='INSTITUCION')
    .rename(columns={"COD_IPRESS": 'CO_IPRESS', "NORTE": 'X', "ESTE": 'Y'})
)
geo_temp.to_csv('data_preprocessed/coordinates_hospital.csv')
geo_temp.head()

Unnamed: 0,CO_IPRESS,X,Y
0,23013,,
1,3978,-12.742162,-74.4427
2,4114,-12.291517,-74.911738
3,5895,-11.788095,-76.619818
4,23192,,


#### For all morbidity types

In [5]:
# Combine the three pre-processed morbidity pickles into one table, attaching
# facility coordinates and a TYPE label that records which source each row
# came from.
paths = [
    ['morbilidad_b2.pkl', 'Outpatient Consultation'],
    ['morbilidad_c2.pkl', 'Emergency'],
    ['morbilidad_d2.pkl', 'Hospitalization']]

# Columns kept from every source; DataFrame.filter silently skips any that a
# given pickle does not contain.
columns = [
    'DATE',
    'YEAR',
    'MONTH',
    'STATE',
    'PROVINCE',
    'DISTRICT',
    'SECTOR',
    'SECTOR_R',
    'CATEGORY',
    'CATEGORY2',
    'CO_IPRESS',
    'NAME',
    'SEX',
    'AGE',
    'DIAGNOSIS_ID',
    'QTY_PEOPLE_SERVED',
    'DISEASE',
    'DISEASE_GROUP'
]

# Gather the per-source frames and concatenate once at the end instead of
# re-copying the accumulated table on every iteration.
merged_parts = []
for filename, type_label in tqdm(paths):
    data = pd.read_pickle('data_preprocessed/' + filename)
    data = data.filter(columns)
    # NOTE(review): if geo_temp holds several rows for one CO_IPRESS this left
    # merge duplicates morbidity rows -- confirm the codes are unique.
    data = data.merge(geo_temp, on='CO_IPRESS', how='left')
    data['TYPE'] = type_label
    data = data[data['QTY_PEOPLE_SERVED'] > 0].reset_index(drop=True)
    # Vectorised replacement for the former row-wise apply(axis=1); the rule is
    # unchanged: PROVINCE == 'LIMA' -> 'Lima-Callao', anything else -> 'Provincias'.
    # NOTE(review): rows whose PROVINCE is not exactly 'LIMA' (e.g. a Callao
    # province) are labelled 'Provincias' despite the 'Lima-Callao' label -- confirm.
    data['IS_LIMA'] = np.where(data['PROVINCE'] == 'LIMA', 'Lima-Callao', 'Provincias')
    data['AGE'] = data['AGE'].astype(float)
    merged_parts.append(data)

all_data = pd.concat(merged_parts)

all_data.to_pickle('data_preprocessed/morbilidad_global.pkl')
#all_data.to_csv('data_preprocessed/morbilidad_global.csv')

100%|██████████| 3/3 [22:57<00:00, 459.15s/it]


#### Data from 2018 onward

In [6]:
# Keep only the rows whose YEAR is 2018 or later and persist that subset.
# To start from disk instead of the in-memory frame, uncomment:
#all_data = pd.read_pickle('data_preprocessed/morbilidad_global.pkl')
from_2018 = all_data['YEAR'] >= 2018
all_data = all_data.loc[from_2018]
all_data.to_pickle('data_preprocessed/morbilidad_global_2018.pkl')

#### Data from Lima, 2018 onward

In [7]:
# Restrict to rows where both PROVINCE and STATE are 'LIMA', renumber the
# index, and persist that subset.
# To start from disk instead of the in-memory frame, uncomment:
#all_data = pd.read_pickle('data_preprocessed/morbilidad_global_2018.pkl')
in_lima = all_data['PROVINCE'].eq('LIMA') & all_data['STATE'].eq('LIMA')
all_data = all_data.loc[in_lima].reset_index(drop=True)
all_data.to_pickle('data_preprocessed/morbilidad_global_2018_lima.pkl')

In [8]:
# Preview the first three rows of `all_data` as it stands after the cells above.
all_data.head(3)

Unnamed: 0,DATE,YEAR,MONTH,STATE,PROVINCE,DISTRICT,SECTOR,SECTOR_R,CATEGORY,CATEGORY2,...,SEX,AGE,DIAGNOSIS_ID,QTY_PEOPLE_SERVED,DISEASE,DISEASE_GROUP,X,Y,TYPE,IS_LIMA
0,2018-04-01,2018,4,LIMA,LIMA,SAN JUAN DE MIRAFLORES,PRIVADO,PRIVATE,II-1,II,...,M,1.0,Z00.1,109,"ENCOUNTER FOR NEWBORN, INFANT AND CHILD HEALTH...",Factors influencing health status and contact ...,,,Outpatient Consultation,Lima-Callao
1,2018-04-01,2018,4,LIMA,LIMA,SAN JUAN DE MIRAFLORES,PRIVADO,PRIVATE,II-1,II,...,M,1.0,A09.0,13,UNIDENTIFIED,Some infectious and parasitic diseases,,,Outpatient Consultation,Lima-Callao
2,2018-04-01,2018,4,LIMA,LIMA,SAN JUAN DE MIRAFLORES,PRIVADO,PRIVATE,II-1,II,...,M,1.0,A09.X,22,UNIDENTIFIED,Some infectious and parasitic diseases,,,Outpatient Consultation,Lima-Callao
