In [1]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm, trange

In [2]:
# ICD-10 diagnosis catalogue: one row per code with an upper-cased description.
#
# An earlier load of 'web_resources/ICD10/CIE10-VOLUMEN-2018-EXCEL.xlsx' into
# this same variable was removed: its result was overwritten by the CSV load
# below before anything read it, so it only cost time and required a file that
# was never actually used.
CO_IPRESS_df = pd.read_csv('web_resources/ICD10/diagnosis.csv', encoding='utf_8')

# `.str.upper()` leaves missing descriptions as NaN instead of raising
# AttributeError the way `x.upper()` does on a float NaN.
CO_IPRESS_df['DISEASE'] = CO_IPRESS_df['ShortDescription'].str.upper()

# Keep only the code and its description, exposing the code as DIAGNOSIS_ID.
CO_IPRESS_df = CO_IPRESS_df[['CodeWithSeparator','DISEASE']]
CO_IPRESS_df = CO_IPRESS_df.rename(columns={'CodeWithSeparator': 'DIAGNOSIS_ID'})

In [3]:
# Lookup table for three-character ICD-10 codes.
# Result shape: {code: [values of every other column, in column order]}.
# 'DESCRIPCION' (upper-cased 'ShortDescription') is added before the table is
# keyed by code. diagnosis_grouped() and to_CO_IPRESS() read position 6 of
# these lists.
CO_IPRESS_dict = (
    pd.read_csv('web_resources/ICD10/diagnosis.csv', encoding='utf_8')
    .assign(
        DESCRIPCION=lambda frame: frame['ShortDescription'].apply(lambda text: text.upper())
    )
    # Keep only rows whose code is exactly three characters long.
    .loc[lambda frame: frame['CodeWithSeparator'].str.len() == 3]
    .set_index('CodeWithSeparator')
    .T
    .to_dict('list')
)

# Second lookup table, keyed by the 'letra' column of the wiki spreadsheet.
# Result shape: {letra: [values of every other column, in column order]}.
# diagnosis_grouped2() reads position 3 of these lists.
CO_IPRESS_wikidict = (
    pd.read_excel('web_resources/ICD10/wiki_codes_diseases.xlsx')
    .set_index('letra')
    .T
    .to_dict('list')
)


In [4]:
def diagnosis_grouped(val):
    """Look up the three-character group of a diagnosis code.

    The first three characters of ``str(val)`` are used as a key into the
    module-level ``CO_IPRESS_dict``; element 6 of the matching list is
    returned.

    Parameters
    ----------
    val : Any
        Diagnosis code. It is converted with ``str`` before slicing, so
        non-string input (e.g. NaN) simply fails the lookup.

    Returns
    -------
    object
        ``CO_IPRESS_dict[key][6]``, or ``'UNIDENTIFIED'`` when the key is not
        in the table or its list has fewer than seven entries.
    """
    key = str(val)[:3]
    try:
        # NOTE(review): position 6 is presumably the upper-cased description
        # column -- confirm against the column order of diagnosis.csv.
        return CO_IPRESS_dict[key][6]
    except (KeyError, IndexError):
        # Only "not found" falls back to the placeholder. A missing lookup
        # table (NameError) or Ctrl-C now surfaces instead of silently
        # labelling every row UNIDENTIFIED.
        return 'UNIDENTIFIED'

def diagnosis_grouped2(val):
    """Look up the two-character group of a diagnosis code.

    The first two characters of ``str(val)`` are used as a key into the
    module-level ``CO_IPRESS_wikidict``; element 3 of the matching list is
    returned.

    Parameters
    ----------
    val : Any
        Diagnosis code. It is converted with ``str`` before slicing.

    Returns
    -------
    object
        ``CO_IPRESS_wikidict[key][3]``, or ``'Unidentified'`` when the key is
        not in the table or its list has fewer than four entries.
    """
    key = str(val)[:2]
    try:
        # NOTE(review): the meaning of position 3 depends on the column order
        # of wiki_codes_diseases.xlsx -- confirm against that file.
        return CO_IPRESS_wikidict[key][3]
    except (KeyError, IndexError):
        # Only "not found" falls back to the placeholder; a missing lookup
        # table (NameError) or Ctrl-C is no longer silently swallowed.
        return 'Unidentified'

def to_CO_IPRESS(val):
    """Look up ``val`` directly (no slicing) in ``CO_IPRESS_dict``.

    Parameters
    ----------
    val : Any
        Key to look up in the module-level ``CO_IPRESS_dict``.

    Returns
    -------
    object
        ``CO_IPRESS_dict[val][6]``, or ``'UNIDENTIFIED'`` when the key is
        absent, unhashable, or its list has fewer than seven entries.
    """
    try:
        # NOTE(review): position 6 is presumably the upper-cased description
        # column -- confirm against the column order of diagnosis.csv.
        return CO_IPRESS_dict[val][6]
    except (KeyError, IndexError, TypeError):
        # TypeError covers unhashable keys. Anything else (missing lookup
        # table, Ctrl-C, ...) propagates instead of being silently mapped to
        # the placeholder.
        return 'UNIDENTIFIED'

def to_number(val):
    """Convert ``val`` to ``int``, falling back to 0 when it cannot be converted.

    Parameters
    ----------
    val : Any
        Value passed to ``int()``.

    Returns
    -------
    int
        ``int(val)``, or ``0`` when ``int()`` raises ``ValueError`` (e.g.
        ``'abc'``, ``'3.5'``, NaN), ``TypeError`` (e.g. ``None``) or
        ``OverflowError`` (infinity).
    """
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures map to 0; interrupts and unrelated errors
        # are no longer swallowed by a bare ``except``.
        return 0

def category_label(val):
    """Collapse a detailed category code into its top-level tier.

    Returns 'I', 'II' or 'III' for the codes listed below and '0' for
    anything else.

    NOTE(review): 'I-1' and 'I-2' are not listed and therefore map to '0' --
    confirm this is intended.
    """
    tiers = (
        ('I', ('I-3', 'I-4')),
        ('II', ('II-1', 'II-2', 'II-E')),
        ('III', ('III-1', 'III-2', 'III-E')),
    )
    for tier, codes in tiers:
        if val in codes:
            return tier
    # Unlisted or missing categories share a single placeholder label.
    return '0'

def category_sex(val):
    """Map a raw sex code to 'M', 'F', or '0' when the code is not recognised.

    Only the exact string codes listed below are recognised; any other value
    (including the integers 1 and 2) yields '0'.
    """
    male_codes = ('1', '01', 'NE_0001')
    female_codes = ('2', 'NE_0002', '02')

    if val in male_codes:
        return 'M'
    if val in female_codes:
        return 'F'
    return '0'


### Reading resources

In [5]:
# Procedures ("procedimientos"): load every export in the folder into one
# table, translate the column names to English, derive CATEGORY2 and DATE,
# and save the result as a pickle.
files = glob.glob("web_resources/procedures/procedimientos/*")
if not files:
    # Fail early with a clear message; otherwise the failure would only show
    # up later as an unrelated-looking error.
    raise FileNotFoundError(
        "No input files found under web_resources/procedures/procedimientos/"
    )

# dtypes enforced at read time for the known columns.
types = {'ANHO': int,
         'MES': int,
         'UBIGEO': int,
         'DEPARTAMENTO': str,
         'PROVINCIA': str,
         'DISTRITO': str,
         'SECTOR': str,
         'CATEGORIA': str,
         'CO_IPRESS': int,
         'RAZON_SOC': str,
         'ID_PROCEDIMIENTO': str,
         'PROCEDIMIENTO': str,
         'TOTAL': str,
         'ID_CODIGO': str,
         'DES_DESCRIPCION': str,}

frames = []
for p in tqdm(files):
    try:
        data_temp = pd.read_csv(p, encoding='utf_8', dtype=types)
    except UnicodeDecodeError:
        # Some exports are not valid UTF-8; retry those as Latin-1. Any other
        # failure (bad dtype, missing file, Ctrl-C) is no longer masked by a
        # second read attempt.
        data_temp = pd.read_csv(p, encoding='latin', dtype=types)
    frames.append(data_temp)

# Concatenate once: growing a DataFrame inside the loop re-copies every
# previously loaded row on each iteration.
object_df = pd.concat(frames)

# Translate column names to English. UBIGEO, SECTOR, CO_IPRESS and TOTAL keep
# their original names.
object_df = object_df.rename(columns={'ANHO': 'YEAR',
                                      'MES': 'MONTH',
                                      'DEPARTAMENTO': 'STATE',
                                      'PROVINCIA': 'PROVINCE',
                                      'DISTRITO': 'DISTRICT',
                                      'CATEGORIA': 'CATEGORY',
                                      'RAZON_SOC': 'NAME',
                                      'ID_PROCEDIMIENTO': 'PROCEDURE_ID',
                                      'PROCEDIMIENTO': 'PROCEDURE',
                                      'ID_CODIGO': 'DESCRIPTION_ID',
                                      'DES_DESCRIPCION': 'DESCRIPTION'})

# NOTE(review): the saved preview of this table shows 'Unnamed: N' columns
# carried over from the CSV files. They are left in place here so the pickle
# schema does not change; consider dropping them.

# Top-level tier ('I' / 'II' / 'III' / '0') derived from the detailed category.
object_df['CATEGORY2'] = object_df['CATEGORY'].apply(category_label)

# Build a "YEAR-MONTH" string and let pandas parse it into a timestamp.
object_df['DATE'] = pd.to_datetime(
    object_df['YEAR'].astype(str) + '-' + object_df['MONTH'].astype(str)
)
print('Guardando archivo...')

object_df.to_pickle('data_preprocessed/procedimiento_G.pkl')

  data_temp = pd.read_csv(p, encoding='latin', dtype=types)
  data_temp = pd.read_csv(p, encoding='latin', dtype=types)
  data_temp = pd.read_csv(p, encoding='latin', dtype=types)
100%|██████████| 12/12 [04:21<00:00, 21.80s/it]


Guardando archivo...


In [7]:
# Preview the first three rows of the combined procedures table.
object_df.head(3)

Unnamed: 0,YEAR,MONTH,UBIGEO,STATE,PROVINCE,DISTRICT,SECTOR,CATEGORY,CO_IPRESS,NAME,...,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,CATEGORY2,DATE
0,2015,1,150141,LIMA,LIMA,SURQUILLO,MINSA,III-2,6210,INSTITUTO NACIONAL DE ENFERMEDADES NEOPLASICAS,...,,,,,,,,,III,2015-01-01
1,2015,1,150141,LIMA,LIMA,SURQUILLO,MINSA,III-2,6210,INSTITUTO NACIONAL DE ENFERMEDADES NEOPLASICAS,...,,,,,,,,,III,2015-01-01
2,2015,1,150141,LIMA,LIMA,SURQUILLO,MINSA,III-2,6210,INSTITUTO NACIONAL DE ENFERMEDADES NEOPLASICAS,...,,,,,,,,,III,2015-01-01
