In [1]:
import pandas as pd
import os

from datalayer import read_csv_sia, read_sia_model, _merge_by_year_and_month
import utils
import feature_engineering
import external_data

In [2]:
# Establishment CSV file name -> short identifier for that establishment type.
# Passed to `_merge_by_year_and_month` below; presumably the identifier names the
# resulting feature column -- TODO confirm against datalayer.
ESTABELECIMENTO_FILES = {
    'Estabelecimentos- Clínicas-Ambulatórios Especializados.csv': 'CLINICAS_AMB_ESPECIALIZADO',
    'Estabelecimentos- Hospital Especializado.csv': 'HOSPITAL_ESPECIALIZADO',
    'Estabelecimentos- Hospital Geral.csv': 'HOSPITAL_GERAL',
    'Estabelecimentos- Unidade Básica de Saúde.csv': 'UN_BASICA_SAUDE',
    'Estabelecimentos- Unidade de Serviço de Apoio ao Diagnose e Terapia.csv': 'UN_DIAG_TERAPIA',
}

In [42]:
# Load the SIA-SUS breast radiotherapy records.
data = read_csv_sia('../data/Mama Radioterapia SIA-SUS.csv', 'radioterapia')

# Keep only first-authorization rows (AP_TPAPAC == 1) ...
is_first_authorization = data['AP_TPAPAC'] == 1
data = data[is_first_authorization]

# ... identified on or after 2014-01-01.
is_from_2014_onwards = data['AR_DTIDEN'] >= pd.to_datetime('2014-01-01')
data = data[is_from_2014_onwards].copy()

# Run the project's enrichment helpers in sequence. Each receives the frame
# returned by the previous step plus the key column(s) it works on; what each
# one adds is defined in the respective project module.
data = (
    data
    .pipe(feature_engineering.transform_cep_in_feature, ['AP_CEPPCN'])
    .pipe(external_data.get_municipio_info, ['AP_MUNPCN', 'AP_UFMUN'])
    .pipe(external_data.get_municipio_info_atlas, ['AP_MUNPCN'])
    .pipe(external_data.get_cep_info, ['AP_CEPPCN'])
    .pipe(external_data.get_cnes_loc, ['AP_CODUNI'])
    .pipe(utils.create_year_month_date, ['AR_DTIDEN'])
    .pipe(utils.create_year_date, ['AR_DTIDEN'])
    .pipe(external_data.get_orcamento_publico, ['AP_MUNPCN'], 'AR_DTIDEN_YEAR')
    .pipe(_merge_by_year_and_month, ESTABELECIMENTO_FILES, 'estabelecimento')
)

# Row-wise distance between the patient CEP coordinates and the facility
# (AP_CODUNI) coordinates.
# NOTE(review): this column is not part of the final selection below, so it is
# dropped again right away -- confirm whether the computation is still needed here.
data['DISTANCE_HOSPITAL'] = data.apply(
    lambda row: utils.calc_distance_lat_long(
        row['AP_CEPPCN_LATITUDE'],
        row['AP_CEPPCN_LONGITUDE'],
        row['AP_CODUNI_LATITUDE'],
        row['AP_CODUNI_LONGITUDE'],
    ),
    axis=1,
)

# Only the facility code, its coordinates and the municipality code are kept.
facility_columns = ['AP_CODUNI', 'AP_CODUNI_LATITUDE' , 'AP_CODUNI_LONGITUDE', 'AP_UFMUN']
data = data[facility_columns]

b'Skipping line 100354: expected 74 fields, saw 87\n'


In [43]:
# Facility codes (AP_CODUNI) that appear in `data` but not in the review file.
review = pd.read_csv('data/reviews.csv', sep=';')
new = set(data['AP_CODUNI'].unique()).difference(review['AP_CODUNI'].unique())

# Keep only the rows belonging to those not-yet-reviewed facilities.
data_model = data.loc[data['AP_CODUNI'].isin(new)]

In [45]:
# Municipality lookup table, with every column name upper-cased.
municipio = pd.read_csv('../data/municipios.csv', sep=',')
municipio.columns = municipio.columns.str.upper()

# Strip the last character of the IBGE code (the check digit) and convert the
# remainder back to a number.
ibge_without_check_digit = municipio['CODIGO_IBGE'].astype(str).str[:-1]
municipio['CODIGO_IBGE'] = pd.to_numeric(ibge_without_check_digit)

# Expose the shortened code under the name AP_UFMUN so it can serve as a merge
# key, and keep only the code and the municipality name.
municipio = municipio.rename(columns={'CODIGO_IBGE': 'AP_UFMUN'})[['AP_UFMUN', 'NOME']].copy()

In [46]:
# Attach the municipality name, reduce to one row per facility (AP_CODUNI) and
# order by facility code.
# NOTE(review): `data_model` is rebuilt from itself, so running this cell twice
# merges the municipality columns a second time.
data_model = (
    data_model
    .merge(municipio, how='left', on='AP_UFMUN')
    .drop_duplicates(subset='AP_CODUNI')
    .sort_values(by='AP_CODUNI')
)

In [48]:
# CNES lookup file; its `cnes` column is renamed to AP_CODUNI to act as the merge key.
cnes_df = (
    pd.read_csv('data/espelho cnes nome fantasia.csv', sep=';', encoding='latin1')
    .rename(columns={'cnes': 'AP_CODUNI'})
)

# Left merge keeps every facility in `data_model`, matched or not.
data_model = data_model.merge(cnes_df, how='left', on='AP_CODUNI')

In [50]:
# Preview up to the first 10 rows of the facility table.
data_model.head(n=10)

Unnamed: 0,AP_CODUNI,AP_CODUNI_LATITUDE,AP_CODUNI_LONGITUDE,AP_UFMUN,NOME,nome fantasia
0,2273462,-22.91685,-43.26205,330455,Rio de Janeiro,MS INCA HOSPITAL DO CANCER III
1,2287447,-21.75851,-41.33476,330100,Campos dos Goytacazes,HOSPITAL ESCOLA ALVARO ALVIM
2,2697696,-2.53876,-44.27914,211130,São Luís,INSTITUTO MARANHENSE DE ONCOLOGIA ALDENORA BEL...
3,6497489,-5.52441,-47.47751,210530,Imperatriz,ONCORADIUM
4,7068336,-8.762,-63.904,110020,Porto Velho,HOSPITAL DE AMOR AMAZONIA
