# Initial Data Understanding

## Loading Data and Libraries

In [35]:
import pandas as pd
import numpy as np
import sys

import seaborn as sns
import matplotlib.pyplot as plt

sys.path.append('../src/')
pd.set_option('display.max_columns', None)

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Análise preliminar dos dados de Quimioterapia e Radioterapia para Câncer de Mama
Neste documento serão tratadas as primeiras identificações de padrão e possíveis incongruências - de acordo com o julgamento do grupo - em relação aos dados fornecidos para Quimioterapia e Radioterapia de pacientes atendidos para câncer de mama.

In [38]:
import preprocess

kwargs = {'error_bad_lines':False, 'sep':',', 'low_memory':False, 'index_col':0, 'skiprows':0}

df_mama_quim = preprocess.read_to_df('Linfomas Quimioterapia SIA-SUS.csv', kwargs)
df_mama_rad = preprocess.read_to_df('Linfomas Radioterapia SIA-SUS.csv', kwargs)

In [39]:
import constants

df_mama_quim.rename(columns=constants.LAYOUT_APAC, inplace=True)
df_mama_rad.rename(columns=constants.LAYOUT_APAC, inplace=True)

### Amostras de dados
Um primeiro panorama de formatos e disposição de ambos datasets.

In [42]:
df_mama_quim.head(5)

Unnamed: 0,dt_process,tp_gestao,cod_tp_gestao,cod_est_cnes,num_apac,dt_atend_paciente,proc_princ_apac,vl_tot_apac_aprov,cod_uf_mun,tp_est,tip_prest,est_man_ind,cnpj_est_exe,cnpj_mante,num_csn,cod_idade,num_idade,sexo,raca_cor,cod_uf_mun_paciente,nac_paciente,cep_paciente,uf_res_dif_paciente,mun_res_dif_paciente,dt_ini_val,dt_fim_val,tp_atend_apac,ind_apac,mot_sai_perm,ind_obito,ind_encerr,ind_perm,ind_alta,ind_transf,dt_ocorr,cod_org_emi,car_atend,num_apac_ant,cod_soli_cnes,dt_soli,dt_auto,cid_caus_assoc,cid_princ,cid_sec,etnia,aq_cid_10,aq_linfo_reg_inva,aq_estadi,aq_grahis,aq_dt_inden_pato,aq_trat_anter,aq_cidini1,aq_dt_ini_1_trat,aq_cidini2,aq_dt_ini_2_trat,aq_cidini3,aq_dt_ini_3_trat,aq_cont_trat,aq_dt_inic_trat_soli,aq_esqu_p1,aq_tot_mes_plan,aq_tot_mes_aut,aq_esu_p2
4,201401,EP,120000,2001586,1213200185967,201401,304060011,1258.64,120040,5,40,I,63602940000000.0,,{}{{}{{,4,20,M,3,120040,10,69900970,0,1,20131105,20140131,3,2,21,0,0,1,0,0,,E120000001,1,0,2001586,20131105.0,20131105,0,C81- Linfoma de Hodgkin,0,,,3,3.0,99,20130519,0,,,,,,,S,20130819,ABVD,6,3,
25,201401,EP,120000,2001586,1213200175561,201401,304060011,1258.64,120040,5,40,I,63602940000000.0,,{{|{||,4,46,M,3,120040,10,69900970,0,1,20131105,20140131,3,2,21,0,0,1,0,0,,E120000001,1,0,0,20131105.0,20131105,0,C81- Linfoma de Hodgkin,0,,,3,3.0,99,20050602,0,,,,,,,S,20130801,ABVD,6,3,
27,201401,EP,120000,2001586,1213200175528,201401,304060011,1258.64,120040,5,40,I,63602940000000.0,,{{{{|}{|},4,20,M,3,120040,10,69900970,0,1,20131105,20140131,3,2,21,0,0,1,0,0,,E120000001,1,0,0,20131105.0,20131105,0,C81- Linfoma de Hodgkin,0,,,3,3.0,99,20130519,0,,,,,,,S,20130819,ABVD,6,3,
28,201401,EP,120000,2001586,1213200177266,201401,304060011,1258.64,120040,5,40,I,63602940000000.0,,{{{{{}~},4,61,M,3,120001,10,69945000,0,1,20131105,20140131,3,2,21,0,0,1,0,0,,E120000001,1,0,0,20131105.0,20131105,0,C81- Linfoma de Hodgkin,0,,,3,3.0,99,20130919,0,,,,,,,N,20130809,VIMBL,6,3,AS+DAVARBA
32,201401,EP,120000,2001586,1213200186275,201401,304070017,1700.0,120040,5,40,I,63602940000000.0,,{{{{{,4,18,F,3,120040,10,69900970,0,1,20131128,20140131,3,2,21,0,0,1,0,0,,E120000001,1,0,2001586,20131128.0,20131128,0,C81- Linfoma de Hodgkin,0,,,3,2.0,99,20131031,0,,,,,,,N,20131128,ABVD,6,0,
34,201401,EP,120000,2001586,1213200179576,201401,304060038,1258.64,120040,5,40,I,63602940000000.0,,||}||{}~{{{{,4,25,F,3,120040,10,69900970,0,1,20131205,20140228,3,2,21,0,0,1,0,0,,E120000001,1,0,0,20131205.0,20131205,0,C81- Linfoma de Hodgkin,0,,,3,2.0,99,20120724,0,,,,,,,N,20130904,ICE,6,3,
47,201402,EP,120000,2001586,1214200031759,201402,304070017,1700.0,120040,5,40,I,63602940000000.0,,{{{{{,4,18,F,3,120040,10,69900970,0,1,20140204,20140430,3,1,21,0,0,1,0,0,,E120000001,1,0,2001586,20140204.0,20140204,0,C81- Linfoma de Hodgkin,0,,,3,2.0,99,20131031,0,,,,,,,N,20131128,ABVD,6,3,
52,201402,EP,120000,2001586,1214200031693,201402,304060011,1258.64,120040,5,40,I,63602940000000.0,,||{|~~{{{|,4,24,F,3,120040,10,69901024,0,1,20140207,20140430,3,1,21,0,0,1,0,0,,E120000001,1,0,2001586,20140207.0,20140207,0,C81- Linfoma de Hodgkin,0,,,3,3.0,99,20140123,0,,,,,,,N,20140207,ABVD,6,0,
76,201402,EP,120000,2001586,1213200179576,201402,304060038,1258.64,120040,5,40,I,63602940000000.0,,||}||{}~{{{{,4,25,F,3,120040,10,69900970,0,1,20131205,20140228,3,2,21,0,0,1,0,0,,E120000001,1,0,0,20131205.0,20131205,0,C81- Linfoma de Hodgkin,0,,,3,2.0,99,20120724,0,,,,,,,N,20130904,ICE,6,3,
77,201402,EP,120000,2001586,1214200031000,201402,304070025,1381.76,120040,5,40,I,63602940000000.0,,{{}}|~},4,19,M,1,130070,10,69850000,1,1,20140218,20140430,3,1,21,0,0,1,0,0,,E120000001,1,0,2001586,20140218.0,20140218,0,C81- Linfoma de Hodgkin,0,,,3,3.0,99,20110718,0,,,,,,,N,20140218,ICE,6,0,


In [43]:
df_mama_rad.head(10)

Unnamed: 0,dt_process,tp_gestao,cod_tp_gestao,cod_est_cnes,num_apac,dt_atend_paciente,proc_princ_apac,vl_tot_apac_aprov,cod_uf_mun,tp_est,tip_prest,est_man_ind,cnpj_est_exe,cnpj_mante,num_csn,cod_idade,num_idade,sexo,raca_cor,cod_uf_mun_paciente,nac_paciente,cep_paciente,uf_res_dif_paciente,mun_res_dif_paciente,dt_ini_val,dt_fim_val,tp_atend_apac,ind_apac,mot_sai_perm,ind_obito,ind_encerr,ind_perm,ind_alta,ind_transf,dt_ocorr,cod_org_emi,car_atend,num_apac_ant,cod_soli_cnes,dt_soli,dt_auto,cid_caus_assoc,cid_princ,cid_sec,etnia,ar_smrd,ar_cid_10,ar_linfo_reg_inva,ar_estadi,ar_grahis,ar_dt_inden_pato,ar_trat_anter,ar_cidini1,ar_dt_ini_1_trat,ar_cidini2,ar_dt_ini_2_trat,ar_cidini3,ar_dt_ini_3_trat,ar_cont_trat,ar_dt_inic_trat_soli,ar_finalid_trat,ar_cid_topo_1,ar_cid_topo_2,ar_cid_topo_3,ar_num_ins_1,ar_dt_ini_1,ar_dt_ini_2,ar_dt_ini_3,ar_dt_fim_1,ar_dt_fim_2,ar_dt_fim_3,ar_num_ins_2,ar_num_ins_3
8,201401,EP,120000,2001586,1214200029119,201401,304010090,614,120040,5,40,I,63602940000170,,{{~,4,8,M,3,120050,10,69940000,0,1,20140123,20140228,4,1,21,0,0,1,0,0,,E120000001,1,0.0,2001586.0,20140123.0,20140123.0,0,C81- Linfoma de Hodgkin,0,,,,S,2.0,99,20130619,,,,,,,,N,20130127,2,C811,,,34.0,20130127.0,,,20140228.0,,,,
14,201402,EP,120000,2001586,1214200045201,201402,304010090,2098,120040,5,40,I,63602940000170,,{{}~~{},4,75,M,3,120040,10,69900970,0,1,20140212,20140430,4,1,21,0,0,1,0,0,,E120000001,1,0.0,2001586.0,20140212.0,20140212.0,0,C81- Linfoma de Hodgkin,0,,,,N,2.0,99,20131016,,,,,,,,N,20140212,1,C61,,,144.0,20140212.0,,,20140430.0,,,,
21,201402,EP,120000,2001586,1214200029119,201402,304010090,630,120040,5,40,I,63602940000170,,{{~,4,8,M,3,120050,10,69940000,0,1,20140123,20140228,4,2,15,0,0,0,1,0,20140228.0,E120000001,1,0.0,2001586.0,20140123.0,20140123.0,0,C81- Linfoma de Hodgkin,0,,,,S,2.0,99,20130619,,,,,,,,N,20130127,2,C811,,,34.0,20130127.0,,,20140228.0,,,,
29,201403,EP,120000,2001586,1214200045201,201403,304010090,2550,120040,5,40,I,63602940000170,,{{}~~{},4,75,M,3,120040,10,69900970,0,1,20140212,20140430,4,2,15,0,0,0,1,0,20140331.0,E120000001,1,0.0,2001586.0,20140212.0,20140212.0,0,C81- Linfoma de Hodgkin,0,,,,N,2.0,99,20131016,,,,,,,,N,20140212,1,C61,,,144.0,20140212.0,,,20140331.0,,,,
76,201405,EP,120000,2001586,1214200084944,201405,304010090,810,120040,5,40,I,63602940000170,,}{{{{{|}{{{~,4,13,M,3,120020,10,69980000,0,1,20140318,20140531,4,2,15,0,0,0,1,0,20140531.0,E120000001,1,0.0,2001586.0,20140318.0,20140318.0,0,C81- Linfoma de Hodgkin,0,,,,S,3.0,99,20120505,,,,,,,,N,20140318,2,C810,,,60.0,20140318.0,,,20140531.0,,,,
77,201405,EP,120000,2001586,1214200085054,201405,304010090,1018,120040,5,40,I,63602940000170,,{{{~{,4,33,M,3,120040,10,69900970,0,1,20140514,20140731,4,1,21,0,0,1,0,0,,E120000001,1,0.0,2001586.0,20140514.0,20140514.0,0,C81- Linfoma de Hodgkin,0,,,,3,2.0,99,20131218,,,,,,,,N,20140514,4,C819,,,20.0,20140514.0,,,20140731.0,,,,
87,201405,EP,120000,2001586,1214200085043,201405,304010090,810,120040,5,40,I,63602940000170,,{{~~|{,4,72,M,3,120040,10,69900970,0,1,20140514,20140731,4,1,21,0,0,1,0,0,,E120000001,1,0.0,2001586.0,20140514.0,20140514.0,0,C81- Linfoma de Hodgkin,0,,,,3,3.0,99,20130706,,,,,,,,N,20140514,1,,,,,,,,,,,,
89,201405,EP,120000,2001586,1214200084944,201403,304010090,695,120040,5,40,I,63602940000170,,}{{{{{|}{{{~,4,13,M,3,120020,10,69980000,0,1,20140318,20140531,4,1,21,0,0,1,0,0,,E120000001,1,0.0,2001586.0,20140318.0,20140318.0,0,C81- Linfoma de Hodgkin,0,,,,S,3.0,99,20120505,,,,,,,,N,20140318,2,C810,,,60.0,20140318.0,,,20140531.0,,,,
94,201405,EP,120000,2001586,1214200084944,201404,304010090,510,120040,5,40,I,63602940000170,,}{{{{{|}{{{~,4,13,M,3,120020,10,69980000,0,1,20140318,20140531,4,2,21,0,0,1,0,0,,E120000001,1,0.0,2001586.0,20140318.0,20140318.0,0,C81- Linfoma de Hodgkin,0,,,,S,3.0,99,20120505,,,,,,,,N,20140318,2,C810,,,60.0,20140318.0,,,20140531.0,,,,
96,201406,EP,120000,2001586,1214200085043,201406,304010090,510,120040,5,40,I,63602940000170,,{{~~|{,4,72,M,3,120040,10,69900970,0,1,20140514,20140731,4,2,15,0,0,0,1,0,20140616.0,E120000001,1,0.0,2001586.0,20140514.0,20140514.0,0,C81- Linfoma de Hodgkin,0,,,,3,3.0,99,20130706,,,,,,,,N,20140514,1,,,,,,,,,,,,


### Geração de reports
Mesmo com análises exaustivas de frações dos dados, um overview total - de correlações, missing values et al - é imprescindível. Utilizando `pandas_profiling`, geraremos então relatórios que servirão de insumo do cenário macro desses dados.

In [40]:
from pandas_profiling import ProfileReport

ProfileReport(df_mama_quim).to_file('mama_quimioterapia_report.html')
ProfileReport(df_mama_rad).to_file('mama_radioterapia_report.html')

### Missing values

Devemos nos atentar ao valor informativo das colunas - porcentagens muito altas de missing values são indicativos de inaplicabilidade ou simplesmente não preenchimento da informação.

Primeiramente, nos dados de quimioterapia:

In [64]:
1 - df_mama_quim.count().sort_values().head(20) / len(df_mama_quim)

etnia                0.999716
aq_cidini3           0.973344
aq_cidini2           0.937684
aq_dt_ini_3_trat     0.927430
dt_ocorr             0.901655
aq_dt_ini_2_trat     0.887407
aq_cidini1           0.835039
aq_dt_ini_1_trat     0.771918
aq_cid_10            0.670408
aq_esu_p2            0.472046
cnpj_mante           0.271705
num_apac_ant         0.095159
dt_auto              0.075321
aq_estadi            0.041121
dt_soli              0.039926
cod_soli_cnes        0.003130
aq_grahis            0.000527
cnpj_est_exe         0.000035
aq_linfo_reg_inva    0.000000
aq_dt_inden_pato     0.000000
dtype: float64

A porcentagem de valores faltantes em `etnia` é certamente alarmante, uma coluna praticamente 100% nula. Vejamos seus poucos não-nulos

In [68]:
len(df_mama_quim[~df_mama_quim.etnia.isnull()]), len(df_mama_quim)

(90, 316965)

Seguidamente, nos dados de radioterapia:

In [69]:
1 - df_mama_rad.count().sort_values().head(20) / len(df_mama_rad)

ar_smrd             1.000000
etnia               0.999046
ar_cidini3          0.976004
ar_cid_topo_3       0.947691
ar_cidini2          0.943122
ar_dt_ini_3_trat    0.940562
ar_dt_fim_3         0.937952
ar_dt_ini_3         0.937952
ar_dt_ini_2_trat    0.896436
ar_cid_topo_2       0.839408
ar_dt_fim_2         0.830572
ar_dt_ini_2         0.830572
ar_cidini1          0.717871
ar_dt_ini_1_trat    0.659036
ar_num_ins_3        0.562801
dt_ocorr            0.561697
ar_num_ins_2        0.506928
ar_cid_10           0.464056
cnpj_mante          0.359689
ar_trat_anter       0.284137
dtype: float64

O cenário de 99.9%+ nulos observados em `etnia` se repetem também aqui, porém a `ar_smrd` - que inclusive não possui descrição no dicionário correlato - se apresenta com absolutos 100%. Iremos portanto descartar esta coluna.

In [70]:
df_mama_rad = df_mama_rad.drop(columns=['ar_smrd'])

### Valores constantes

De modo similar a missing values, colunas com valores constantes também não apresentam valor significativo. A partir do relatório, notamos essa característica no **Tipo de Atendimento APAC** para ambos sets.

In [114]:
df_mama_quim.tp_atend_apac.unique()

array([3], dtype=int64)

In [115]:
df_mama_rad.tp_atend_apac.unique()

array([4], dtype=int64)

Portanto, serão desconsideradas como colunas.

In [121]:
df_mama_quim = df_mama_quim.drop(columns=['tp_atend_apac'])
df_mama_rad = df_mama_rad.drop(columns=['tp_atend_apac'])

### Agrupamento de datas
Diversas colunas em ambos sets são marcações de tempo, que podem ajudar a inferir a duração dum tratamento.

Como primeiro passo para tratá-las, devemos convertê-las ao formato `datetime`.

In [127]:
dtcols = [c for c in df_mama_quim.columns if c.startswith('dt_')]
df_mama_quim[dtcols].head(10)

Unnamed: 0,dt_process,dt_atend_paciente,dt_ini_val,dt_fim_val,dt_ocorr,dt_soli,dt_auto
4,201401,201401,20131105,20140131,,20131105.0,20131105
25,201401,201401,20131105,20140131,,20131105.0,20131105
27,201401,201401,20131105,20140131,,20131105.0,20131105
28,201401,201401,20131105,20140131,,20131105.0,20131105
32,201401,201401,20131128,20140131,,20131128.0,20131128
34,201401,201401,20131205,20140228,,20131205.0,20131205
47,201402,201402,20140204,20140430,,20140204.0,20140204
52,201402,201402,20140207,20140430,,20140207.0,20140207
76,201402,201402,20131205,20140228,,20131205.0,20131205
77,201402,201402,20140218,20140430,,20140218.0,20140218


In [176]:
df_mama_quim[dtcols].dtypes

dt_process           period[M]
dt_atend_paciente    period[M]
dt_ini_val               int64
dt_fim_val               int64
dt_ocorr               float64
dt_soli                float64
dt_auto                 object
dtype: object

As duas primeiras colunas - mês de processamento e atendimento ao paciente - formatam-se em ano-mês, portanto terão conversão diferenciada

In [150]:
ymth_dt = [dtcols[0:2]]
to_year_month = lambda df: df.apply(pd.to_datetime, format='%Y%M').apply(lambda date: date.to_period('M'))

In [144]:
df_mama_quim[ymth_dt] = df_mama_quim[ymth_dt].apply(to_year_month)
df_mama_rad[ymth_dt] = df_mama_rad[ymth_dt].apply(to_year_month)

df_mama_quim[ymth_dt].head(5)

Unnamed: 0,dt_process,dt_atend_paciente
4,2014-01,2014-01
25,2014-01,2014-01
27,2014-01,2014-01
28,2014-01,2014-01
32,2014-01,2014-01


Nota-se que `dt_soli` (data de solicitação) e `dt_ocorr` (data de ocorrência) são colunas `float`, portanto contando com casas decimais que não fazem sentido para o formato `%Y%M%d`.

In [206]:
ymd_dt = [dtcols[2:-1]]
# Retirada de casas decimais e conversão para datetime
to_ymd = lambda df: df.fillna(0).astype(int).astype(str).replace('0', np.nan).apply(pd.to_datetime, format='%Y%M%d')

In [207]:
df_mama_rad[['dt_ocorr']].apply(to_ymd)

Unnamed: 0,dt_ocorr
8,NaT
14,NaT
21,2014-01-28 00:02:00
29,2014-01-31 00:03:00
76,2014-01-31 00:05:00
77,NaT
87,NaT
89,NaT
94,NaT
96,2014-01-16 00:06:00
