### Imports

In [4]:
import numpy as np
import pandas as pd

### Read Data Sets

In [5]:
types = {'SP_NAIH': str, 'SP_ATOPROF': str, 'SP_CIDPRI': str, 'SP_QTD_ATO': np.int32}

#### CIDS

In [6]:
o_cids = pd.read_csv('../csv/o_cids.csv', sep=';')
o_cids.head()

Unnamed: 0,influence_type,code,description
0,Morte Materna Direta,O000,Gravidez abdominal
1,Morte Materna Direta,O001,Gravidez tubária
2,Morte Materna Direta,O002,Gravidez ovariana
3,Morte Materna Direta,O008,Outras formas de gravidez ectópica
4,Morte Materna Direta,O009,"Gravidez ectópica, não especificada"


#### Procedures

In [7]:
procedures = pd.read_csv('../data/procedimentos_obstetricia.csv', sep=';')
procedures.head()

Unnamed: 0,CODE,DESCRIPTION
0,205020143,Ultrassonografia obstétrica
1,205020186,Ultrassonografia transvaginal
2,205010059,Ultrassonografia doppler de fluxo obstétrico
3,205020151,Ultrassonografia obstétrica com doppler colori...
4,201010011,Amniocentese


#### March

In [8]:
df_sp_mar = pd.read_csv('../../sp_csv/SPSP1803.csv', sep=',', usecols=types.keys(), dtype=types)
df_sp_mar["SP_ATOPROF"] = df_sp_mar["SP_ATOPROF"].str[1:]
df_sp_mar.head()

Unnamed: 0,SP_NAIH,SP_ATOPROF,SP_QTD_ATO,SP_CIDPRI
0,3518100401101,303010061,1,A09
1,3518100401101,301010170,8,A09
2,3518100401101,301010170,1,A09
3,3518100401101,301010170,1,A09
4,3518100401101,802010040,1,A09


In [9]:
df_sp_mar.shape

(3081817, 4)

##### Filter CID 

In [10]:
df_sp_mar_cids = df_sp_mar.merge(o_cids, left_on='SP_CIDPRI', right_on='code', how='left').drop('code', axis=1)

df_sp_mar_cids = df_sp_mar_cids[df_sp_mar_cids['influence_type'].notna()]

In [11]:
df_sp_mar_cids.shape

(568700, 6)

##### Filter Procedures

In [12]:
df_sp_mar_proc = df_sp_mar.merge(procedures, left_on='SP_ATOPROF', right_on='CODE', how='left').drop('CODE', axis=1)
df_sp_mar_proc = df_sp_mar_proc[df_sp_mar_proc['DESCRIPTION'].notna()]

In [13]:
df_sp_mar_proc.shape

(169809, 5)

#### April

In [14]:
df_sp_apr = pd.read_csv('../../sp_csv/SPSP1804.csv', sep=',', usecols=types.keys(), dtype=types)
df_sp_apr["SP_ATOPROF"] = df_sp_apr["SP_ATOPROF"].str[1:]
df_sp_apr.shape

(3088703, 4)

##### Filter CIDS

In [15]:
df_sp_apr_cids = df_sp_apr.merge(o_cids, left_on='SP_CIDPRI', right_on='code', how='left').drop('code', axis=1)
df_sp_apr_cids = df_sp_apr_cids[df_sp_apr_cids['influence_type'].notna()]

In [16]:
df_sp_apr_cids.shape

(565244, 6)

In [17]:
df_sp_apr_cids.shape

(565244, 6)

##### Filter Procedures

In [18]:
df_sp_apr_proc = df_sp_apr.merge(procedures, left_on='SP_ATOPROF', right_on='CODE', how='left').drop('CODE', axis=1)
df_sp_apr_proc = df_sp_apr_proc[df_sp_apr_proc['DESCRIPTION'].notna()]

In [19]:
df_sp_apr_proc.shape

(167604, 5)

#### RD

In [20]:
aih_types = {'N_AIH': str, 'SEXO': np.int32, 'IDADE': np.int32,'INSTRU':np.int32,}

In [22]:
df_rd = pd.read_csv('/home/gxfs/Desktop/_ufrj/ic_saude/analise-aih-rd-sp/sih_sus_m_10_49.csv', sep='|', usecols=aih_types.keys(), dtype=aih_types)
df_rd.head()

Unnamed: 0,N_AIH,SEXO,IDADE,INSTRU
0,3518101448609,3,10,0
1,3518101279825,3,47,0
2,3518101457475,3,25,0
3,3518101457662,3,49,0
4,3518101525246,3,14,0


### Concatenate March and April

In [23]:
# como as tabelas resultantes do filtro de procedimentos não retornaram valores, vamos concatenar apenas as de cid
df_concat_sp_mar_apr = pd.concat([df_sp_mar_cids, df_sp_mar_proc, df_sp_apr_cids, df_sp_apr_proc])
df_concat_sp_mar_apr.shape

(1471357, 7)

In [24]:
df_concat_sp_mar_apr.drop_duplicates(inplace=True)

In [25]:
df_sp_mar_cids.shape[0] + df_sp_apr_cids.shape[0]

1133944

In [26]:
del df_sp_mar
del df_sp_apr

### Left Merge between SP and RD datasets

In [27]:
# merged_dfs = sp_df.merge(o_cids, left_on='SP_CIDPRI', right_on='code', how='left').drop('code', axis=1)
merged_sp_rd = df_concat_sp_mar_apr.merge(df_rd, left_on='SP_NAIH', right_on='N_AIH', how='left').drop('N_AIH', axis=1)

In [28]:
merged_sp_rd.shape

(1204467, 10)

In [29]:
merged_sp_rd = merged_sp_rd[merged_sp_rd['SEXO'].notna()]
merged_sp_rd.shape

(1190649, 10)

In [30]:
df_rd.head()

Unnamed: 0,N_AIH,SEXO,IDADE,INSTRU
0,3518101448609,3,10,0
1,3518101279825,3,47,0
2,3518101457475,3,25,0
3,3518101457662,3,49,0
4,3518101525246,3,14,0


### Group by AIH Number

In [31]:
group_by_aih = merged_sp_rd.groupby('SP_NAIH').count()['SP_ATOPROF']
group_by_aih

SP_NAIH
3517103370550     7
3517105344490    17
3517105347096    16
3517105347349    24
3517105348724    26
                 ..
3518500587437     1
3518500617148     1
3518500639269     1
3518500649830     1
3518502074208     2
Name: SP_ATOPROF, Length: 85131, dtype: int64

In [32]:
group_by_aih.isna().sum()

0

In [33]:
group_by_aih.describe()

count    85131.000000
mean        13.986080
std          6.843525
min          1.000000
25%         10.000000
50%         14.000000
75%         17.000000
max        400.000000
Name: SP_ATOPROF, dtype: float64

In [34]:
group_by_aih[group_by_aih.where(group_by_aih > 90).notna()].sort_values(ascending=False).head(10)

SP_NAIH
3517107562881    400
3517128148556    177
3517126938336    138
3517128303238    117
3518101249234    115
3517128303216    111
3518102279373     93
3518104118518     93
3517128302765     92
3518115764383     91
Name: SP_ATOPROF, dtype: int64

### Group by ATO PROF

In [35]:
group_by_proc = merged_sp_rd.groupby('SP_ATOPROF').count()['SP_NAIH']
group_by_proc

SP_ATOPROF
101020066        6
101020074        1
201010011        9
201010020        1
201010216        1
             ...  
802010032    33988
802010083      393
802010091       98
802010105       28
802010199     3793
Name: SP_NAIH, Length: 633, dtype: int64

In [36]:
group_by_proc.describe()

count      633.000000
mean      1880.962085
std       8411.129658
min          1.000000
25%          3.000000
50%         13.000000
75%        209.000000
max      85083.000000
Name: SP_NAIH, dtype: float64

In [42]:
group_by_proc[group_by_proc.where(group_by_proc > 10000).notna()].sort_values(ascending=False).head(20)

SP_ATOPROF
214010040    85083
211040061    83083
310010039    66469
301010170    62649
310010020    62406
301010145    60989
202031179    58202
202120023    48572
417010010    41673
411010034    40071
802010032    33988
202020380    29324
202020541    26692
301010048    26554
202120082    20846
801010039    20772
211070149    20134
303100044    16620
801010047    16510
417010028    14229
Name: SP_NAIH, dtype: int64

In [80]:
top_20_gb_proc = group_by_proc[group_by_proc.where(group_by_proc > 10000).notna()].sort_values(ascending=False)
top_20_gb_proc_df = pd.DataFrame(data = top_20_gb_proc.values, columns = ['QTD'], index = top_20_gb_proc.index)
top_20_gb_proc_df
mergex = top_20_gb_proc_df.merge(procedures, left_on='SP_ATOPROF', right_on='CODE', how='left')
mergex

Unnamed: 0,QTD,CODE,DESCRIPTION
0,85083,214010040,Teste rápido para detecção de HIV em gestante
1,83083,211040061,Tococardiografia anteparto
2,66469,310010039,Parto normal
3,41673,417010010,Anestesia Obstétrica p/ Cesariana
4,40071,411010034,Parto cesariano
5,16620,303100044,Tratamento de intercorrências clínicas na grav...
6,14229,417010028,Analgesia Obstétrica p/ Parto Normal
7,13912,417010036,Anestesia Obstétrica p/ Cesariana em Gestaça...
8,13851,411010026,Parto cesariano em gestação de alto risco
9,13844,211040010,Amnioscopia


In [78]:
mergex[mergex['QTD'].notna()]

Unnamed: 0,QTD,CODE,DESCRIPTION
0,85083.0,214010040.0,Teste rápido para detecção de HIV em gestante
1,83083.0,211040061.0,Tococardiografia anteparto
2,66469.0,310010039.0,Parto normal
3,62649.0,,
4,62406.0,,
5,60989.0,,
6,58202.0,,
7,48572.0,,
8,41673.0,417010010.0,Anestesia Obstétrica p/ Cesariana
9,40071.0,411010034.0,Parto cesariano


In [48]:
gb_proc = group_by_proc.sort_values(ascending=False)
gb_proc_df = pd.DataFrame(data = gb_proc.values, columns = ['QTD'], index = gb_proc.index)
gb_proc_df
gb_proc_desc_df = gb_proc_df.merge(procedures, left_on='SP_ATOPROF', right_on='CODE', how='left')
gb_proc_desc_df[gb_proc_desc_df['DESCRIPTION'].notna()].to_csv()

',QTD,CODE,DESCRIPTION\n0,85083,214010040,Teste rápido para detecção de HIV em gestante\n1,83083,211040061,Tococardiografia anteparto\n2,66469,310010039,Parto normal\n8,41673,417010010,Anestesia Obstétrica p/ Cesariana\n9,40071,411010034,Parto cesariano\n17,16620,303100044,Tratamento de intercorrências clínicas na gravidez\n19,14229,417010028,Analgesia Obstétrica p/ Parto Normal\n20,13912,417010036,Anestesia Obstétrica p/ Cesariana em Gestação de Alto Risco\n21,13851,411010026,Parto cesariano em gestação de alto risco\n22,13844,211040010,Amnioscopia\n23,13676,205020143,Ultrassonografia obstétrica\n27,10154,411020013,Curetagem pos-abortamento/puerperal\n31,7884,310010047,Parto normal em gestação de alto risco\n36,6608,603030017,Imunoglobulina anti-Rh\n40,5341,205020186,Ultrassonografia transvaginal\n47,3969,205020151,Ultrassonografia obstétrica com doppler colorido e pulsado\n55,3747,205010059,Ultrassonografia doppler de fluxo obstétrico\n60,2653,411010042,Parto cesariano com laque

In [59]:
all_gb_proc = group_by_proc[group_by_proc.notna()].sort_values(ascending=False)
all_gb_proc_df = pd.DataFrame(data = all_gb_proc.values, columns = ['QTD'], index = all_gb_proc.index)
all_gb_proc_df.to_csv()

'SP_ATOPROF,QTD\n214010040,85083\n211040061,83083\n310010039,66469\n301010170,62649\n310010020,62406\n301010145,60989\n202031179,58202\n202120023,48572\n417010010,41673\n411010034,40071\n802010032,33988\n202020380,29324\n202020541,26692\n301010048,26554\n202120082,20846\n801010039,20772\n211070149,20134\n303100044,16620\n801010047,16510\n417010028,14229\n417010036,13912\n411010026,13851\n211040010,13844\n205020143,13676\n202031110,12217\n202010201,11777\n202050017,10400\n411020013,10154\n214010058,8475\n202031098,7967\n201020050,7916\n310010047,7884\n202010317,7304\n202030202,7128\n202010694,6923\n202030300,6651\n603030017,6608\n202120031,6248\n202010643,5942\n202010651,5808\n205020186,5341\n301100055,5274\n202030083,5182\n203020030,5008\n214010074,4869\n202010368,4490\n401010023,4300\n205020151,3969\n202020371,3958\n202020142,3947\n202010635,3887\n202010600,3868\n202020134,3843\n202120090,3806\n802010199,3793\n205010059,3747\n202010120,3521\n202020304,3425\n202080080,2777\n212010026,2