### Imports

In [3]:
import numpy as np
import pandas as pd

### Read Data Sets

In [4]:
types = {'SP_NAIH': str, 'SP_ATOPROF': str, 'SP_CIDPRI': str, 'SP_QTD_ATO': np.int32}

#### March

In [17]:
df_sp_mar = pd.read_csv('../../SPSP/SPSP1803.csv', sep=',', usecols=types.keys(), dtype=types)
df_sp_mar.head()

Unnamed: 0,SP_NAIH,SP_ATOPROF,SP_QTD_ATO,SP_CIDPRI
0,3518100401101,303010061,1,A09
1,3518100401101,301010170,8,A09
2,3518100401101,301010170,1,A09
3,3518100401101,301010170,1,A09
4,3518100401101,802010040,1,A09


In [8]:
df_sp_mar.shape

(3081817, 2)

#### April

In [18]:
df_sp_apr = pd.read_csv('../../SPSP/SPSP1804.csv', sep=',', usecols=types.keys(), dtype=types)
df_sp_apr.head()

Unnamed: 0,SP_NAIH,SP_ATOPROF,SP_QTD_ATO,SP_CIDPRI
0,3518102098456,303010061,1,A09
1,3518102098456,802010083,5,A09
2,3518102098456,206030010,1,A09
3,3518102098456,206030037,1,A09
4,3518102098456,603070019,1,A09


In [9]:
aih_types = {'N_AIH': str, 'SEXO': np.int32, 'IDADE': np.int32, 'INSTRU': np.int32}

#### RD

In [23]:
df_rd = pd.read_csv('../../AIH/sih_sus_m_10_49.csv', sep='|', usecols=aih_types.keys(), dtype=aih_types)
df_rd.head()

Unnamed: 0,N_AIH,SEXO,IDADE,INSTRU
0,3518101448609,3,10,0
1,3518101279825,3,47,0
2,3518101457475,3,25,0
3,3518101457662,3,49,0
4,3518101525246,3,14,0


### Concatenate March and April

In [20]:
df_concat_sp_mar_apr = pd.concat([df_sp_mar, df_sp_apr])
df_concat_sp_mar_apr.shape

(6170520, 4)

In [21]:
df_sp_mar.shape[0] + df_sp_apr.shape[0]

6170520

In [22]:
del df_sp_mar
del df_sp_apr

### Left Merge between SP and RD datasets

In [24]:
# merged_dfs = sp_df.merge(o_cids, left_on='SP_CIDPRI', right_on='code', how='left').drop('code', axis=1)
merged_sp_rd = df_concat_sp_mar_apr.merge(df_rd, left_on='SP_NAIH', right_on='N_AIH', how='left').drop('N_AIH', axis=1)

In [25]:
merged_sp_rd.shape

(6267475, 7)

In [26]:
merged_sp_rd = merged_sp_rd[merged_sp_rd['SEXO'].notna()]
merged_sp_rd.shape

(1950224, 7)

### Group by AIH Number

In [31]:
group_by_aih = merged_sp_rd.groupby('SP_NAIH').count()['SP_ATOPROF']
group_by_aih

SP_NAIH
3508100259059     72
3508100259081     64
3508100259125    288
3508100259235     24
3508100259290     64
                ... 
9918300002886      4
9918300002908      4
9918300002919     16
9918300002920      4
9918300002996      6
Name: SP_ATOPROF, Length: 142622, dtype: int64

In [41]:
group_by_aih.isna().sum()

0

In [33]:
group_by_aih.describe()

count    142622.000000
mean         13.674076
std          16.703053
min           1.000000
25%           7.000000
50%          12.000000
75%          16.000000
max        1248.000000
Name: SP_ATOPROF, dtype: float64

In [46]:
group_by_aih[group_by_aih.where(group_by_aih > 500).notna()] 

SP_NAIH
3512106003340     576
3512114842379     704
3513116296085    1183
3513121638940     640
3514104833668     768
3514106583086     686
3515112983833     960
3516108690590    1152
3516108690611    1216
3516115989606    1216
3516115989617    1248
3516115989639    1200
3516118590853    1008
3516123381639     896
3517107560835     912
3517107562089    1120
3517107562782     576
3517107562881     736
3517107563827    1088
3517121081628     532
Name: SP_ATOPROF, dtype: int64

### Group by ATO PROF

In [53]:
group_by_proc = merged_sp_rd.groupby('SP_ATOPROF').count()['SP_NAIH']
group_by_proc

SP_ATOPROF
0101020066      6
0101020074      2
0101020090      1
0201010011      5
0201010020     16
             ... 
0802010229     18
0802010253     77
0802010261     20
0802010270     22
0802020011    106
Name: SP_NAIH, Length: 2254, dtype: int64

In [54]:
group_by_proc.describe()

count      2254.000000
mean        865.228039
std        5584.232565
min           1.000000
25%           6.000000
50%          24.000000
75%         119.500000
max      152444.000000
Name: SP_NAIH, dtype: float64

In [57]:
group_by_proc[group_by_proc.where(group_by_proc > 119).notna()].sort_values()

SP_ATOPROF
0408060719       120
0202020207       120
0205010024       120
0408050659       120
0405020015       121
               ...  
0301010145     62094
0310010020     63537
0310010039     66622
0301010048     71885
0301010170    152444
Name: SP_NAIH, Length: 564, dtype: int64