In [1]:
import pandas as pd
%matplotlib inline

In [2]:
# Read the SAS file into a DataFrame. Passing an encoding makes pandas decode
# text columns to str; without one they are returned as raw bytes.
df = pd.read_sas(
    'data/rata_s_srs_an.sas7bdat',
    encoding='iso8859-1',
)

In [3]:
# Peek at the first rows of the loaded frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,COD_PRDA_RATE,IMP_APL_FINZ,IMP_CPT_FINZ,NUM_RATE,IMP_RATE_MENS,IMP_MAX_UNPAIDINS,Contract_ID
0,M,950000.0,950000.0,24.0,41310.0,,S_000001
1,M,50000.0,50000.0,12.0,4611.0,,S_000003
2,M,100000.0,100000.0,42.0,2960.0,0.0,S_000004
3,M,3600434.0,2324160.0,49.0,48420.0,60143.0,S_000005
4,M,211776.0,211776.0,85.0,2930.0,1256.0,S_000006


In [4]:
# List the distinct values of COD_PRDA_RATE, the column used as grouping key below.
df['COD_PRDA_RATE'].unique()

array(['M', 'V', 'A', 'T', 'N', 'S'], dtype=object)

**Goal:** Compute summary statistics (or other operations) for each group in `COD_PRDA_RATE`.

In [5]:
# Mean of IMP_CPT_FINZ and IMP_APL_FINZ within each COD_PRDA_RATE group.
# Several columns must be selected with a list (double brackets): the
# tuple-style selection `gb['a', 'b']` was deprecated and raises in pandas >= 2.0.
df_avg = df.groupby('COD_PRDA_RATE')[['IMP_CPT_FINZ', 'IMP_APL_FINZ']].mean()

**IMPORTANT:** To be sure that we really create a copy (and not a reference to the original data), we can add `.copy()` to the selection, for example:

`small_df = df[['IMP_CPT_FINZ', 'IMP_APL_FINZ']].copy()`

In [6]:
# Display the per-group means stored in df_avg.
df_avg

Unnamed: 0_level_0,IMP_CPT_FINZ,IMP_APL_FINZ
COD_PRDA_RATE,Unnamed: 1_level_1,Unnamed: 2_level_1
A,494849.6,
M,201167.4,235536.0
N,6465252.0,10564740.0
S,1223761.0,1408094.0
T,3653297.0,3536394.0
V,212637.2,259355.5


In [7]:
import numpy as np

In [8]:
def my_func(array):
    """Return the sum of the values in ``array``.

    Serves as an example of a user-defined reducer that can be handed to
    ``GroupBy.agg``; the actual computation is delegated to ``np.sum``.
    """
    total = np.sum(array)
    return total

In [9]:
# Aggregate per (COD_PRDA_RATE, NUM_RATE) pair, applying a different set of
# statistics to each column: mean and std for IMP_CPT_FINZ, and the custom
# my_func reducer for IMP_APL_FINZ.
# The columns are selected with a list; tuple-style selection on a GroupBy
# (`gb['a', 'b']`) raises in pandas >= 2.0.
df.groupby(['COD_PRDA_RATE', 'NUM_RATE'])[['IMP_CPT_FINZ', 'IMP_APL_FINZ']].agg({
    'IMP_CPT_FINZ': ['mean', 'std'],
    'IMP_APL_FINZ': my_func,
})

Unnamed: 0_level_0,Unnamed: 1_level_0,IMP_CPT_FINZ,IMP_CPT_FINZ,IMP_APL_FINZ
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,my_func
COD_PRDA_RATE,NUM_RATE,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,5.0,1.059000e+06,,0.0
A,10.0,1.000000e+05,,0.0
A,16.0,1.644646e+06,,0.0
A,48.0,3.000000e+04,,0.0
A,51.0,3.000000e+05,,0.0
A,67.0,1.000000e+05,,0.0
A,77.0,1.200000e+05,,0.0
A,78.0,1.000000e+06,,0.0
A,88.0,1.000000e+05,,0.0
M,1.0,5.020890e+04,4.410830e+05,191320453.0


In [10]:
# Recall the column names available in df.
df.columns

Index(['COD_PRDA_RATE', 'IMP_APL_FINZ', 'IMP_CPT_FINZ', 'NUM_RATE',
       'IMP_RATE_MENS', 'IMP_MAX_UNPAIDINS', 'Contract_ID'],
      dtype='object')

### Percentage of total
- **Goal**: For each `NUM_RATE` value, calculate what percentage of its rows falls into each `COD_PRDA_RATE` category, so that every row of the result sums to 100.

In [21]:
# Count table: NUM_RATE on the rows, COD_PRDA_RATE on the columns; each cell
# holds the number of non-null IMP_CPT_FINZ entries for that combination.
cod_num = pd.pivot_table(
    df,
    values='IMP_CPT_FINZ',
    index='NUM_RATE',
    columns='COD_PRDA_RATE',
    aggfunc='count',
)

In [22]:
# Row totals: number of non-null IMP_CPT_FINZ entries for every NUM_RATE value.
num = (
    df.groupby('NUM_RATE')['IMP_CPT_FINZ']
      .count()
)

In [20]:
# Divide each row of counts by the total for its NUM_RATE (aligned on the
# index), then scale the resulting fractions to percentages.
cod_num.div(num, axis=0, level='NUM_RATE').mul(100)

COD_PRDA_RATE,A,M,N,S,T,V
NUM_RATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,,59.891540,0.010846,,,40.097614
2.0,,64.524422,,,,35.475578
3.0,,77.209302,0.465116,,,22.325581
4.0,,83.041401,,,,16.958599
5.0,0.056625,90.147225,0.283126,,0.056625,9.456399
6.0,,91.819292,0.040700,,,8.140008
7.0,,78.468368,,0.110988,,21.420644
8.0,,79.346867,,0.176523,,20.476611
9.0,,94.163424,,,,5.836576
10.0,0.005700,98.495298,0.011399,0.011399,,1.476204


In [24]:
# The same percentages in a single call: normalize='index' makes crosstab
# divide every row by its total, and combinations that never occur show up
# as 0 instead of NaN.
pd.crosstab(
    index=df['NUM_RATE'],
    columns=df['COD_PRDA_RATE'],
    values=df['IMP_CPT_FINZ'],
    aggfunc='count',
    normalize='index',
    dropna=False,
).mul(100)

COD_PRDA_RATE,A,M,N,S,T,V
NUM_RATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,0.000000,59.891540,0.010846,0.000000,0.000000,40.097614
2.0,0.000000,64.524422,0.000000,0.000000,0.000000,35.475578
3.0,0.000000,77.209302,0.465116,0.000000,0.000000,22.325581
4.0,0.000000,83.041401,0.000000,0.000000,0.000000,16.958599
5.0,0.056625,90.147225,0.283126,0.000000,0.056625,9.456399
6.0,0.000000,91.819292,0.040700,0.000000,0.000000,8.140008
7.0,0.000000,78.468368,0.000000,0.110988,0.000000,21.420644
8.0,0.000000,79.346867,0.000000,0.176523,0.000000,20.476611
9.0,0.000000,94.163424,0.000000,0.000000,0.000000,5.836576
10.0,0.005700,98.495298,0.011399,0.011399,0.000000,1.476204
