# Aula 3 - exemplos

## Módulos

In [None]:
import numpy as np
import pandas as pd

## Carregar dados

In [None]:
# Load the country dataset: fields are separated by ';', decimals use ',' and
# the file is read with the Latin-1 encoding.
dados_paises = pd.read_csv(
    'dados_1997_2011_paises_csv.csv',
    sep=';',
    decimal=',',
    encoding='latin1',
)

## Métricas

### min, max, mean - Slide 6

In [None]:
# Minimum, maximum and mean of `idh` for each value of `pais`,
# returned as a flat frame (one row per country).
(
    dados_paises
    .groupby('pais')
    .agg(
        min_idh=('idh', 'min'),
        max_idh=('idh', 'max'),
        media_idh=('idh', 'mean'),
    )
    .reset_index()
)

Unnamed: 0,pais,min_idh,max_idh,media_idh
0,Alemanha,0.8466,0.905,0.88288
1,Austrália,0.8958,0.929,0.914507
2,Brasil,0.6464,0.718,0.684653
3,Bélgica,0.8628,0.886,0.876107
4,Canadá,0.8736,0.908,0.89048
5,China,0.5598,0.687,0.62524
6,Cingapura,0.7404,0.866,0.821587
7,Coréia,0.8078,0.897,0.856907
8,Espanha,0.8162,0.878,0.852427
9,Estados Unidos,0.8886,0.913,0.900947


### Moda - Slide 7

In [None]:
# Mode of `idh` per country. `Series.mode()` may return several tied values;
# only the first one it returns is kept.
(
    dados_paises
    .groupby('pais')['idh']
    .apply(lambda valores: valores.mode().iloc[0])
    .to_frame()
    .reset_index()
)

Unnamed: 0,pais,idh
0,Alemanha,0.8466
1,Austrália,0.8958
2,Brasil,0.6464
3,Bélgica,0.8628
4,Canadá,0.903
5,China,0.5598
6,Cingapura,0.7404
7,Coréia,0.8078
8,Espanha,0.8162
9,Estados Unidos,0.8886


### Mediana - Slide 9

In [None]:
# Median of `idh` for each country, one row per country.
(
    dados_paises
    .groupby('pais')
    .agg(median_idh=('idh', 'median'))
    .reset_index()
)

Unnamed: 0,pais,median_idh
0,Alemanha,0.8888
1,Austrália,0.9156
2,Brasil,0.6866
3,Bélgica,0.8754
4,Canadá,0.8894
5,China,0.624
6,Cingapura,0.8282
7,Coréia,0.8588
8,Espanha,0.8534
9,Estados Unidos,0.901


### Percentile

- Exemplo numpy - Slide 13

- O numpy utiliza a interpolação linear (implementada abaixo em `my_quantile`) como default

In [None]:
# Reference: https://stackoverflow.com/questions/60467081/linear-interpolation-in-numpy-quantile

def my_quantile(array, q):
    """Return the ``q``-quantile of ``array`` using linear interpolation.

    The quantile sits at the fractional position ``(n - 1) * q`` of the sorted
    values. When that position is a whole number the element at that position
    is returned unchanged; otherwise the result is interpolated linearly
    between the two neighbouring elements.

    Parameters
    ----------
    array : sequence of numbers
        Sample values. They are sorted internally, so any order is accepted.
    q : float
        Quantile to compute, between 0 and 1 inclusive.

    Returns
    -------
    number
        The element itself when the position is exact, otherwise the
        interpolated value.

    Raises
    ------
    ValueError
        If ``q`` is outside ``[0, 1]`` or ``array`` is empty.
    """
    if not 0 <= q <= 1:
        raise ValueError('q must be in the range [0, 1]')

    ordered = sorted(array)
    n = len(ordered)
    if n == 0:
        raise ValueError('array must not be empty')

    index = (n - 1) * q
    # `index` is a float whenever `q` is a float, so it must be truncated to an
    # int before being used as a sequence index.
    left = int(index)
    fraction = index - left
    if fraction == 0:  # has no fractional part
        return ordered[left]

    # fraction > 0 implies index < n - 1, so `left + 1` is always in range.
    i, j = ordered[left], ordered[left + 1]
    return i + (j - i) * fraction

In [None]:
# Small sample (listed in ascending order) used in the quantile example.
dados = [
    1, 1, 1,
    2, 2,
    3,
    4, 4,
    7, 8, 9,
]

In [None]:
# 0.35-quantile of the sample; no `method` is passed, so numpy's default
# interpolation is used.
np.quantile(dados, q=0.35)

2.0

- Exemplo pandas - Slide 14

In [None]:
# 5th and 95th percentiles of `idh` per country, in long format
# (one row per country/percentile pair). The index level holding the quantile
# comes out of `reset_index()` as 'level_1' and is renamed to 'percentil'.
(
    dados_paises
    .groupby('pais')['idh']
    .apply(lambda valores: valores.quantile([0.05, 0.95]))
    .to_frame()
    .reset_index()
    .rename(columns={'level_1': 'percentil'})
)

Unnamed: 0,pais,percentil,idh
0,Alemanha,0.05,0.85066
1,Alemanha,0.95,0.9036
2,Austrália,0.05,0.89818
3,Austrália,0.95,0.9276
4,Brasil,0.05,0.65074
5,Brasil,0.95,0.7159
6,Bélgica,0.05,0.86588
7,Bélgica,0.95,0.8853
8,Canadá,0.05,0.87486
9,Canadá,0.95,0.9073


- Ajustes adicionais

In [None]:
# Same 5th/95th percentile table as displayed above, stored for reshaping.
percentis = (
    dados_paises
    .groupby('pais')['idh']
    .apply(lambda valores: valores.quantile([0.05, 0.95]))
    .to_frame()
    .reset_index()
    .rename(columns={'level_1': 'percentil'})
)

In [None]:
# Reshape to wide format: one row per country, one column per percentile,
# with the numeric quantile labels replaced by readable column names.
(
    percentis
    .pivot(index='pais', columns='percentil', values='idh')
    .reset_index()
    .rename(columns={0.05: 'p_5', 0.95: 'p_95'})
)

percentil,pais,p_5,p_95
0,Alemanha,0.85066,0.9036
1,Austrália,0.89818,0.9276
2,Brasil,0.65074,0.7159
3,Bélgica,0.86588,0.8853
4,Canadá,0.87486,0.9073
5,China,0.56638,0.6835
6,Cingapura,0.75454,0.8646
7,Coréia,0.81298,0.8949
8,Espanha,0.82152,0.8766
9,Estados Unidos,0.89056,0.9095


### Quartis

In [None]:
# Quartiles (25%, 50%, 75%) of `idh` per country, in long format
# (one row per country/quantile pair).
(
    dados_paises
    .groupby('pais')['idh']
    .apply(lambda valores: valores.quantile([0.25, 0.5, 0.75]))
    .to_frame()
    .reset_index()
    .rename(columns={'level_1': 'percentil'})
)

Unnamed: 0,pais,percentil,idh
0,Alemanha,0.25,0.8671
1,Alemanha,0.5,0.8888
2,Alemanha,0.75,0.9005
3,Austrália,0.25,0.9072
4,Austrália,0.5,0.9156
5,Austrália,0.75,0.923
6,Brasil,0.25,0.6677
7,Brasil,0.5,0.6866
8,Brasil,0.75,0.7025
9,Bélgica,0.25,0.8733


- Ajustes adicionais

In [None]:
# Same quartile table as displayed above, stored for reshaping.
quartis = (
    dados_paises
    .groupby('pais')['idh']
    .apply(lambda valores: valores.quantile([0.25, 0.5, 0.75]))
    .to_frame()
    .reset_index()
    .rename(columns={'level_1': 'percentil'})
)

In [None]:
# Reshape to wide format: one row per country, one column per quartile.
# The median (0.5) is labelled 'p_50'; labelling it 'p_5' would clash with the
# name used for the 5th percentile elsewhere in this notebook.
(
    quartis
    .pivot(index='pais', columns='percentil', values='idh')
    .reset_index()
    .rename(columns={0.25: 'p_25', 0.5: 'p_50', 0.75: 'p_75'})
)

percentil,pais,p_25,p_5,p_75
0,Alemanha,0.8671,0.8888,0.9005
1,Austrália,0.9072,0.9156,0.923
2,Brasil,0.6677,0.6866,0.7025
3,Bélgica,0.8733,0.8754,0.881
4,Canadá,0.8803,0.8894,0.9015
5,China,0.5925,0.624,0.6605
6,Cingapura,0.8044,0.8282,0.8525
7,Coréia,0.8336,0.8588,0.8835
8,Espanha,0.8408,0.8534,0.8685
9,Estados Unidos,0.8975,0.901,0.9055


### Desvio - Slide 20

In [None]:
# Mean `idh` per country, kept as a lookup table with columns
# `pais` and `media_idh`.
media_paises = (
    dados_paises
    .groupby('pais')
    .agg(media_idh=('idh', 'mean'))
    .reset_index()
)

In [None]:
# Attach each country's mean `idh` to every row of the dataset.
# A `media_idh` column left over from a previous run of this cell is dropped
# first; otherwise re-running the cell would create `media_idh_x` /
# `media_idh_y` and break the cells that read `media_idh`.
dados_paises = (
    dados_paises
    .drop(columns=['media_idh'], errors='ignore')
    .merge(media_paises, on='pais', how='left')
)

In [None]:
# Signed deviation of each observation from its country's mean `idh`.
dados_paises['desvio_idh'] = dados_paises['idh'].sub(dados_paises['media_idh'])

In [None]:
# Show only the columns relevant to the deviation example.
dados_paises.loc[:, ['pais', 'ano', 'idh', 'desvio_idh']]

Unnamed: 0,pais,ano,idh,desvio_idh
0,África do Sul,1997,0.6328,0.02056
1,África do Sul,1998,0.6272,0.01496
2,África do Sul,1999,0.6216,0.00936
3,África do Sul,2000,0.6160,0.00376
4,África do Sul,2001,0.6126,0.00036
...,...,...,...,...
280,Reino Unido,2007,0.8560,0.00956
281,Reino Unido,2008,0.8600,0.01356
282,Reino Unido,2009,0.8600,0.01356
283,Reino Unido,2010,0.8620,0.01556


### Desvio médio - Slide 21

In [None]:
# Mean of the signed deviations per country. Positive and negative deviations
# from a group's own mean cancel out, so values are ~0 up to floating-point error.
(
    dados_paises
    .groupby('pais')
    .agg(desvio_medio_idh=('desvio_idh', 'mean'))
    .reset_index()
)

Unnamed: 0,pais,desvio_medio_idh
0,Alemanha,1.258253e-16
1,Austrália,1.110223e-16
2,Brasil,-2.2204460000000003e-17
3,Bélgica,8.141636e-17
4,Canadá,-3.700743e-17
5,China,1.036208e-16
6,Cingapura,8.881784000000001e-17
7,Coréia,8.141636e-17
8,Espanha,0.0
9,Estados Unidos,1.184238e-16


### Desvio médio absoluto - Slide 23

In [None]:
# Mean absolute deviation of `idh` per country: the average of |x - mean(x)|
# within each group. It is computed explicitly because the 'mad' aggregation
# depends on `Series.mad`, which was deprecated in pandas 1.5 and removed in 2.0.
(
    dados_paises
    .groupby('pais')
    .agg(
        desvio_medio_abs_idh=pd.NamedAgg(
            'idh',
            lambda valores: (valores - valores.mean()).abs().mean(),
        )
    )
    .reset_index()
)

Unnamed: 0,pais,desvio_medio_abs_idh
0,Alemanha,0.017301
1,Austrália,0.00874
2,Brasil,0.018983
3,Bélgica,0.004848
4,Canadá,0.010219
5,China,0.035243
6,Cingapura,0.029934
7,Coréia,0.025273
8,Espanha,0.015732
9,Estados Unidos,0.005124


### Variância e Desvio Padrão - Slide 25

In [None]:
# Variance and standard deviation of `idh` per country, using pandas'
# default settings for 'var' and 'std'.
(
    dados_paises
    .groupby('pais')
    .agg(
        variancia_idh=('idh', 'var'),
        dp_idh=('idh', 'std'),
    )
    .reset_index()
)

Unnamed: 0,pais,variancia_idh,dp_idh
0,Alemanha,0.000408,0.020187
1,Austrália,0.00011,0.010485
2,Brasil,0.000517,0.022727
3,Bélgica,4.1e-05,0.006401
4,Canadá,0.000142,0.011899
5,China,0.001758,0.041928
6,Cingapura,0.001443,0.037984
7,Coréia,0.000886,0.029773
8,Espanha,0.000372,0.019299
9,Estados Unidos,4.3e-05,0.006558


In [None]:
# List the columns now present in the frame, including the `media_idh` and
# `desvio_idh` columns added in the deviation section.
dados_paises.columns

Index(['pais', 'ano', 'idh', 'corrupcao_indice', 'competitividade_indice',
       'globalizacao_indice', 'pib', 'populacao', 'media_idh', 'desvio_idh'],
      dtype='object')