**Objetivos**

Este capítulo abordará:
1. Operações `groupby` para agregar, transformar e filtrar dados.
2. Funções embutidas e funções de usuário personalizadas para realizar operações `groupby`.

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv("gapminder.tsv", sep = "\t")
df.head(5)

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [2]:
# agregação básica com agrupamento de uma única variável

# calcula a expectativa de vida média para cada ano
avg_life_exp_by_year = df.groupby("year")["lifeExp"].mean()
avg_life_exp_by_year

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [3]:
# obtém uma lista de anos únicos dos dados
years = df["year"].unique()
years

array([1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, 2002,
       2007], dtype=int64)

In [4]:
# obtém um subconjunto dos dados do ano 1952
y1952 = df.loc[df["year"] == 1952, :]
y1952.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
12,Albania,Europe,1952,55.23,1282697,1601.056136
24,Algeria,Africa,1952,43.077,9279525,2449.008185
36,Angola,Africa,1952,30.015,4232095,3520.610273
48,Argentina,Americas,1952,62.485,17876956,5911.315053


In [5]:
y1952_mean = y1952["lifeExp"].mean()
y1952_mean

49.05761971830987

In [6]:
# agrupa por continente e descreve cada grupo
continent_describe = df.groupby("continent")["lifeExp"].describe()
continent_describe

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Africa,624.0,48.86533,9.15021,23.599,42.3725,47.792,54.4115,76.442
Americas,300.0,64.658737,9.345088,37.579,58.41,67.048,71.6995,80.653
Asia,396.0,60.064903,11.864532,28.801,51.42625,61.7915,69.50525,82.603
Europe,360.0,71.903686,5.433178,43.585,69.57,72.241,75.4505,81.757
Oceania,24.0,74.326208,3.795611,69.12,71.205,73.665,77.5525,81.235


In [9]:
# usando agg ou aggregate + numpy

# calcula a expectativa de vida média por continente, mas usa a função np.mean
cont_le_agg = df.groupby("continent")["lifeExp"].agg(np.mean) # .aggregate(np.mean) faz o mesmo
cont_le_agg

continent
Africa      48.865330
Americas    64.658737
Asia        60.064903
Europe      71.903686
Oceania     74.326208
Name: lifeExp, dtype: float64

In [10]:
# criando minha própria função
def my_mean(values):
    """Compute the arithmetic mean of ``values`` by hand.

    Parameters
    ----------
    values : sized iterable of numbers
        Anything that supports ``len()`` and iteration, e.g. a list or
        the pandas Series that ``groupby(...).agg`` passes for each group.

    Returns
    -------
    The sum of the elements divided by ``len(values)``.
    """
    # the denominator is the number of observations
    count = len(values)

    # accumulate the numerator one element at a time, starting from 0
    total = 0
    for item in values:
        total += item

    return total / count

# passando a função para agg ou aggregate
agg_my_mean = df.groupby("year")["lifeExp"].agg(my_mean)
agg_my_mean

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [11]:
def my_mean_diff(values, diff_value):
    """Return the mean of ``values`` minus ``diff_value``.

    Parameters
    ----------
    values : sized iterable of numbers
        Anything that supports ``len()`` and iteration (list, Series, ...).
    diff_value : number
        The value subtracted from the computed mean.

    Returns
    -------
    ``sum(values) / len(values) - diff_value``, computed with an explicit loop.
    """
    count = len(values)

    # running total of the observations
    total = 0
    for item in values:
        total += item

    return total / count - diff_value

# calcula a média global da expectativa de vida
global_mean = df["lifeExp"].mean()
global_mean

59.47443936619714

In [13]:
# função de agregação personalizada com vários parâmetros
agg_mean_diff = df.groupby("year")["lifeExp"].agg(my_mean_diff, diff_value = global_mean)
agg_mean_diff

year
1952   -10.416820
1957    -7.967038
1962    -5.865190
1967    -3.796150
1972    -1.827053
1977     0.095718
1982     2.058758
1987     3.738173
1992     4.685899
1997     5.540237
2002     6.220483
2007     7.532983
Name: lifeExp, dtype: float64

In [15]:
# várias funções simultaneamente são passadas como uma lista para agg ou aggregate

# calcula o contador, a média e o DP de lifeExp por ano
gdf = df.groupby("year")["lifeExp"].agg([np.count_nonzero, np.mean, np.std])
gdf

Unnamed: 0_level_0,count_nonzero,mean,std
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,142,49.05762,12.225956
1957,142,51.507401,12.231286
1962,142,53.609249,12.097245
1967,142,55.67829,11.718858
1972,142,57.647386,11.381953
1977,142,59.570157,11.227229
1982,142,61.533197,10.770618
1987,142,63.212613,10.556285
1992,142,64.160338,11.22738
1997,142,65.014676,11.559439


In [16]:
# transformação

def zscore(x):
    """Standardize ``x``: subtract its mean and divide by its standard deviation.

    ``x`` must provide ``.mean()`` and ``.std()`` and support element-wise
    arithmetic (e.g. a pandas Series). For a pandas Series, ``.std()``
    defaults to the sample standard deviation (ddof=1).
    """
    center = x.mean()
    spread = x.std()
    return (x - center) / spread

transform_z = df.groupby("year")["lifeExp"].transform(zscore)
df.shape

(1704, 6)

In [17]:
transform_z.shape

(1704,)

In [18]:
transform_z

0      -1.656854
1      -1.731249
2      -1.786543
3      -1.848157
4      -1.894173
          ...   
1699   -0.081621
1700   -0.336974
1701   -1.574962
1702   -2.093346
1703   -1.948180
Name: lifeExp, Length: 1704, dtype: float64

In [20]:
transform_z.mean()

1.8334316850323887e-16

In [21]:
transform_z.std()

0.9967651731148638

In [22]:
# Import the scipy.stats module instead of `from scipy.stats import zscore`:
# the latter rebinds the name `zscore` and silently shadows the function
# defined earlier in this notebook, so re-running the earlier cell would
# produce different numbers.
from scipy import stats

# z-score computed within each year group.
# Note: scipy's zscore defaults to ddof=0 (population standard deviation),
# so its values differ slightly from a pandas `.std()`-based z-score (ddof=1).
sp_z_grouped = df.groupby("year")["lifeExp"].transform(stats.zscore)

# z-score computed over the whole column, without grouping
sp_z_nogroup = stats.zscore(df["lifeExp"])

In [23]:
# escore z com agrupamento
transform_z.head()

0   -1.656854
1   -1.731249
2   -1.786543
3   -1.848157
4   -1.894173
Name: lifeExp, dtype: float64

In [24]:
# escore z com agrupamento usando scipy
sp_z_grouped.head()

0   -1.662719
1   -1.737377
2   -1.792867
3   -1.854699
4   -1.900878
Name: lifeExp, dtype: float64

In [25]:
# escore z sem agrupamento
sp_z_nogroup[:5]

0   -2.375334
1   -2.256774
2   -2.127837
3   -1.971178
4   -1.811033
Name: lifeExp, dtype: float64

In [3]:
import seaborn as sns

# define a semente para que os resultados sejam determinísticos
np.random.seed(42)

# amostra 10 linhas de tips
tips_10 = sns.load_dataset("tips").sample(10)

# randomly pick 4 "total_bill" values and turn them into missing values
# (use np.nan: the np.NaN alias was removed in NumPy 2.0)
tips_10.loc[np.random.permutation(tips_10.index)[:4],
           "total_bill"] = np.nan

tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,,2.0,Male,No,Sun,Dinner,4
211,,5.16,Male,Yes,Sat,Dinner,4
198,,2.0,Female,Yes,Thur,Lunch,2
176,,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [28]:
# valores não ausentes por sex
count_sex = tips_10.groupby("sex").count()
count_sex

Unnamed: 0_level_0,total_bill,tip,smoker,day,time,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Male,4,7,7,7,7,7
Female,2,3,3,3,3,3


In [29]:
# média agrupada
def fill_na_mean(x):
    """Return ``x`` with its missing values replaced by the mean of ``x``.

    ``x`` must provide ``.mean()`` and ``.fillna()`` (e.g. a pandas Series).
    The result of ``x.fillna(...)`` is returned.
    """
    group_mean = x.mean()
    filled = x.fillna(group_mean)
    return filled

# calcula um "total_bill" médio por sex
total_bill_group_mean = tips_10.groupby("sex")["total_bill"].transform(fill_na_mean)

# faz a atribuição a uma nova coluna nos dados originais;
# você também pode substituir a coluna original usando "total_bill"
tips_10["fill_total_bill"] = total_bill_group_mean
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,fill_total_bill
24,19.82,3.18,Male,No,Sat,Dinner,2,19.82
6,8.77,2.0,Male,No,Sun,Dinner,2,8.77
153,,2.0,Male,No,Sun,Dinner,4,17.9525
211,,5.16,Male,Yes,Sat,Dinner,4,17.9525
198,,2.0,Female,Yes,Thur,Lunch,2,13.93
176,,2.0,Male,Yes,Sun,Dinner,2,17.9525
192,28.44,2.56,Male,Yes,Thur,Lunch,2,28.44
124,12.48,2.52,Female,No,Thur,Lunch,2,12.48
9,14.78,3.23,Male,No,Sun,Dinner,2,14.78
101,15.38,3.0,Female,Yes,Fri,Dinner,2,15.38


In [30]:
tips_10[["sex", "total_bill", "fill_total_bill"]]

Unnamed: 0,sex,total_bill,fill_total_bill
24,Male,19.82,19.82
6,Male,8.77,8.77
153,Male,,17.9525
211,Male,,17.9525
198,Female,,13.93
176,Male,,17.9525
192,Male,28.44,28.44
124,Female,12.48,12.48
9,Male,14.78,14.78
101,Female,15.38,15.38


In [4]:
# carrega o conjunto de dados tips
tips = sns.load_dataset("tips")

# observa o número de linhas nos dados originais
tips.shape

(244, 7)

In [5]:
# observa os contadores de frequência para o tamanho da mesa
tips["size"].value_counts()

2    156
3     38
4     37
5      5
1      4
6      4
Name: size, dtype: int64

In [6]:
# filtra os dados de modo que cada grupo tenha pelo menos 30 observações
tips_filtered = tips.groupby("size").filter(lambda x: x["size"].count() >= 30)
tips_filtered.shape

(231, 7)

In [7]:
tips_filtered["size"].value_counts()

2    156
3     38
4     37
Name: size, dtype: int64

In [8]:
tips_10 = sns.load_dataset("tips").sample(10, random_state = 42)
tips_10

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
24,19.82,3.18,Male,No,Sat,Dinner,2
6,8.77,2.0,Male,No,Sun,Dinner,2
153,24.55,2.0,Male,No,Sun,Dinner,4
211,25.89,5.16,Male,Yes,Sat,Dinner,4
198,13.0,2.0,Female,Yes,Thur,Lunch,2
176,17.89,2.0,Male,Yes,Sun,Dinner,2
192,28.44,2.56,Male,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
9,14.78,3.23,Male,No,Sun,Dinner,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [9]:
# salva somente o objeto agrupado
grouped = tips_10.groupby("sex")

# observe que temos apenas o objeto e seu endereço de memória
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000002D679F5730>

In [10]:
# vê realmente os grupos de groupby;
# somente o índice é devolvido
grouped.groups

{'Male': [24, 6, 153, 211, 176, 192, 9], 'Female': [198, 124, 101]}

In [11]:
# compute the mean of the numeric columns only; without numeric_only=True,
# pandas >= 2.0 raises a TypeError on the non-numeric columns
# (smoker, day, time) instead of silently dropping them
avgs = grouped.mean(numeric_only=True)
avgs

Unnamed: 0_level_0,total_bill,tip,size
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Male,20.02,2.875714,2.571429
Female,13.62,2.506667,2.0


In [12]:
# lista todas as colunas
tips_10.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [13]:
# obtém o grupo "Female"
female = grouped.get_group("Female")
female

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
198,13.0,2.0,Female,Yes,Thur,Lunch,2
124,12.48,2.52,Female,No,Thur,Lunch,2
101,15.38,3.0,Female,Yes,Fri,Dinner,2


In [14]:
for sex_group in grouped:
    print(sex_group)

('Male',      total_bill   tip   sex smoker   day    time  size
24        19.82  3.18  Male     No   Sat  Dinner     2
6          8.77  2.00  Male     No   Sun  Dinner     2
153       24.55  2.00  Male     No   Sun  Dinner     4
211       25.89  5.16  Male    Yes   Sat  Dinner     4
176       17.89  2.00  Male    Yes   Sun  Dinner     2
192       28.44  2.56  Male    Yes  Thur   Lunch     2
9         14.78  3.23  Male     No   Sun  Dinner     2)
('Female',      total_bill   tip     sex smoker   day    time  size
198       13.00  2.00  Female    Yes  Thur   Lunch     2
124       12.48  2.52  Female     No  Thur   Lunch     2
101       15.38  3.00  Female    Yes   Fri  Dinner     2)


In [15]:
# trabalhando com MultiIndex

intv_df = pd.read_csv("epi_sim.txt")
intv_df.shape

(9434653, 6)

In [16]:
intv_df.head()

Unnamed: 0,ig_type,intervened,pid,rep,sid,tr
0,3,40,294524448,1,201,0.000135
1,3,40,294571037,1,201,0.000135
2,3,40,290699504,1,201,0.000135
3,3,40,288354895,1,201,0.000135
4,3,40,292271290,1,201,0.000135


In [17]:
count_only = intv_df.groupby(["rep", "intervened", "tr"])["ig_type"].count()
count_only.head(n = 10)

rep  intervened  tr      
0    8           0.000166    1
     9           0.000152    3
                 0.000166    1
     10          0.000152    1
                 0.000166    1
     12          0.000152    3
                 0.000166    5
     13          0.000152    1
                 0.000166    3
     14          0.000152    3
Name: ig_type, dtype: int64