In [1]:
import numpy as np
import pandas as pd

In [2]:
drinks = pd.read_csv('drinks.csv')

In [3]:
def consumo(a):
    if a <= 75:
        return 'low'
    if a >= 151:
        return 'high'
    else:
        return 'middle'

In [4]:
# Apply aplica função ao longo do dataframe de acordo com a coluna indicada
drinks['consumo'] = drinks['beer_servings'].apply(consumo)

In [5]:
drinks.head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,population,continent,consumo
0,Afghanistan,0,0,0,0.0,34.660.000,Asia,low
1,Albania,89,132,54,4.9,2.876.000,Europe,middle
2,Algeria,25,0,14,0.7,40.061.000,Africa,low
3,Andorra,245,138,312,12.4,77.281.000,Europe,high
4,Angola,217,57,45,5.9,28.081.000,Africa,high


# Exportando Dataframes

In [7]:
# sem indexação
drinks.to_csv('drinks_consumo.csv', encoding='utf-8', index=False)

In [8]:
# com indexação
drinks.to_csv('drinks_indice.csv', encoding='utf-8', index=True)

In [9]:
# alterando o separador e indexação padrão
drinks.to_csv('drinks_default.csv', encoding='utf-8', sep='\t')

# Importando

In [13]:
# dataframe de trabalho
drinks = pd.read_csv('drinks_consumo.csv')
drinks.columns

Index(['country', 'beer_servings', 'spirit_servings', 'wine_servings',
       'total_litres_of_pure_alcohol', 'population', 'continent', 'consumo'],
      dtype='object')

In [None]:
# outros
drinks_indice = pd.read_csv('drinks_indice.csv')
drinks_default = pd.read_table('drinks_default.csv') # com tab é read_table

In [14]:
del drinks['population']

In [15]:
drinks1 = drinks.groupby('continent')

In [None]:
drinks1.groups
len(drinks_1.groups)

In [17]:
orderedDrinks = drinks1.size()

In [18]:
orderedDrinks

continent
Africa           53
Asia             44
Europe           45
North America    23
Oceania          16
South America    12
dtype: int64

In [19]:
orderedDrinks.sort_values(ascending=True)

continent
South America    12
Oceania          16
North America    23
Asia             44
Europe           45
Africa           53
dtype: int64

In [21]:
drinks2 = drinks.groupby(['continent','consumo'])

In [22]:
total = drinks2.size()
total.sort_values(ascending=False)
total

continent      consumo
Africa         high        6
               low        39
               middle      8
Asia           high        1
               low        36
               middle      7
Europe         high       30
               low         6
               middle      9
North America  high       10
               low         5
               middle      8
Oceania        high        4
               low        10
               middle      2
South America  high        8
               middle      4
dtype: int64

# Movimentando indices

In [23]:
drinks3 = drinks.set_index('consumo')

In [24]:
# group by por level
drinks3 = drinks3.groupby(level='consumo')

In [25]:
drinks3.sum()

Unnamed: 0_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
consumo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
high,14143,7789,6427,517.0
low,2269,3453,1033,179.7
middle,4077,4390,2084,213.7


In [26]:
drinks4 = drinks.set_index(['consumo','continent'])

In [29]:
drinks4 = drinks4.groupby(level = ['consumo', 'continent'])

In [30]:
drinks4.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
consumo,continent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
high,Africa,1495,294,272,39.3
high,Asia,247,326,73,11.5
high,Europe,7579,4190,4954,305.1
high,North America,2229,1827,362,78.7
high,Oceania,958,414,417,33.6
high,South America,1635,738,349,48.8
low,Africa,946,452,308,86.4
low,Asia,676,1246,271,42.1
low,Europe,125,439,293,15.4
low,North America,227,848,33,19.9


In [31]:
# Fazendo com numpy

In [32]:
drinks4.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
consumo,continent,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
high,Africa,1495,294,272,39.3
high,Asia,247,326,73,11.5
high,Europe,7579,4190,4954,305.1
high,North America,2229,1827,362,78.7
high,Oceania,958,414,417,33.6
high,South America,1635,738,349,48.8
low,Africa,946,452,308,86.4
low,Asia,676,1246,271,42.1
low,Europe,125,439,293,15.4
low,North America,227,848,33,19.9


In [34]:
drinks4.aggregate([np.sum, np.mean, np.size])

Unnamed: 0_level_0,Unnamed: 1_level_0,beer_servings,beer_servings,beer_servings,spirit_servings,spirit_servings,spirit_servings,wine_servings,wine_servings,wine_servings,total_litres_of_pure_alcohol,total_litres_of_pure_alcohol,total_litres_of_pure_alcohol
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,size,sum,mean,size,sum,mean,size,sum,mean,size
consumo,continent,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
high,Africa,1495,249.166667,6,294,49.0,6,272,45.333333,6,39.3,6.55,6.0
high,Asia,247,247.0,1,326,326.0,1,73,73.0,1,11.5,11.5,1.0
high,Europe,7579,252.633333,30,4190,139.666667,30,4954,165.133333,30,305.1,10.17,30.0
high,North America,2229,222.9,10,1827,182.7,10,362,36.2,10,78.7,7.87,10.0
high,Oceania,958,239.5,4,414,103.5,4,417,104.25,4,33.6,8.4,4.0
high,South America,1635,204.375,8,738,92.25,8,349,43.625,8,48.8,6.1,8.0
low,Africa,946,24.25641,39,452,11.589744,39,308,7.897436,39,86.4,2.215385,39.0
low,Asia,676,18.777778,36,1246,34.611111,36,271,7.527778,36,42.1,1.169444,36.0
low,Europe,125,20.833333,6,439,73.166667,6,293,48.833333,6,15.4,2.566667,6.0
low,North America,227,45.4,5,848,169.6,5,33,6.6,5,19.9,3.98,5.0


# Agrupamento por mais índices 

In [38]:
# No groupby level = 
# 1ºsetar o index
# 2ºfazer o groupby no index setado
drinks5 = drinks.set_index(['continent','consumo'])
drinks5 = drinks5.groupby(level=['continent','consumo'])
drinks5.sum().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
continent,consumo,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Africa,high,1495,294,272,39.3
Africa,low,946,452,308,86.4
Africa,middle,817,120,282,33.7
Asia,high,247,326,73,11.5
Asia,low,676,1246,271,42.1
Asia,middle,707,1105,55,41.9
Europe,high,7579,4190,4954,305.1
Europe,low,125,439,293,15.4
Europe,middle,1016,1336,1153,67.3
North America,high,2229,1827,362,78.7


In [44]:
# Deixando apenas a coluna categórica = país
drinks6 = drinks.set_index(['consumo','continent','beer_servings','spirit_servings','wine_servings','total_litres_of_pure_alcohol'])
drinks6 = drinks6.groupby(level=['consumo','continent'])
drinks6.count().head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,country
consumo,continent,Unnamed: 2_level_1
high,Africa,6
high,Asia,1
high,Europe,30
high,North America,10
high,Oceania,4
high,South America,8
low,Africa,39
low,Asia,36
low,Europe,6
low,North America,5


In [45]:
drinks7 = drinks.set_index(['consumo','continent','beer_servings','spirit_servings','wine_servings','total_litres_of_pure_alcohol'])
drinks7 = drinks7.groupby(level=['consumo','continent','beer_servings'])
drinks7.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,country
consumo,continent,beer_servings,Unnamed: 3_level_1
high,Africa,157,Seychelles
high,Africa,173,Botswana
high,Africa,217,Angola
high,Africa,225,South Africa
high,Africa,347,Gabon
high,Africa,376,Namibia
high,Asia,247,Russian Federation
high,Europe,152,Sweden
high,Europe,169,Norway
high,Europe,185,Switzerland
