# Groupby

The `groupby` method groups rows of data together so that aggregate functions (such as sum, median, mean, count, etc.) can be applied to each group.
___

In [1]:
import pandas as pd

# Raw example data: one list per column, six rows in total.
data = {
    'Company': ['Soup', 'Soup', 'Kebab', 'Kebab', 'Nuts', 'Nuts'],
    'Region': ['A', 'C', 'A', 'B', 'B', 'C'],
    'Sales': [399, 245, 99, 731, 15, 1047],
}

In [2]:
# Build a DataFrame from the `data` dict (each key becomes a column).
df = pd.DataFrame(data)

In [3]:
# Display the DataFrame.
df

Unnamed: 0,Company,Region,Sales
0,Soup,A,399
1,Soup,C,245
2,Kebab,A,99
3,Kebab,B,731
4,Nuts,B,15
5,Nuts,C,1047


**Split: The .groupby() method groups multiple rows together based on the values in a column.**

In [4]:
# Group the rows by 'Company'. This returns a DataFrameGroupBy object;
# no aggregate function has been applied yet:
df.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E0FF46E0A0>

In [5]:
# Keep the grouped object in a variable so it can be reused:
a = df.groupby('Company')

In [18]:
# Select the 'Sales' column of the grouped object, then call an aggregate
# function (in this case, mean) on it.
# Note: the first positional argument of mean() is `numeric_only`, not a
# column name, so the column must be selected explicitly with [[...]].
a[['Sales']].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Kebab,415.0
Nuts,531.0
Soup,322.0


In [19]:
# Call an aggregate function (sum) on the 'Sales' column.
# The column is selected explicitly; sum()'s first positional argument is
# `numeric_only`, not a column name.
a[['Sales']].sum()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
Kebab,830
Nuts,1062
Soup,644


In [28]:
# An aggregate function + .loc[]: total 'Sales' per company, then pick the
# row for 'Soup'. The column is selected explicitly because sum() does not
# take a column name as its argument.
a[['Sales']].sum().loc['Soup']

Sales    644
Name: Soup, dtype: int64

In [29]:
# The same steps as a single line of code: group by 'Company', select the
# 'Sales' column, take the mean, and pick the row for 'Nuts'.
df.groupby('Company')[['Sales']].mean().loc['Nuts']

Sales    531.0
Name: Nuts, dtype: float64

**Other aggregate functions:**

In [35]:
# Count the number of non-null entries in each column, per company
df.groupby('Company').count()

Unnamed: 0_level_0,Region,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Kebab,2,2
Nuts,2,2
Soup,2,2


In [33]:
# Minimum of each column per company: the alphabetically first value for
# text columns ('Region') and the smallest number for numeric ones ('Sales')
a.min()

Unnamed: 0_level_0,Region,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Kebab,A,99
Nuts,B,15
Soup,A,245


In [34]:
# Maximum of each column per company: the alphabetically last value for
# text columns ('Region') and the largest number for numeric ones ('Sales')
a.max()

Unnamed: 0_level_0,Region,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
Kebab,B,731
Nuts,C,1047
Soup,C,399


In [36]:
# Basic summary statistics (count, mean, std, min, quartiles, max) for each company
a.describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Kebab,2.0,415.0,446.891486,99.0,257.0,415.0,573.0,731.0
Nuts,2.0,531.0,729.734198,15.0,273.0,531.0,789.0,1047.0
Soup,2.0,322.0,108.894444,245.0,283.5,322.0,360.5,399.0


In [38]:
# The same statistics transposed: one row per statistic, one column per company
a.describe().T

Unnamed: 0,Company,Kebab,Nuts,Soup
Sales,count,2.0,2.0,2.0
Sales,mean,415.0,531.0,322.0
Sales,std,446.891486,729.734198,108.894444
Sales,min,99.0,15.0,245.0
Sales,25%,257.0,273.0,283.5
Sales,50%,415.0,531.0,322.0
Sales,75%,573.0,789.0,360.5
Sales,max,731.0,1047.0,399.0


In [39]:
# Statistics for a single company: select its row from the describe() table
a.describe().loc['Nuts']

Sales  count       2.000000
       mean      531.000000
       std       729.734198
       min        15.000000
       25%       273.000000
       50%       531.000000
       75%       789.000000
       max      1047.000000
Name: Nuts, dtype: float64

___

In [42]:
# Build a second example frame. The source dict gets its own name so that it
# does not overwrite `a`, which holds the grouped object used by earlier cells.
df1_data = {
    'A': ['KR', 'KR', 'KR', 'GB', 'GB', 'GB'],
    'B': [1, 2, 3, 4, 5, 5],
    'C': [200, 120, 340, 124, 243, 350],
}
df1 = pd.DataFrame(df1_data)
df1

Unnamed: 0,A,B,C
0,KR,1,200
1,KR,2,120
2,KR,3,340
3,GB,4,124
4,GB,5,243
5,GB,5,350


In [43]:
# Sum every remaining column within each 'A' group.
# sum() does not take a column name: its first positional argument is
# `numeric_only`. To total a single column, select it first, e.g.
# df1.groupby('A')[['C']].sum()
df1.groupby('A').sum()

Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
GB,14,717
KR,6,660
