# Aggregation and Grouping

## Exercises

For these exercises we will use the `titanic` dataset from `seaborn`.

In [45]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

plt.style.use('seaborn')

titanic = sns.load_dataset('titanic')

titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### Exercise 1

Create a Pivot Table which shows the number of people that embarked at each port categorized on `class` and `sex`. 

In [22]:
titanic[['class','sex']].value_counts()

pd.pivot_table(titanic, index = 'class', columns= 'sex', values='age', aggfunc='count')

sex,female,male
class,Unnamed: 1_level_1,Unnamed: 2_level_1
First,85,101
Second,74,99
Third,102,253


### Exercise 2

Modify the table to show the minumum and maximum fares based on the same categorizations.

In [62]:
pd.pivot_table(titanic, index = 'class', columns= 'sex', values= 'fare', aggfunc=['max','min'], margins=True)

Unnamed: 0_level_0,max,max,max,min,min,min
sex,female,male,All,female,male,All
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
First,512.3292,512.3292,512.3292,25.9292,0.0,0.0
Second,65.0,73.5,73.5,10.5,0.0,0.0
Third,69.55,69.55,69.55,6.75,0.0,0.0
All,512.3292,512.3292,512.3292,6.75,0.0,0.0


### Exercise 3

Use `df.groupby()` to show the descriptive statistics of the age of passengers based on their sex and whether or not they survived.

In [114]:
titanic_describe= titanic.groupby(['sex','alive'])['age'].describe()
titanic_describe


Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
sex,alive,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
female,no,64.0,25.046875,13.618591,2.0,16.75,24.5,33.25,57.0
female,yes,197.0,28.847716,14.175073,0.75,19.0,28.0,38.0,63.0
male,no,360.0,31.618056,14.056019,1.0,21.75,29.0,39.25,74.0
male,yes,93.0,27.276022,16.504803,0.42,18.0,28.0,36.0,80.0


### Exercise 4

Use `pd.cut` to bin the ages into four equal bins. Then use `df.groupby()` to show the number of survivors based on these age bins, their sex and class.

In [122]:
num_bins = 4 
# number of bins
age_bins = pd.cut(titanic['age'], num_bins)
titanic_set = titanic.groupby([age_bins, 'sex','class'])

cat_num_survivors = titanic_set['survived'].sum().unstack()
cat_num_survivors

Unnamed: 0_level_0,class,First,Second,Third
age,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.34, 20.315]",female,13,16,24
"(0.34, 20.315]",male,4,10,15
"(20.315, 40.21]",female,44,41,22
"(20.315, 40.21]",male,22,3,21
"(40.21, 60.105]",female,23,11,0
"(40.21, 60.105]",male,13,1,2
"(60.105, 80.0]",female,2,0,1
"(60.105, 80.0]",male,1,1,0
