# Pandas Aggregation
In this chapter we will discuss grouping and aggregating data using pandas.

### Group By
A DataFrame can be grouped by one key column or by a list of key columns.

In [3]:
# Initialize data set
from pandas import DataFrame
import pandas as pd
import numpy as np
# Two label columns to group on, plus two columns of random values to aggregate.
sample_columns = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
df = DataFrame(sample_columns)
print(df)

  key1 key2     data1     data2
0    a  one  0.566805 -0.100761
1    a  two  0.611113 -0.332433
2    b  one -1.107966  0.157592
3    b  two  0.287503 -0.668305
4    a  one  0.693716  0.950847


#### Calculate mean based on key1

In [24]:
# Average of data1 within each key1 group.
key1_groups = df.groupby('key1')
mean = key1_groups['data1'].mean()
print(mean)

key1
a    0.623878
b   -0.410231
Name: data1, dtype: float64


#### Group by a list of columns

In [23]:
# Average of data2 for every (key1, key2) combination.
key_columns = ['key1', 'key2']
mean = df.groupby(key_columns)['data2'].mean()
print(mean)

key1  key2
a     one     0.425043
      two    -0.332433
b     one     0.157592
      two    -0.668305
Name: data2, dtype: float64


In [10]:
# Number of rows in each key1 group. The result holds counts, so it is named
# for what it contains; grouping by the column label is equivalent to passing
# the column itself.
group_sizes = df.groupby('key1').size()
print(group_sizes)

key1
a    3
b    2
dtype: int64


In [14]:
# Iterating a GroupBy yields (group key, sub-frame) pairs; collect them in a dict.
pieces = {key: group for key, group in df.groupby('key1')}
print(pieces)

{'a':   key1 key2     data1     data2
0    a  one  0.566805 -0.100761
1    a  two  0.611113 -0.332433
4    a  one  0.693716  0.950847, 'b':   key1 key2     data1     data2
2    b  one -1.107966  0.157592
3    b  two  0.287503 -0.668305}


#### Calculate quantile on key

In [30]:
# 0.9 quantile of data1 within each key1 group.
data1_by_key1 = df.groupby('key1')['data1']
print(data1_by_key1.quantile(0.9))

key1
a    0.677195
b    0.147956
Name: data1, dtype: float64


#### Use a custom function as the aggregation function

In [48]:
# define your own aggregation function
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Hand the custom function to agg(), which applies it to data1 within each key1 group.
data1_by_key1 = df.groupby('key1')['data1']
result = data1_by_key1.agg(peak_to_peak)
print(result)


key1
a    0.126910
b    1.395469
Name: data1, dtype: float64


#### Mapping columns

In [49]:
# Initialize data set
from pandas import DataFrame
import pandas as pd
import numpy as np
people = DataFrame(
    np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
# Map each column label to a color. 'f' is not a column of `people`, so that
# entry is simply unused by the grouping below.
mapping = {'a' : 'red', 'b' : 'red', 'c' : 'blue', 'd': 'blue', 'e': 'red', 'f' : 'organge'}
# Sum the columns that share a color. groupby(..., axis=1) is deprecated since
# pandas 2.1, so group the transposed frame by its row labels instead and
# transpose the result back to one row per person.
color_sums = people.T.groupby(mapping).sum().T
print(color_sums)


            blue       red
Joe     0.963148  2.772641
Steve  -0.068170  1.132276
Wes    -0.029720  0.266524
Jim     0.807887  1.358557
Travis -0.852801 -0.508133


#### Optimized groupby methods
| Function Name | Description |
| --- | --- |
| count | Number of non-NA values in the group. |
| sum | Sum of non-NA values |
| mean | Mean of non-NA values |
| median | Arithmetic median of non-NA values. |
| std, var | Unbiased (n-1 denominator) standard deviation and variance. |
| min, max | Minimum and maximum of non-NA values. |
| prod | Product of non-NA values. |
| first, last | First and last non-NA values. |

In [76]:
# Initialize data set
from pandas import DataFrame
import pandas as pd
import numpy as np
tips = DataFrame({
    'total_bill': [16.99, 10.34, 21.01, 23.68, 24.59, 25.29],
    'tip': [1.01, 1.66, 3.50, 3.31, 3.61, 4.71],
    'sex': ['Female', 'Male', 'Male', 'Male', 'Female', 'Male'],
    'smoker': ['No', 'No', 'No', 'Yes', 'Yes', 'Yes'],
    'day': ['Sun', 'Sat', 'Sat', 'Sun', 'Sun', 'Sun'],
    'time': ['Dinner', 'Lunch', 'Dinner', 'Dinner', 'Lunch', 'Dinner'],
    'size': [2, 3, 3, 2, 4, 4],
})
# Tip expressed as a fraction of the total bill.
tips['tips_pct'] = tips['tip'] / tips['total_bill']
print(tips)
# Average tip fraction for every (sex, smoker) group. The built-in mean() is
# one of the optimized groupby methods, so use it rather than apply(np.mean).
aggr = tips.groupby(['sex', 'smoker'])['tips_pct'].mean()
print(aggr)


   total_bill   tip     sex smoker  day    time  size  tips_pct
0       16.99  1.01  Female     No  Sun  Dinner     2  0.059447
1       10.34  1.66    Male     No  Sat   Lunch     3  0.160542
2       21.01  3.50    Male     No  Sat  Dinner     3  0.166587
3       23.68  3.31    Male    Yes  Sun  Dinner     2  0.139780
4       24.59  3.61  Female    Yes  Sun   Lunch     4  0.146808
5       25.29  4.71    Male    Yes  Sun  Dinner     4  0.186240
sex     smoker
Female  No        0.059447
        Yes       0.146808
Male    No        0.163564
        Yes       0.163010
Name: tips_pct, dtype: float64


### Random Sampling and Permutation

In [6]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
# Suit initials: Spade, Heart, Diamond, Club
suits = ['S', 'H', 'D', 'C']
# Values for one suit (ace = 1, 2-10 at face value, J/Q/K = 10), repeated once per suit.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'Q', 'K']
# Card labels are suit initial + rank, e.g. 'SA' or 'H10'. Iterating `suits`
# (rather than a second hard-coded list) keeps labels and values in step.
cards = [suit + str(name) for suit in suits for name in base_names]
deck = Series(card_val, index=cards)

def draw(deck, n = 5):
    """Return up to ``n`` entries of ``deck`` picked at random, without repeats."""
    # Shuffle every position in the deck, then keep the first n of them.
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)

# Draw two random cards from every suit; the suit is the first character of a card label.
by_suit = deck.groupby(lambda card: card[0], group_keys=False)
candidates = by_suit.apply(draw, 2)
print(candidates)

CQ    10
C4     4
DA     1
D4     4
H5     5
HK    10
S2     2
SJ    10
dtype: int64


#### Group Weighted Average and Correlation

In [8]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                # Weights are drawn from [0, 1): negative weights would let the
                # "average" fall outside the data range, and weights summing to
                # zero make np.average raise ZeroDivisionError.
                'weights': np.random.rand(8)})
# Weighted mean of `data` within each category. Selecting the value columns
# explicitly keeps the grouping column out of the frames passed to apply().
category = (
    df.groupby('category')[['data', 'weights']]
    .apply(lambda g: np.average(g['data'], weights=g['weights']))
)
print(category)

category
a    0.594622
b    1.272249
dtype: float64
