<a href="https://colab.research.google.com/github/jiangenhe/insc-486-fall-2021/blob/main/week6/Week_6_lecture_groupby.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Aggregation and Group Operations

In [None]:
import numpy as np
import pandas as pd


## GroupBy Mechanics

In [None]:
# Fix the seed of NumPy's global generator so that re-running this cell
# (or the whole notebook from a fresh kernel) produces the same random
# columns every time.
np.random.seed(12345)

# Small demo frame: two categorical key columns plus two columns of
# standard-normal noise. The groupby examples in this notebook use it.
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.560488,1.034524
1,a,two,0.250405,0.264297
2,b,one,0.162447,1.126248
3,b,two,0.612355,0.178326
4,a,one,0.341008,0.4773


### Iterating Over Groups

In [None]:
# Iterating over a GroupBy yields (group key, sub-frame) pairs, one pair per
# distinct value of 'key1'.
for name, group in df.groupby('key1'):
    print(name, group, sep='\n')

a
  key1 key2     data1     data2
0    a  one -0.204708  1.393406
1    a  two  0.478943  0.092908
4    a  one  1.965781  1.246435
b
  key1 key2     data1     data2
2    b  one -0.519439  0.281746
3    b  two -0.555730  0.769023


In [None]:
# With several grouping keys, each group key is a tuple holding one value
# per key column, so it can be unpacked directly in the loop header.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2), group, sep='\n')

('a', 'one')
  key1 key2     data1     data2
0    a  one -0.204708  1.393406
4    a  one  1.965781  1.246435
('a', 'two')
  key1 key2     data1     data2
1    a  two  0.478943  0.092908
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.519439  0.281746
('b', 'two')
  key1 key2    data1     data2
3    b  two -0.55573  0.769023


### Grouping with Dicts and Series

In [None]:
# 5x5 frame of standard-normal draws. Rows are labelled by department
# (note that 'Information' appears twice); columns are the letters a-e.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Information', 'Information', 'Communication', 'Marketing', 'Accounting'],
    columns=list('abcde'),
)

people

Unnamed: 0,a,b,c,d,e
Information,-0.92342,0.118504,1.319457,-2.077877,0.423268
Information,0.673992,-1.234317,-2.470079,-0.018862,-0.828012
Communication,0.005276,1.935357,0.729479,1.778803,-0.05815
Marketing,0.844224,0.390586,-0.604902,0.024523,-0.313069
Accounting,-0.183892,-0.060227,1.025027,1.087672,1.078066


In [None]:
# Lookup table from department name (the row labels of `people`) to the
# college that the department belongs to.
mapping = dict(
    Information='CCI',
    Communication='CCI',
    Marketing='Business',
    Accounting='Business',
)

In [None]:
# A dict passed to groupby is looked up by row label, so (despite the
# variable's name) this groups the *rows* of `people` by college and then
# sums each column within a college.
by_column = people.groupby(by=mapping)
by_column.sum()

Unnamed: 0,a,b,c,d,e
Business,0.660332,0.330359,0.420124,1.112195,0.764997
CCI,-0.244152,0.819545,-0.421143,-0.317937,-0.462894


### Grouping with Functions

In [None]:
# A function passed to groupby is called once per row label and its return
# value becomes the group key: here rows are grouped by the length of the
# department name.
people.groupby(by=len).sum()

Unnamed: 0,a,b,c,d,e
9,0.844224,0.390586,-0.604902,0.024523,-0.313069
10,-0.183892,-0.060227,1.025027,1.087672,1.078066
11,-0.249428,-1.115812,-1.150622,-2.09674,-0.404744
13,0.005276,1.935357,0.729479,1.778803,-0.05815


### Grouping by Index Levels

In [None]:
# Rebuild `people` with a two-level row index (college, department).
# The level names are supplied when the index is built rather than patched
# in afterwards with `set_names(..., inplace=True)`, so the cell contains no
# in-place mutation and gives the same result every time it is re-run.
people = pd.DataFrame(
    np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=pd.MultiIndex.from_arrays(
        [['CCI', 'CCI', 'CCI', 'Business', 'Business'],
         ['Information', 'Information', 'Communication', 'Marketing', 'Accounting']],
        names=['college', 'department'],
    ),
)
people

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d,e
college,department,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CCI,Information,-0.690678,-1.60228,-2.612404,-0.277266,-0.109416
CCI,Information,-1.919981,-1.249646,0.101367,-0.944347,1.703704
CCI,Communication,-0.724422,-1.224436,-0.577544,-1.82106,0.781122
Business,Marketing,-0.51322,0.524163,-1.108485,1.752837,-0.290873
Business,Accounting,-0.181595,0.93844,0.734771,1.110126,-0.762199


In [None]:
# `level=` groups by a named level of the row MultiIndex instead of by a
# column; count() then tallies the non-null values per column in each college.
people.groupby(level='college').count()

Unnamed: 0_level_0,a,b,c,d,e
college,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Business,2,2,2,2,2
CCI,3,3,3,3,3


## Data Aggregation

For the full list of accepted aggregation arguments, see the pandas `DataFrame.agg` reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.agg.html

In [None]:
# Show `df` again; the aggregation examples in this section group it by 'key1'.
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,0.560488,1.034524
1,a,two,0.250405,0.264297
2,b,one,0.162447,1.126248
3,b,two,0.612355,0.178326
4,a,one,0.341008,0.4773


In [None]:
# Reusable GroupBy object: the rows of `df` partitioned by the value in 'key1'.
grouped = df.groupby(by='key1')

In [None]:
def peak_to_peak(arr):
    """Return the spread of `arr`: its largest value minus its smallest.

    `arr` may be any object exposing ``max()`` and ``min()`` methods, such as
    a pandas Series or a NumPy array.
    """
    highest, lowest = arr.max(), arr.min()
    return highest - lowest
# Passing the function inside a list makes the result's columns two-level:
# (original column, name of the aggregation function).
grouped.aggregate([peak_to_peak])

Unnamed: 0_level_0,data1,data2
Unnamed: 0_level_1,peak_to_peak,peak_to_peak
key1,Unnamed: 1_level_2,Unnamed: 2_level_2
a,0.310083,0.770227
b,0.449908,0.947921


In [None]:
# Per-group summary statistics (count, mean, std, min, quartiles, max) for
# each numeric column.
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,0.383967,0.159443,0.250405,0.295706,0.341008,0.450748,0.560488,3.0,0.59204,0.397726,0.264297,0.370798,0.4773,0.755912,1.034524
b,2.0,0.387401,0.318133,0.162447,0.274924,0.387401,0.499878,0.612355,2.0,0.652287,0.670282,0.178326,0.415307,0.652287,0.889267,1.126248


### Example: Random Sampling and Permutation

In [None]:
# Build a 52-card deck as a Series: the index holds the card name
# (rank followed by suit letter) and the value is the card's numeric value.

# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
# Values for one suit (1-10 for the ace and number cards, then 10 for each of
# the three face cards), repeated once per suit.
card_val = (list(range(1, 11)) + [10] * 3) * len(suits)
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Suit-major order so the names line up with `card_val`. This iterates over
# `suits` instead of a second hard-coded copy of the same list.
cards = [f'{rank}{suit}' for suit in suits for rank in base_names]

deck = pd.Series(card_val, index=cards)

In [None]:
# Display the deck: card names in the index, card values as the data.
deck

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [None]:
# Sanity check: 4 suits x 13 ranks should give 52 cards.
len(deck)

52

In [None]:
def draw(deck, n=5, random_state=None):
    """Return `n` entries sampled at random from `deck`, without replacement.

    Parameters
    ----------
    deck : pandas.Series
        Cards to draw from (any object with a compatible ``sample`` method
        also works).
    n : int, default 5
        Number of cards to draw.
    random_state : optional
        Forwarded unchanged to ``deck.sample``. Pass an int seed (or a
        generator accepted by pandas) to make the draw reproducible; the
        default ``None`` keeps the original behaviour of a fresh random draw
        on every call.

    Returns
    -------
    The sampled subset, in the order it was drawn.

    Raises
    ------
    ValueError
        Raised by ``sample`` when `n` is larger than the number of entries.
    """
    return deck.sample(n, random_state=random_state)
# Deal one random hand of five cards (the default `n`) from the full deck.
draw(deck)

10C    10
2C      2
3C      3
6S      6
8S      8
dtype: int64

In [None]:
def get_suit(card):
    """Return the suit of `card`, i.e. the last character of its name.

    Card names are the rank followed by a one-letter suit, so ``'10H'``
    gives ``'H'``.
    """
    return card[-1]  # last letter is suit
# Group the cards by suit (get_suit is applied to each index label), then
# draw two cards from every suit; extra keyword arguments to apply() are
# forwarded to `draw`.
deck.groupby(by=get_suit).apply(draw, n=2)

C  6C     6
   7C     7
D  8D     8
   QD    10
H  QH    10
   8H     8
S  KS    10
   5S     5
dtype: int64