# Data Grouping and Aggregation In Pandas

## Data Grouping: 'groupby'

#### The purpose of pivot tables for reporting and data visualization is to analyze the data based on certain grouping mechanics mthe pandas 'groupby' method will serve this purpose with high demand.

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.DataFrame.groupby?

In [3]:
marks = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a', 'a'],
                      'key2' : ['one', 'two', 'one', 'two', 'one', 'one'],
                     'data1' : np.arange(10, 16),
                     'data2' : np.arange(16, 22)})
marks

Unnamed: 0,key1,key2,data1,data2
0,a,one,10,16
1,a,two,11,17
2,b,one,12,18
3,b,two,13,19
4,a,one,14,20
5,a,one,15,21


In [4]:
marks['data1']

0    10
1    11
2    12
3    13
4    14
5    15
Name: data1, dtype: int32

In [5]:
grouped = marks['data1'].groupby(by=marks['key1'])
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000016553A73310>

In [6]:
grouped.mean()

key1
a    12.5
b    12.5
Name: data1, dtype: float64

In [7]:
print('a mean:', (10 + 11 + 14 + 15) / 4)
print('b mean:', (12 + 13)/2)

a mean: 12.5
b mean: 12.5


In [8]:
grouped.sum()

key1
a    50
b    25
Name: data1, dtype: int32

In [9]:
group_tk = marks['data1'].groupby(by=[marks['key1'], marks['key2']])
group_tk

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000016553AEBB50>

In [10]:
print(marks)
print()
print('result:', group_tk.mean())

  key1 key2  data1  data2
0    a  one     10     16
1    a  two     11     17
2    b  one     12     18
3    b  two     13     19
4    a  one     14     20
5    a  one     15     21

result: key1  key2
a     one     13.0
      two     11.0
b     one     12.0
      two     13.0
Name: data1, dtype: float64


In [11]:
print((10 + 14 +15)/3)

13.0


In [12]:
group_tk.mean().unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,13.0,11.0
b,12.0,13.0


In [13]:
feedb = np.array(['good', 'avg', 'good', 'avg', 'good', 'avg'])
actual = np.array(['good', 'med', 'good', 'med', 'good', 'med'])
mean1 = marks['data1'].groupby(by=[feedb, actual]).mean()
mean1

avg   med     13.0
good  good    12.0
Name: data1, dtype: float64

In [14]:
mean_df = marks.groupby(by=['key1']).mean()
mean_df

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,12.5,18.5
b,12.5,18.5


In [15]:
print(marks)
print()
marks.groupby(by=['key1', 'key2']).size()

  key1 key2  data1  data2
0    a  one     10     16
1    a  two     11     17
2    b  one     12     18
3    b  two     13     19
4    a  one     14     20
5    a  one     15     21



key1  key2
a     one     3
      two     1
b     one     1
      two     1
dtype: int64

#### How To Iterate Over Groups?

In [16]:
marks

Unnamed: 0,key1,key2,data1,data2
0,a,one,10,16
1,a,two,11,17
2,b,one,12,18
3,b,two,13,19
4,a,one,14,20
5,a,one,15,21


In [17]:
marks.groupby(by='key1')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000016553B26550>

In [18]:
for key_name, group_name in marks.groupby(by='key1'):
    print(key_name)
    print(group_name)
    print(type(key_name))
    print(type(group_name))

a
  key1 key2  data1  data2
0    a  one     10     16
1    a  two     11     17
4    a  one     14     20
5    a  one     15     21
<class 'str'>
<class 'pandas.core.frame.DataFrame'>
b
  key1 key2  data1  data2
2    b  one     12     18
3    b  two     13     19
<class 'str'>
<class 'pandas.core.frame.DataFrame'>


In [19]:
for (k1_name, k2_name), group_name in marks.groupby(by=['key1', 'key2']):
    print(k1_name, k2_name)
    print(group_name)

a one
  key1 key2  data1  data2
0    a  one     10     16
4    a  one     14     20
5    a  one     15     21
a two
  key1 key2  data1  data2
1    a  two     11     17
b one
  key1 key2  data1  data2
2    b  one     12     18
b two
  key1 key2  data1  data2
3    b  two     13     19


In [20]:
for (k1_name, k2_name), group_name in marks.groupby(by=['data1', 'data2']):
    print(k1_name, k2_name)
    print(group_name)

10 16
  key1 key2  data1  data2
0    a  one     10     16
11 17
  key1 key2  data1  data2
1    a  two     11     17
12 18
  key1 key2  data1  data2
2    b  one     12     18
13 19
  key1 key2  data1  data2
3    b  two     13     19
14 20
  key1 key2  data1  data2
4    a  one     14     20
15 21
  key1 key2  data1  data2
5    a  one     15     21


In [21]:
marks.dtypes

key1     object
key2     object
data1     int32
data2     int32
dtype: object

In [22]:
grouped = marks.groupby(marks.dtypes, axis=1)
for datatype, group in grouped:
    print(datatype)
    print(group)

int32
   data1  data2
0     10     16
1     11     17
2     12     18
3     13     19
4     14     20
5     15     21
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
5    a  one


#### Column Selection For Aggregation via 'groupby'

In [23]:
print(marks['data1']); print(marks[['data1', 'data2']]) 


0    10
1    11
2    12
3    13
4    14
5    15
Name: data1, dtype: int32
   data1  data2
0     10     16
1     11     17
2     12     18
3     13     19
4     14     20
5     15     21


In [24]:
sk_g = marks['data1'].groupby(by=marks['key1'])
dk_g = marks[['data1', 'data2']].groupby(by=marks['key2'])
sk_g
dk_g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000016553B3E310>

In [25]:
print(sk_g.sum())
print(dk_g.sum())

key1
a    50
b    25
Name: data1, dtype: int32
      data1  data2
key2              
one      51     75
two      24     36


In [26]:
sk_g = marks.groupby(by='key1')['data1'] 
dk_g = marks.groupby(by='key2')[['data1', 'data2']]   
print(sk_g.sum())
print(dk_g.sum())

key1
a    50
b    25
Name: data1, dtype: int32
      data1  data2
key2              
one      51     75
two      24     36


#### How To Group With Dictionaries and Series?

In [27]:
rmlist = pd.DataFrame(np.random.randn(4, 5),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['one', 'two', 'three', 'four'])
rmlist

Unnamed: 0,a,b,c,d,e
one,-0.309969,0.987877,0.452586,-0.522682,-0.901014
two,-0.565597,-1.10294,0.489426,-0.083648,-0.069644
three,-0.745153,0.890678,-1.401226,-0.885056,0.153773
four,0.829458,-0.674074,-0.365537,0.201638,1.533325


In [28]:
dic_map = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}
dic_map

{'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f': 'orange'}

In [29]:
g_column = rmlist.groupby(by=dic_map, axis=1)
g_column

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000016553B3EEB0>

In [30]:
g_column.sum()

Unnamed: 0,blue,red
one,-0.070096,-0.223106
two,0.405778,-1.738181
three,-2.286282,0.299297
four,-0.163899,1.688709


In [31]:
s_map = pd.Series(dic_map)

In [32]:
g_column = rmlist.groupby(by=s_map, axis=1)
g_column.sum()

Unnamed: 0,blue,red
one,-0.070096,-0.223106
two,0.405778,-1.738181
three,-2.286282,0.299297
four,-0.163899,1.688709


#### How To Group With Functions?

In [33]:
print(rmlist)
rmlist.groupby(len).sum()

              a         b         c         d         e
one   -0.309969  0.987877  0.452586 -0.522682 -0.901014
two   -0.565597 -1.102940  0.489426 -0.083648 -0.069644
three -0.745153  0.890678 -1.401226 -0.885056  0.153773
four   0.829458 -0.674074 -0.365537  0.201638  1.533325


Unnamed: 0,a,b,c,d,e
3,-0.875567,-0.115063,0.942012,-0.60633,-0.970658
4,0.829458,-0.674074,-0.365537,0.201638,1.533325
5,-0.745153,0.890678,-1.401226,-0.885056,0.153773


In [34]:
key_list = ['one', 'one', 'one', 'two']
print(rmlist)
rmlist.groupby([len, key_list]).sum() 

              a         b         c         d         e
one   -0.309969  0.987877  0.452586 -0.522682 -0.901014
two   -0.565597 -1.102940  0.489426 -0.083648 -0.069644
three -0.745153  0.890678 -1.401226 -0.885056  0.153773
four   0.829458 -0.674074 -0.365537  0.201638  1.533325


Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.875567,-0.115063,0.942012,-0.60633,-0.970658
4,two,0.829458,-0.674074,-0.365537,0.201638,1.533325
5,one,-0.745153,0.890678,-1.401226,-0.885056,0.153773


In [35]:
key_list = ['one', 'two', 'three', 'four']
print(rmlist)
rmlist.groupby([len, key_list]).sum() 

              a         b         c         d         e
one   -0.309969  0.987877  0.452586 -0.522682 -0.901014
two   -0.565597 -1.102940  0.489426 -0.083648 -0.069644
three -0.745153  0.890678 -1.401226 -0.885056  0.153773
four   0.829458 -0.674074 -0.365537  0.201638  1.533325


Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,-0.309969,0.987877,0.452586,-0.522682,-0.901014
3,two,-0.565597,-1.10294,0.489426,-0.083648,-0.069644
4,four,0.829458,-0.674074,-0.365537,0.201638,1.533325
5,three,-0.745153,0.890678,-1.401226,-0.885056,0.153773


#### How To Group by Index Level?

In [36]:
columns = pd.MultiIndex.from_arrays([['US', 'US', 'UK', 'RS', 'RS'],
                                    [1, 3, 5, 1, 3]],
                                    names=['city', 'tenor'])

hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,UK,RS,RS
tenor,1,3,5,1,3
0,-0.473201,1.71788,0.250799,-0.685004,-1.804484
1,1.178921,0.444449,-1.1914,-0.18056,-0.369455
2,-0.557842,-0.126816,2.154186,-0.084566,0.25181
3,-1.014344,-1.359445,-0.619905,0.051216,-0.821149


In [37]:
hier_df.groupby(level='city', axis=1).count()

city,RS,UK,US
0,2,1,2
1,2,1,2
2,2,1,2
3,2,1,2


## Data Aggregation

#### Aggregations refer to any data transformation that produces scalar values from arrays.
#### Some common aggregation methods are
* count 
* sum 
* mean 
* median 
* std, var 
* min, max 
* prod 
* first, last

#### still you can find many methods, these are just to illustrate

In [38]:
book = pd.read_csv(r'dataset/books_discount.csv', encoding='latin')
book.head() 

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/books_discount.csv'

In [None]:
print(book['price'].min()); print(book['price'].max())

In [None]:
print(book['min_dis'].min()); print(book['max_dis'].min())

In [None]:
grouped = book.groupby(by=['feedback', 'author'], axis=0)

In [None]:
def max_min(arr):
    return arr.max(), arr.min()

grouped.agg(max_min)

In [None]:
grouped.describe()

#### How To Aggregate Column-wise and with Multiple Functions?

In [None]:
grouped = book.groupby(by=['feedback', 'author'])

In [None]:
grouped['price'].agg('min')

In [None]:
grouped['price'].agg('max')

In [None]:
grouped['price'].agg('mean')

In [None]:
grouped['price'].agg(['min', 'max', 'mean', 'std'])

In [None]:
grouped['price'].agg([('min_value', 'min'), ('max_value', 'max'), 'mean', 'std'])

In [None]:
functions = [('min_value', 'min'), ('max_value', 'max'), 'mean', 'std']
result = grouped['price', 'max_dis'].agg(functions)
result

In [None]:
result['price']

In [None]:
functions = {'min_value': 'min', 'max_value':'max', 'mean_value':'mean', 'std_value':'std'}
result = grouped['price', 'max_dis'].agg(functions)
result

In [None]:
grouped = book.groupby(by=['feedback', 'author'], as_index=False).min()
grouped