# GroupBy

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global generator so the random columns below -- and every later
# np.random call under Restart & Run All -- are identical on each run.
# NOTE: outputs saved before this seed was added will differ until re-run.
np.random.seed(42)

# Toy frame: two label columns to group by, two numeric columns to aggregate.
frame = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                      'key2': ['one', 'two', 'one', 'two', 'one'],
                      'data1': np.random.randn(5),
                      'data2': np.random.randn(5)})

In [3]:
# Show the example frame that the groupby cells below operate on.
frame

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.002912,0.707687
1,a,two,-1.147644,1.733858
2,b,one,0.069725,-0.062189
3,b,two,0.463306,0.407336
4,a,one,2.445751,0.649328


In [4]:
# General form: values.groupby(keys)

# Split: partition the 'data1' values according to the labels in 'key1'.
grouped = frame['data1'].groupby(by=frame['key1'])
# Apply: reduce each partition to its mean, one entry per distinct key.
grouped.mean()

key1
a    0.767006
b    0.266515
Name: data1, dtype: float64

In [5]:
# Passing a list of two key Series yields a result indexed by (key1, key2).
means = (
    frame['data1']
    .groupby([frame['key1'], frame['key2']])
    .mean()
)
means

key1  key2
a     one     1.724332
      two    -1.147644
b     one     0.069725
      two     0.463306
Name: data1, dtype: float64

In [6]:
# Pivot the inner index level (key2) into columns, giving a key1 x key2 table.
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1.724332,-1.147644
b,0.069725,0.463306


In [7]:
# Group keys need not be columns of the frame: any arrays work, as long as each
# has exactly one entry per row of the data being grouped (5 here).

states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])

# The plain arrays carry no index, so they are matched to the rows by position.
frame['data1'].groupby([states, years]).mean()

California  2005   -1.147644
            2006    0.069725
Ohio        2005    0.733109
            2006    2.445751
Name: data1, dtype: float64

In [8]:
# Mean of every numeric column per 'key1' group. numeric_only=True leaves out the
# string column 'key2': pandas >= 2.0 no longer drops such columns silently and
# raises TypeError when asked to average them.
frame.groupby('key1').mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.767006,1.030291
b,0.266515,0.172573


In [9]:
# Group by the (key1, key2) pair; the result is indexed by both keys.
frame.groupby(['key1', 'key2']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,1.724332,0.678507
a,two,-1.147644,1.733858
b,one,0.069725,-0.062189
b,two,0.463306,0.407336


In [10]:
# size() reports how many rows fall into each (key1, key2) group.
frame.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 1. Iterating Over Groups

In [11]:
# Redisplay the frame for reference before iterating over its groups.
frame

Unnamed: 0,key1,key2,data1,data2
0,a,one,1.002912,0.707687
1,a,two,-1.147644,1.733858
2,b,one,0.069725,-0.062189
3,b,two,0.463306,0.407336
4,a,one,2.445751,0.649328


In [12]:
# Iterating a GroupBy yields (group key, sub-frame of the matching rows) pairs.
for key, rows in frame.groupby('key1'):
    print(key)
    print(rows)

a
  key1 key2     data1     data2
0    a  one  1.002912  0.707687
1    a  two -1.147644  1.733858
4    a  one  2.445751  0.649328
b
  key1 key2     data1     data2
2    b  one  0.069725 -0.062189
3    b  two  0.463306  0.407336


In [13]:
# With several keys each iteration yields a tuple of key values: one group per
# (key1, key2) combination that actually occurs in the data.
for keys, rows in frame.groupby(['key1', 'key2']):
    print(keys)
    print(rows)

('a', 'one')
  key1 key2     data1     data2
0    a  one  1.002912  0.707687
4    a  one  2.445751  0.649328
('a', 'two')
  key1 key2     data1     data2
1    a  two -1.147644  1.733858
('b', 'one')
  key1 key2     data1     data2
2    b  one  0.069725 -0.062189
('b', 'two')
  key1 key2     data1     data2
3    b  two  0.463306  0.407336


In [14]:
# Materialise the groups as a {group key: sub-frame} dict for direct lookup.
pieces = {key: rows for key, rows in frame.groupby('key1')}
pieces['b']

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.069725,-0.062189
3,b,two,0.463306,0.407336


### 1.1 Grouping columns instead of rows (axis=1)

In [15]:
# Group the *columns* of the frame by their dtype.
# DataFrame.groupby(..., axis=1) is deprecated since pandas 2.1, and transposing
# this mixed-dtype frame would turn every value into object dtype. Instead, group
# the column labels by dtype and select each bundle of columns from the frame.
grouped = frame.columns.to_series().groupby(frame.dtypes)

for dtype, column_names in grouped:
    print(dtype)
    print(frame[list(column_names)])

float64
      data1     data2
0  1.002912  0.707687
1 -1.147644  1.733858
2  0.069725 -0.062189
3  0.463306  0.407336
4  2.445751  0.649328
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


## 2. Selecting a Column or Subset of Columns

In [16]:
# Index the GroupBy with a column name to aggregate only that column.
frame.groupby('key1')['data1'].mean()

key1
a    0.767006
b    0.266515
Name: data1, dtype: float64

In [17]:
# Equivalent spelling: select the column first (frame['data1']), then pass the
# key Series (frame['key1']) to groupby.
frame['data1'].groupby(frame['key1']).mean()

key1
a    0.767006
b    0.266515
Name: data1, dtype: float64

In [18]:
# Selecting with a list ([['data2']]) keeps the result a one-column DataFrame.
frame.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,0.678507
a,two,1.733858
b,one,-0.062189
b,two,0.407336


## 3. Grouping with Dicts and Series

In [19]:
# 5 x 5 frame of standard-normal draws: one row per person, columns 'a'..'e'.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)

In [20]:
# Blank out two cells to show how aggregations treat missing data:
# row position 2 is 'Wes', column positions 1 and 2 are 'b' and 'c'.
people.iloc[2:3, [1, 2]] = np.nan
people

Unnamed: 0,a,b,c,d,e
Joe,-1.117064,-0.367546,-0.070528,-0.042357,0.158856
Steve,0.965294,2.886174,-0.588504,-2.217082,-0.481397
Wes,-0.882598,,,-0.766463,-0.822216
Jim,-0.865897,1.380999,0.042916,0.477538,-0.303251
Travis,-0.186332,-0.404637,-0.480055,0.685154,-0.943558


In [21]:
# Column label -> group name. 'f' has no matching column in `people`.
mapping = {
    'a': 'blue',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}

In [22]:
# Sum the columns that share a colour in `mapping`; the unused 'f' entry is ignored.
# groupby(..., axis=1) is deprecated since pandas 2.1, so group the rows of the
# transposed frame (safe here: every column is float) and transpose the result
# back to one row per person.
by_column = people.T.groupby(mapping)
by_column.sum().T

Unnamed: 0,blue,red
Joe,-1.229949,-0.20869
Steve,-1.840292,2.404777
Wes,-1.649061,-0.822216
Jim,-0.345443,1.077748
Travis,0.018767,-1.348195


In [23]:
# A Series works as a grouping key just like the dict: its index holds the
# column labels and its values the group names.
map_series = pd.Series(mapping)
map_series

a      blue
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [24]:
# Count the non-missing values per colour group for each person.
# groupby(..., axis=1) is deprecated since pandas 2.1: group the rows of the
# transposed frame instead, then transpose back to one row per person.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,3,2
Steve,3,2
Wes,2,1
Jim,3,2
Travis,3,2


## 4. Grouping with Functions

In [25]:
# Redisplay the frame for reference; its index labels are the grouping input below.
people

Unnamed: 0,a,b,c,d,e
Joe,-1.117064,-0.367546,-0.070528,-0.042357,0.158856
Steve,0.965294,2.886174,-0.588504,-2.217082,-0.481397
Wes,-0.882598,,,-0.766463,-0.822216
Jim,-0.865897,1.380999,0.042916,0.477538,-0.303251
Travis,-0.186332,-0.404637,-0.480055,0.685154,-0.943558


In [26]:
# A function used as the key is called on each index label; len therefore groups
# people by the length of their name.
# The three-letter names (Joe, Wes, Jim) form one group; the 5- and 6-letter names stay separate.
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,-2.865559,1.013453,-0.027612,-0.331282,-0.96661
5,0.965294,2.886174,-0.588504,-2.217082,-0.481397
6,-0.186332,-0.404637,-0.480055,0.685154,-0.943558


## 5. Grouping by Index Levels

In [27]:
# Two-level column index: every column is identified by a (city, tenor) pair.
columns = pd.MultiIndex.from_tuples(
    [('US', 1), ('US', 3), ('US', 5), ('JP', 1), ('JP', 3)],
    names=['city', 'tenor'],
)

In [28]:
# 4 x 5 frame of random normals whose columns carry the two-level (city, tenor) index.
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

city,US,US,US,JP,JP
tenor,1,3,5,1,3
0,-0.878164,-0.313601,1.15785,-1.520339,0.465082
1,-0.34727,-1.256108,1.336741,1.529138,0.760455
2,-1.653854,0.382043,-0.161351,-1.65564,1.004971
3,-0.004181,-0.612391,-1.671184,0.218623,0.496293


In [29]:
# Count the columns per 'city' level for every row.
# groupby(..., axis=1) is deprecated since pandas 2.1: transpose so the column
# MultiIndex becomes the row index, group on its 'city' level, then transpose back.
hier_df.T.groupby(level='city').count().T

city,JP,US
0,2,3
1,2,3
2,2,3
3,2,3
