# GROUPING OF DATA

In [129]:
import pandas as pd
import numpy as np
import math

In [97]:
df = pd.read_csv('../data/items.csv')
df.head(6)

Unnamed: 0,Team,Region,Code,Quantiy,Value
0,A1,D,D1-96432,56.0,73454.0
1,A2,E,D2-09528,45.0,87674.0
2,A1,C,D3-09643,863.0,25447.0
3,A3,A,D2-63432,45.0,956875.0
4,A5,B,D1-09876,,245456.0
5,A6,A,D4-73524,674.0,365145.0


## The groupby operation

In [98]:
dfGroupedByTeam1 = df.groupby('Team')
dfGroupedByTeam1

<pandas.core.groupby.DataFrameGroupBy object at 0x7f2a60bb22b0>

In [99]:
dfGroupedByTeam1.groups

{'A1': Int64Index([0, 2, 7], dtype='int64'),
 'A2': Int64Index([1, 6, 10, 11, 13], dtype='int64'),
 'A3': Int64Index([3, 8, 14], dtype='int64'),
 'A4': Int64Index([9], dtype='int64'),
 'A5': Int64Index([4, 12], dtype='int64'),
 'A6': Int64Index([5], dtype='int64')}

In [100]:
dfSum = dfGroupedByTeam1.size()
dfSum.sort_values(ascending=False)

Team
A2    5
A3    3
A1    3
A5    2
A6    1
A4    1
dtype: int64

In [101]:
dfCode = df.set_index('Code')
dfGroupedByProduct = dfCode.groupby(lambda code: code.split('-')[0])
dfGroupedByProduct

<pandas.core.groupby.DataFrameGroupBy object at 0x7f2a60bca6d8>

In [102]:
for (name, group) in dfGroupedByProduct:
    print(name)
    print(group)
    print('\n')

D1
         Team Region  Quantiy     Value
Code                                   
D1-96432   A1      D     56.0   73454.0
D1-09876   A5      B      NaN  245456.0
D1-63793   A2      C     84.0  234546.0


D2
         Team Region  Quantiy     Value
Code                                   
D2-09528   A2      E     45.0   87674.0
D2-63432   A3      A     45.0  956875.0
D2-90858   A1      D    876.0       NaN
D2-98734   A3      E    524.0   24536.0
D2-96532   A2      D     51.0  472876.0
D2-97435   A3      B    653.0   53343.0


D3
         Team Region  Quantiy    Value
Code                                  
D3-09643   A1      C    863.0  25447.0
D3-53870   A4      B     67.0  37567.0


D4
         Team Region  Quantiy     Value
Code                                   
D4-73524   A6      A    674.0  365145.0
D4-73484   A5      A   2435.0   34745.0


D5
         Team Region  Quantiy    Value
Code                                  
D5-09453   A2      C     54.0  84673.0
D5-09844   A2      A    

In [103]:
dfIndexed = df.set_index('Team')
dfGroupedLevel1 = dfIndexed.groupby(level=0)
dfGroupedLevel1

<pandas.core.groupby.DataFrameGroupBy object at 0x7f2a60bca710>

In [104]:
for (name, group) in dfGroupedLevel0:
    print(name)
    print(group)
    print('\n')

A1
      Region      Code  Quantiy
Team                           
A1         3  D1-96432       56
A1         2  D3-09643      863
A1         2  D2-90858      876


A2
      Region      Code  Quantiy
Team                           
A2         5  D2-09528       45
A2         4  D5-09453       54
A2         1  D2-96532       51
A2         2  D1-63793       84
A2         1  D5-09844      752


A3
      Region      Code  Quantiy
Team                           
A3         3  D2-63432       45
A3         5  D2-98734      524


A4
      Region      Code  Quantiy
Team                           
A4         3  D3-53870       67


A5
      Region      Code  Quantiy
Team                           
A5         1  D1-09876      326
A5         4  D4-73484     2435


A6
      Region      Code  Quantiy
Team                           
A6         1  D4-73524      674




In [105]:
df.reset_index()
dfIndexed = df.set_index(['Team', 'Region'])
dfGroupedLevel2 = dfIndexed.groupby(level=['Team', 'Region'])
for (name, group) in dfGroupedLevel2:
    print(name)
    print(group)
    print('\n')

('A1', 'C')
                 Code  Quantiy    Value
Team Region                            
A1   C       D3-09643    863.0  25447.0


('A1', 'D')
                 Code  Quantiy    Value
Team Region                            
A1   D       D1-96432     56.0  73454.0
     D       D2-90858    876.0      NaN


('A2', 'A')
                 Code  Quantiy    Value
Team Region                            
A2   A       D5-09844    752.0  24753.0


('A2', 'C')
                 Code  Quantiy     Value
Team Region                             
A2   C       D5-09453     54.0   84673.0
     C       D1-63793     84.0  234546.0


('A2', 'D')
                 Code  Quantiy     Value
Team Region                             
A2   D       D2-96532     51.0  472876.0


('A2', 'E')
                 Code  Quantiy    Value
Team Region                            
A2   E       D2-09528     45.0  87674.0


('A3', 'A')
                 Code  Quantiy     Value
Team Region                             
A3   A       D2

## Using groupby with a MultiIndex

In [106]:
dfGroupedLevel3 = dfIndexed.groupby(level='Region')
dfGroupedLevel3Sum = dfGroupedLevel3.sum()
dfGroupedLevel3Sum

Unnamed: 0_level_0,Quantiy,Value
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3906.0,1381518.0
B,720.0,336366.0
C,1001.0,344666.0
D,983.0,546330.0
E,569.0,112210.0


In [107]:
dfIndexed.sum(level='Region')

Unnamed: 0_level_0,Quantiy,Value
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3906.0,1381518.0
B,720.0,336366.0
C,1001.0,344666.0
D,983.0,546330.0
E,569.0,112210.0


In [108]:
dfGroupedLevel3Sum.ix['B']/dfGroupedLevel3Sum.ix['D']

Quantiy    0.732452
Value      0.615683
dtype: float64

## Using the aggregate method

In [109]:
dfGroupedLevel3.aggregate(np.sum)

Unnamed: 0_level_0,Quantiy,Value
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
A,3906.0,1381518.0
B,720.0,336366.0
C,1001.0,344666.0
D,983.0,546330.0
E,569.0,112210.0


## Applying multiple functions

In [110]:
dfGroupedLevel3.agg([np.sum, np.mean, np.std])

Unnamed: 0_level_0,Quantiy,Quantiy,Quantiy,Value,Value,Value
Unnamed: 0_level_1,sum,mean,std,sum,mean,std
Region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,3906.0,976.5,1022.549265,1381518.0,345379.5,437269.012014
B,720.0,360.0,414.364574,336366.0,112122.0,115739.739118
C,1001.0,333.666667,458.661458,344666.0,114888.666667,107774.47673
D,983.0,327.666667,474.877177,546330.0,273165.0,282434.004755
E,569.0,284.5,338.704148,112210.0,56105.0,44645.307951


In [111]:
dfGroupedLevel3['Value'].agg({'Total': np.sum, 'Average': np.mean, 'Count': np.size})

Unnamed: 0_level_0,Total,Average,Count
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1381518.0,345379.5,4.0
B,336366.0,112122.0,3.0
C,344666.0,114888.666667,3.0
D,546330.0,273165.0,3.0
E,112210.0,56105.0,2.0


## The transform() method

In [122]:
dfIndexedCode = df.set_index('Code')
dfGroupedByProduct2 = dfIndexedCode.groupby(lambda code: code.split('-')[0])
print(dfGroupedByProduct2.mean())
print(dfGroupedByProduct2.count())

def transformGroup(x):
#     print(x, type(x))
    filled = x.fillna(x.mean())
#     print(filled, type(filled))
    return filled
transformed = dfGroupedByProduct2.transform(transformGroup)
dfGroupedTransformed = transformed.groupby(lambda code: code.split('-')[0])
print(dfGroupedTransformed.mean())
print(dfGroupedTransformed.count())

        Quantiy          Value
D1    70.000000  184485.333333
D2   365.666667  319060.800000
D3   465.000000   31507.000000
D4  1554.500000  199945.000000
D5   403.000000   54713.000000
    Team  Region  Quantiy  Value
D1     3       3        2      3
D2     6       6        6      5
D3     2       2        2      2
D4     2       2        2      2
D5     2       2        2      2
        Quantiy          Value
D1    70.000000  184485.333333
D2   365.666667  319060.800000
D3   465.000000   31507.000000
D4  1554.500000  199945.000000
D5   403.000000   54713.000000
    Quantiy  Value
D1        3      3
D2        6      6
D3        2      2
D4        2      2
D5        2      2


## Filtering

In [140]:
def filterQuantity(x):
    print(x)
    quantity = x.get('Quantity', math.inf)
    print(quantity)
    return True

df.set_index('Team').groupby(level='Team').filter(filterQuantity)

     Region      Code  Quantiy    Value
Team                                   
A1        D  D1-96432     56.0  73454.0
A1        C  D3-09643    863.0  25447.0
A1        D  D2-90858    876.0      NaN
inf
     Region      Code  Quantiy     Value
Team                                    
A2        E  D2-09528     45.0   87674.0
A2        C  D5-09453     54.0   84673.0
A2        D  D2-96532     51.0  472876.0
A2        C  D1-63793     84.0  234546.0
A2        A  D5-09844    752.0   24753.0
inf
     Region      Code  Quantiy     Value
Team                                    
A3        A  D2-63432     45.0  956875.0
A3        E  D2-98734    524.0   24536.0
A3        B  D2-97435    653.0   53343.0
inf
     Region      Code  Quantiy    Value
Team                                   
A4        B  D3-53870     67.0  37567.0
inf
     Region      Code  Quantiy     Value
Team                                    
A5        B  D1-09876      NaN  245456.0
A5        A  D4-73484   2435.0   34745.0
inf
    

Unnamed: 0_level_0,Region,Code,Quantiy,Value
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A1,D,D1-96432,56.0,73454.0
A2,E,D2-09528,45.0,87674.0
A1,C,D3-09643,863.0,25447.0
A3,A,D2-63432,45.0,956875.0
A5,B,D1-09876,,245456.0
A6,A,D4-73524,674.0,365145.0
A2,C,D5-09453,54.0,84673.0
A1,D,D2-90858,876.0,
A3,E,D2-98734,524.0,24536.0
A4,B,D3-53870,67.0,37567.0
