## Group-Wise Analysis

### 1. Basic Concept

In [1]:
import pandas as pd

In [None]:
df = pd.read_excel('food.xlsx')
df.head()

In [3]:
df.to_pickle('food.pkl')

In [None]:
df = pd.read_pickle('food.pkl')
df

In [None]:
df.info()

In [None]:
df.ProdCategory.value_counts()

In [None]:
df.groupby(by='ProdCategory').size()

In [None]:
print('The number of groups =',df.groupby(by ='ProdCategory').ngroups)

In [None]:
# df.groupby(by ='ProdCategory').groups --> dictionary keyed by group label
print('groups labels are :', list(df.groupby(by ='ProdCategory').groups.keys()))

In [None]:
print('groups items are :', list(df.groupby(by ='ProdCategory').groups.items()))

In [None]:
# Iterate the GroupBy object directly: it yields (label, sub-frame) pairs, so
# the grouping is computed once instead of being rebuilt for every group.
for _, group_df in df.groupby(by='ProdCategory'):
    display(group_df)

In [None]:
df['Price'].mean()

In [None]:
df.groupby('ProdCategory')['Price'].mean()

In [None]:
df.groupby(by = 'ProdCategory')[['Quantity','Price']].mean()

In [None]:
df.groupby(by=['SalesChannel']).ngroups

In [None]:
df['SalesChannel'].value_counts()

In [None]:
df.groupby(by=['ProdCategory','SalesChannel']).ngroups

In [None]:
list(df.groupby(by=['ProdCategory','SalesChannel']).groups.keys())

In [None]:
df.groupby(by=['ProdCategory','SalesChannel']).size()

In [None]:
df.groupby(by=['ProdCategory','SalesChannel']).get_group(('Sesame Products','Internet'))

### 2. Simple Aggregation

In [None]:
print('Total mean Price =',df.Price.mean())
df.groupby(by='ProdCategory').Price.mean()

In [None]:
df.groupby('ProdCategory')['Price'].mean().reset_index()

In [None]:
df.groupby(['ProdCategory','SalesChannel']).Price.mean()

In [None]:
df.groupby(['ProdCategory','SalesChannel']).Price.mean().reset_index()

"""
SELECT 
        ProdCategory
        ,SalesChannel
        ,AVG(Price)
FROM df
GROUP BY ProdCategory
         ,SalesChannel

"""

In [None]:
df.groupby(['ProdCategory','SalesChannel']).Price.max().reset_index()

### 3. Complex Aggregation

<img src='Agg.jpg'  height="400px" width="600px" style = "float:left"/>


| **Aggregation Function** | **Description** |
|:---|:---|
| **count()** | Total number of items. |
| **first(), last()** | First and last item. |
| **mean(), median()** | Mean and median. |
| **min(), max()** | Minimum and maximum. |
| **std(), var()** | Standard deviation and variance. |
| **mad()** | Mean absolute deviation (removed in pandas 2.0). |
| **prod()** | Product of all items. |
| **sum()** | Sum of all items. |

In [None]:
# Named aggregation: every keyword becomes an output column, defined by a
# (source column, aggregation name) pair.
(
    df.groupby(by=['ProdCategory', 'SalesChannel'])
    .agg(
        average_price=('Price', 'mean'),
        min_price=('Price', 'min'),
        max_price=('Price', 'max'),
        median_quant=('Quantity', 'median'),
        var_quant=('Quantity', 'var'),
        count_inv=('Quantity', 'count'),
    )
    .reset_index()
)
 
"""
SELECT 
        ProdCategory
        ,SalesChannel
        ,AVG(Price) AS average_price
        ,MIN(Price) AS min_price
FROM df
GROUP BY ProdCategory
         ,SalesChannel

"""

In [None]:
import numpy as np

# A NumPy reduction can be passed as the aggregation callable. The callable
# must collapse each group to a single value, so np.mean is used here; an
# element-wise function such as np.sqrt returns one value per row and cannot
# serve as an aggregation.
df.groupby(
    ['ProdCategory', 'SalesChannel']
).agg(
    average_price=('Price', np.mean),
    min_price=('Price', 'min'),
    max_price=('Price', 'max'),
    median_quant=('Quantity', 'median'),
    var_quant=('Quantity', 'var'),
    count_inv=('Quantity', 'count'),
).reset_index()

In [None]:
# use custom aggregate function

def range_calc(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

def sum_square_function(x):
    """Return the sum of the squared elements of ``x``.

    The squares are added up with Python's built-in ``sum``, which walks the
    values one at a time (the "direct" variant).
    """
    squared_values = x ** 2
    return sum(squared_values)

def sum_square_method(x):
    """Return the sum of the squared elements of ``x``.

    The squares are added up with the object's own ``sum`` method (the
    "vector" variant).
    """
    squared_values = x ** 2
    return squared_values.sum()

# User-defined callables are passed in place of an aggregation name; each one
# is applied to the 'Price' values of a group and returns a single number.
(
    df.groupby(['ProdCategory', 'SalesChannel'])
    .agg(
        min_price=('Price', 'min'),
        average_price=('Price', 'mean'),
        range_price=('Price', range_calc),
        sumsq_direct=('Price', sum_square_function),
        sumsq_vector=('Price', sum_square_method),
    )
    .reset_index()
)

In [None]:
# Not Recommended
df.groupby(
     ['ProdCategory','SalesChannel']
 ).agg(
    {
      'Price'     :'mean',  
      'Quantity'  :'median', 
      'Quantity'  : 'var'
    }
).reset_index()

In [None]:
# Not Recommended
df.groupby(
     ['ProdCategory','SalesChannel']
 ).agg(
    {
      'Price':'mean',  
      'Quantity':['median','var']
    }
).reset_index()

In [None]:
# Several aggregations of a single column: the output columns are named after
# the functions.
df.groupby('ProdCategory')['Price'].agg(['count', 'min', 'max']).reset_index()

# Same statistics written as a named aggregation, which lets each output
# column be given its own name:
# df.groupby('ProdCategory').agg(
#                              count   = ('Price','count'),
#                              minimum = ('Price','min'),
#                              maximum = ('Price','max')
#                              ).reset_index()