# `Aggregation` and `Groupby` Operations in a `DataFrame`:

In [60]:
import numpy as np 
import pandas as pd 

In [61]:
sales_dataset = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'Store': ['S1', 'S1', 'S2', 'S2', 'S1', 'S2', 'S2', 'S1'],
    'Sales': [100, 200, 150, 250, 120, 180, 200, 300],
    'Quantity': [10, 15, 12, 18, 8, 20, 15, 25],
    'Date': pd.date_range('2023-01-01', periods=8)
}
sales_df = pd.DataFrame(data=sales_dataset)


In [62]:
sales_df

Unnamed: 0,Category,Store,Sales,Quantity,Date
0,A,S1,100,10,2023-01-01
1,B,S1,200,15,2023-01-02
2,A,S2,150,12,2023-01-03
3,B,S2,250,18,2023-01-04
4,A,S1,120,8,2023-01-05
5,B,S2,180,20,2023-01-06
6,A,S2,200,15,2023-01-07
7,B,S1,300,25,2023-01-08


## `Aggregation` of `DataFrame`:
### Aggregation is the method of `DataFrame` to calulate something(`mean`, `median`, `min`, `max`, `count`, `std`, `median`, `mode` etc...) of the Provided `DataFrame/Series`

In [63]:
sales_df['Sales'].mean()
"""mean median min max count std """

'mean median min max count std '

In [64]:
sales_df['Sales'].agg(['sum', 'mean', 'min', 'max', 'count', 'std', 'median'])

sum       1500.000000
mean       187.500000
min        100.000000
max        300.000000
count        8.000000
std         66.062741
median     190.000000
Name: Sales, dtype: float64

In [None]:
#Mode works alone for some reason:
sales_df['Sales'].agg(['mode'])

Unnamed: 0,mode
0,200


## Using `GroupBy` in `DataFrame`:
* ### `groupby` is method of a DataFrame to group a categorical data w.r.t a specific column.

In [66]:
sample_df2 = sales_df.groupby('Category')
sample_df2
#groupby() does not return a DataFrame but returns a GroupBy object.

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f05f31945f0>

In [72]:
#To print groupby():

for group_key, group in sample_df2:
    print("Group:", group_key)
    print(group)

# or
sample_df2.apply(lambda group_key, group: group_key, group , include_groups=False)

Group: A
  Category Store  Sales  Quantity       Date
0        A    S1    100        10 2023-01-01
2        A    S2    150        12 2023-01-03
4        A    S1    120         8 2023-01-05
6        A    S2    200        15 2023-01-07
Group: B
  Category Store  Sales  Quantity       Date
1        B    S1    200        15 2023-01-02
3        B    S2    250        18 2023-01-04
5        B    S2    180        20 2023-01-06
7        B    S1    300        25 2023-01-08


Unnamed: 0_level_0,Unnamed: 1_level_0,Store,Sales,Quantity,Date
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,0,S1,100,10,2023-01-01
A,2,S2,150,12,2023-01-03
A,4,S1,120,8,2023-01-05
A,6,S2,200,15,2023-01-07
B,1,S1,200,15,2023-01-02
B,3,S2,250,18,2023-01-04
B,5,S2,180,20,2023-01-06
B,7,S1,300,25,2023-01-08


In [78]:
# groupby() Category and groupby() store:

sample_df2 = sales_df.groupby(['Category','Store'])
for group_key, group in sample_df2:
    print("Group:", group_key)
    print(group, '\n')


Group: ('A', 'S1')
  Category Store  Sales  Quantity       Date
0        A    S1    100        10 2023-01-01
4        A    S1    120         8 2023-01-05 

Group: ('A', 'S2')
  Category Store  Sales  Quantity       Date
2        A    S2    150        12 2023-01-03
6        A    S2    200        15 2023-01-07 

Group: ('B', 'S1')
  Category Store  Sales  Quantity       Date
1        B    S1    200        15 2023-01-02
7        B    S1    300        25 2023-01-08 

Group: ('B', 'S2')
  Category Store  Sales  Quantity       Date
3        B    S2    250        18 2023-01-04
5        B    S2    180        20 2023-01-06 



In [83]:
# groupby() Category then by Store and calculating the sum of Quantity by aggregation;

sample_df2 = sales_df.groupby(['Category','Store'])['Quantity'].sum()
sample_df2

Category  Store
A         S1       18
          S2       27
B         S1       40
          S2       38
Name: Quantity, dtype: int64

In [None]:
# groupby() Category then by Store and calculating the sum of Sales by aggregation;

sample_df2 = sales_df.groupby(['Category','Store'])['Sales'].agg(['sum', 'mean', 'min', 'max', 'count', 'std', 'median'])
sample_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,mean,min,max,count,std,median
Category,Store,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,S1,220,110.0,100,120,2,14.142136,110.0
A,S2,350,175.0,150,200,2,35.355339,175.0
B,S1,500,250.0,200,300,2,70.710678,250.0
B,S2,430,215.0,180,250,2,49.497475,215.0
