In [3]:
import pandas as pd
import numpy as np

# GroupBy Function

### It allows you to split your data into separate groups to perform computations for better analysis.
### GroupBy allows us to group our data based on different features and get a more accurate idea about your data

## Example

### Let’s say we are trying to analyze the weight of a person in a city. We can easily get a fair idea of their weight by determining the mean weight of all the people. We can group the people into different gender groups and calculate their mean weight. This would give us a better insight into the weight of a person living in the city. We get an even better picture if we further separate these gender groups into different age groups and then take their mean weight.

## Understanding the Dataset and the Problem Statement

### Load Data

In [4]:
sales_data = pd.read_csv('Test.csv')
sales_data

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.600,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
4,FDY38,,Regular,0.118599,Dairy,234.2300,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1
5677,FDD47,7.600,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1


### Check for missing values in dataframe

In [5]:
sales_data.isnull()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,True,False,False
3,False,False,False,False,False,False,False,False,True,False,False
4,False,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
5676,False,False,False,False,False,False,False,False,False,False,False
5677,False,False,False,False,False,False,False,False,False,False,False
5678,False,False,False,False,False,False,False,False,True,False,False
5679,False,False,False,False,False,False,False,False,True,False,False


### Missing values in outlet_size column.. need to handle it

In [6]:
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 0 to 5680
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            5681 non-null   object 
 1   Item_Weight                4705 non-null   float64
 2   Item_Fat_Content           5681 non-null   object 
 3   Item_Visibility            5681 non-null   float64
 4   Item_Type                  5681 non-null   object 
 5   Item_MRP                   5681 non-null   float64
 6   Outlet_Identifier          5681 non-null   object 
 7   Outlet_Establishment_Year  5681 non-null   int64  
 8   Outlet_Size                4075 non-null   object 
 9   Outlet_Location_Type       5681 non-null   object 
 10  Outlet_Type                5681 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 488.3+ KB


In [7]:
sales_data.dropna(axis=0)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
5,FDH56,9.800,Regular,0.063817,Fruits and Vegetables,117.1492,OUT046,1997,Small,Tier 1,Supermarket Type1
6,FDL48,19.350,Regular,0.082602,Baking Goods,50.1034,OUT018,2009,Medium,Tier 3,Supermarket Type2
13,FDU11,4.785,Low Fat,0.092738,Breads,122.3098,OUT049,1999,Medium,Tier 1,Supermarket Type1
14,DRL59,16.750,LF,0.021206,Hard Drinks,52.0298,OUT013,1987,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
5673,FDF46,7.070,Low Fat,0.094053,Snack Foods,116.0834,OUT018,2009,Medium,Tier 3,Supermarket Type2
5674,DRL35,15.700,Low Fat,0.030704,Hard Drinks,43.2770,OUT046,1997,Small,Tier 1,Supermarket Type1
5675,FDW46,13.000,Regular,0.070411,Snack Foods,63.4484,OUT049,1999,Medium,Tier 1,Supermarket Type1
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1


### Group the dataset based on the outlet location type using GroupBy

In [8]:
# Pass Outlet_Location_Type  column as argument
sales_data.groupby('Outlet_Location_Type')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f86ad212f70>

### GroupBy has conveniently returned a DataFrameGroupBy object. 
### It has split the data into separate groups. However, it won’t do anything unless it is being told explicitly to do so.
### So, let’s find the count of different outlet location types:

In [9]:
sales_data.groupby('Outlet_Location_Type').count()

### We can see data is grouped into Tier1, Tier2 and Tier3 which r the values of 'Outlet_Location_Type'.
### So Tier1, Tier2 and Tier3 become indexes of the dataframe.

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Type
Outlet_Location_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Tier 1,1592,1240,1592,1592,1592,1592,1592,1592,1592,1592
Tier 2,1856,1856,1856,1856,1856,1856,1856,1856,620,1856
Tier 3,2233,1609,2233,2233,2233,2233,2233,2233,1863,2233


### Let’s find out the total sale amount for each location type.

In [10]:
sales_data.groupby('Outlet_Location_Type')['Item_MRP']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f8685f291c0>

#### Here, GroupBy has returned a SeriesGroupBy object. 
#### No computation will be done until we specify the aggregation function:

In [11]:
sales_data.groupby('Outlet_Location_Type')['Item_MRP'].sum()

## The Split-Apply-Combine Strategy
#### Using this strategy, a data analyst can break down a big problem into manageable parts, perform operations on individual parts and combine them back together to answer a specific question.

#### Let's understand how this strategy works in GroupBy by working with a sample dataset to get the average height for males and females in a group. 

In [12]:
gender_data = {'Gender': ['m','f','f','m','f','m','m'], 'Height':[172,171,169,173,170,175,178]}

In [13]:
df = pd.DataFrame(gender_data)
df

Unnamed: 0,Gender,Height
0,m,172
1,f,171
2,f,169
3,m,173
4,f,170
5,m,175
6,m,178


### Splitting the data into separate groups:

#### Split based on females

In [14]:
f_filter = df['Gender']=='f'
df[f_filter]

Unnamed: 0,Gender,Height
1,f,171
2,f,169
4,f,170


#### Split based on males

In [15]:
m_filter = df['Gender']=='m'
df[m_filter]

Unnamed: 0,Gender,Height
0,m,172
3,m,173
5,m,175
6,m,178


### Find the mean height of female and male groups

In [16]:
male_avg_h = df[m_filter]['Height'].mean()
male_avg_h

174.5

In [17]:
female_avg_h = df[f_filter]['Height'].mean()
female_avg_h

170.0

### Combine the result to output a DataFrame:

In [18]:
df_output = pd.DataFrame({'Gender': ['m','g'], 'Mean_Height': [male_avg_h, female_avg_h]})
df_output

# We got the mean height for each gender

Unnamed: 0,Gender,Mean_Height
0,m,174.5
1,g,170.0


### Now comes the Power of GroupBy
### All these three steps can be achieved by using GroupBy with just a single line of code

In [19]:
df.groupby('Gender').mean()

Unnamed: 0_level_0,Height
Gender,Unnamed: 1_level_1
f,170.0
m,174.5


#### Now that is smart! Have a look at how GroupBy did that in the image below:

https://cdn.analyticsvidhya.com/wp-content/uploads/2020/03/Split_Apply_Combine.png


1. Get the data
2. Split the data based on gender.
3. Apply aggregation function to compute mean height of each group.
4. Finally combine the result to a new dataframe.

Now that you understand what the Split-Apply-Combine strategy is, let’s dive deeper into the GroupBy function and unlock its full potential.


In [20]:
df = sales_data.groupby('Outlet_Location_Type')

In [21]:
df.count()

Unnamed: 0_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Type
Outlet_Location_Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Tier 1,1592,1240,1592,1592,1592,1592,1592,1592,1592,1592
Tier 2,1856,1856,1856,1856,1856,1856,1856,1856,620,1856
Tier 3,2233,1609,2233,2233,2233,2233,2233,2233,1863,2233


- Display the indices in each group
- call groups on GroupBy Object

In [22]:
df.groups

{'Tier 1': [0, 5, 12, 13, 15, 22, 23, 27, 28, 30, 49, 50, 55, 58, 67, 70, 73, 76, 77, 78, 82, 86, 87, 95, 103, 104, 106, 107, 111, 116, 123, 134, 135, 137, 138, 141, 142, 150, 157, 159, 163, 164, 167, 185, 188, 190, 192, 193, 196, 198, 199, 205, 206, 209, 211, 216, 218, 219, 220, 222, 227, 228, 234, 235, 237, 238, 247, 252, 258, 261, 264, 265, 268, 276, 278, 283, 285, 286, 287, 289, 292, 302, 307, 317, 320, 330, 333, 342, 343, 344, 348, 349, 352, 358, 369, 371, 372, 375, 380, 383, ...], 'Tier 2': [1, 3, 8, 9, 10, 11, 16, 21, 24, 26, 29, 34, 38, 42, 47, 48, 52, 53, 59, 60, 61, 63, 64, 66, 69, 74, 75, 80, 84, 85, 88, 90, 96, 98, 101, 102, 114, 118, 119, 121, 122, 126, 132, 143, 144, 145, 146, 148, 151, 153, 158, 160, 161, 168, 170, 171, 174, 180, 182, 183, 186, 189, 194, 197, 201, 202, 203, 207, 208, 213, 215, 223, 224, 225, 229, 232, 233, 236, 239, 241, 243, 245, 249, 250, 253, 254, 255, 256, 260, 267, 270, 271, 274, 275, 277, 282, 296, 297, 301, 303, ...], 'Tier 3': [2, 4, 6, 7, 14, 17

- Can Iterate over all the groups

In [23]:
for i, j in df:
    print(f'{i} contains {j.shape[0]} rows')

Tier 1 contains 1592 rows
Tier 2 contains 1856 rows
Tier 3 contains 2233 rows


#### But what if you want to get a specific group out of all the groups.
#### Eg: Currently data splitted into 3 groups - Tier 1, Tier 2, Tier 3. We want to get features of Tier 1 group out of other groups.
#### Just provide the specific group name when calling get_group() on the group object. Here, I want to check out the features for the ‘Tier 1’ group

In [24]:
df.get_group('Tier 1')

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
5,FDH56,9.800,Regular,0.063817,Fruits and Vegetables,117.1492,OUT046,1997,Small,Tier 1,Supermarket Type1
12,NCC54,,Low Fat,0.171079,Health and Hygiene,240.4196,OUT019,1985,Small,Tier 1,Grocery Store
13,FDU11,4.785,Low Fat,0.092738,Breads,122.3098,OUT049,1999,Medium,Tier 1,Supermarket Type1
15,FDM24,6.135,Regular,0.079451,Baking Goods,151.6366,OUT049,1999,Medium,Tier 1,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
5671,FDA01,15.000,reg,0.054463,Canned,59.5904,OUT049,1999,Medium,Tier 1,Supermarket Type1
5672,NCH42,6.860,Low Fat,0.036594,Household,231.1010,OUT049,1999,Medium,Tier 1,Supermarket Type1
5674,DRL35,15.700,Low Fat,0.030704,Hard Drinks,43.2770,OUT046,1997,Small,Tier 1,Supermarket Type1
5675,FDW46,13.000,Regular,0.070411,Snack Foods,63.4484,OUT049,1999,Medium,Tier 1,Supermarket Type1


In [25]:
df.get_group('Tier 2')

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
8,FDN33,6.305,Regular,0.123365,Snack Foods,95.7436,OUT045,2002,,Tier 2,Supermarket Type1
9,FDA36,5.985,Low Fat,0.005698,Baking Goods,186.8924,OUT017,2007,,Tier 2,Supermarket Type1
10,FDT44,16.600,Low Fat,0.103569,Fruits and Vegetables,118.3466,OUT017,2007,,Tier 2,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
5662,FDK22,9.800,Low Fat,0.026234,Snack Foods,214.3850,OUT017,2007,,Tier 2,Supermarket Type1
5670,FDO03,10.395,Regular,0.037092,Meat,229.4352,OUT017,2007,,Tier 2,Supermarket Type1
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1


### Apply Functions to GroupBy Groups

The apply step is unequivocally the most important step of a GroupBy function where we can perform a variety of operations using aggregation, transformation, filtration or even with your own function!

### 1. Aggregation

count() – Number of non-null observations

sum() – Sum of values

mean() – Mean of values

median() – Arithmetic median of values

min() – Minimum

max() – Maximum

mode() – Mode

std() – Standard deviation

var() – Variance

In [26]:
sales_data.groupby('Outlet_Location_Type').agg([np.mean])
# Get mean value of columns for each groups.
# Eg, mean of Item_Weight column is computed for each tiers.

Unnamed: 0_level_0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
Unnamed: 0_level_1,mean,mean,mean,mean
Outlet_Location_Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Tier 1,12.68221,0.068593,141.043542,1995.125628
Tier 2,12.833486,0.060839,140.71879,2004.33028
Tier 3,12.546964,0.067637,141.2619,1994.352441


In [27]:
sales_data.groupby('Outlet_Location_Type').agg([np.median])

Unnamed: 0_level_0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
Unnamed: 0_level_1,median,median,median,median
Outlet_Location_Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Tier 1,12.35,0.055336,141.2483,1997.0
Tier 2,12.6,0.049646,140.3996,2004.0
Tier 3,12.3,0.055655,142.6128,1987.0


In [28]:
sales_data.groupby('Outlet_Location_Type').agg([np.min, np.max])

Unnamed: 0_level_0,Item_Identifier,Item_Identifier,Item_Weight,Item_Weight,Item_Fat_Content,Item_Fat_Content,Item_Visibility,Item_Visibility,Item_Type,Item_Type,Item_MRP,Item_MRP,Outlet_Identifier,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Establishment_Year,Outlet_Type,Outlet_Type
Unnamed: 0_level_1,amin,amax,amin,amax,amin,amax,amin,amax,amin,amax,amin,amax,amin,amax,amin,amax,amin,amax
Outlet_Location_Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
Tier 1,DRA12,NCZ53,4.615,21.35,LF,reg,0.0,0.323637,Baking Goods,Starchy Foods,32.09,266.4884,OUT019,OUT049,1985,1999,Grocery Store,Supermarket Type1
Tier 2,DRA24,NCZ54,4.555,21.35,LF,reg,0.0,0.186911,Baking Goods,Starchy Foods,32.0558,266.5884,OUT017,OUT045,2002,2007,Supermarket Type1,Supermarket Type1
Tier 3,DRA12,NCZ54,4.555,21.35,LF,reg,0.0,0.313935,Baking Goods,Starchy Foods,31.99,266.3226,OUT010,OUT027,1985,2009,Grocery Store,Supermarket Type3


We can even run GroupBy with multiple indexes to get better insights from our data:

In [29]:
sales_data.groupby(['Outlet_Location_Type', 'Outlet_Establishment_Year']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Type
Outlet_Location_Type,Outlet_Establishment_Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Tier 1,1985,352,0,352,352,352,352,352,352,352
Tier 1,1997,620,620,620,620,620,620,620,620,620
Tier 1,1999,620,620,620,620,620,620,620,620,620
Tier 2,2002,619,619,619,619,619,619,619,0,619
Tier 2,2004,620,620,620,620,620,620,620,620,620
Tier 2,2007,617,617,617,617,617,617,617,0,617
Tier 3,1985,624,0,624,624,624,624,624,624,624
Tier 3,1987,621,621,621,621,621,621,621,621,621
Tier 3,1998,370,370,370,370,370,370,370,0,370
Tier 3,2009,618,618,618,618,618,618,618,618,618


In [30]:
sales_data.groupby(['Outlet_Location_Type', 'Outlet_Establishment_Year'], as_index=False).count()

# as_index = False, means columns by which we group data will not be outputted as index.
# No index of result is 0,1,2,...

Unnamed: 0,Outlet_Location_Type,Outlet_Establishment_Year,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Type
0,Tier 1,1985,352,0,352,352,352,352,352,352,352
1,Tier 1,1997,620,620,620,620,620,620,620,620,620
2,Tier 1,1999,620,620,620,620,620,620,620,620,620
3,Tier 2,2002,619,619,619,619,619,619,619,0,619
4,Tier 2,2004,620,620,620,620,620,620,620,620,620
5,Tier 2,2007,617,617,617,617,617,617,617,0,617
6,Tier 3,1985,624,0,624,624,624,624,624,624,624
7,Tier 3,1987,621,621,621,621,621,621,621,621,621
8,Tier 3,1998,370,370,370,370,370,370,370,0,370
9,Tier 3,2009,618,618,618,618,618,618,618,618,618


- Call Aggregate functions for better insights

In [31]:
sales_data.groupby(['Outlet_Location_Type', 'Outlet_Establishment_Year'], as_index=False).agg({'Item_MRP': np.mean})

# Group by 2 columns 
# Set index to 0,1,2
# Call agg() on result
# Pass column and aggregation function to apply on same column in dictionary as argument.
    # Eg: agg({'Item_MRP': np.mean})
# Here we explicitly apply aggregate function on a feature called Item_MRP rather that whole dataset.

Unnamed: 0,Outlet_Location_Type,Outlet_Establishment_Year,Item_MRP
0,Tier 1,1985,142.200334
1,Tier 1,1997,138.968874
2,Tier 1,1999,142.46145
3,Tier 2,2002,140.717908
4,Tier 2,2004,138.084348
5,Tier 2,2007,143.366926
6,Tier 3,1985,142.826241
7,Tier 3,1987,140.681858
8,Tier 3,1998,141.732965
9,Tier 3,2009,139.983201


In [32]:
sales_data.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type'],
      dtype='object')

In [33]:
sales_data.groupby(['Outlet_Location_Type', 'Item_Fat_Content'], as_index=False,).agg({'Item_Weight': np.mean})

Unnamed: 0,Outlet_Location_Type,Item_Fat_Content,Item_Weight
0,Tier 1,LF,12.239574
1,Tier 1,Low Fat,12.83765
2,Tier 1,Regular,12.529066
3,Tier 1,low fat,12.146333
4,Tier 1,reg,11.404
5,Tier 2,LF,12.067206
6,Tier 2,Low Fat,13.20995
7,Tier 2,Regular,12.340594
8,Tier 2,low fat,11.52375
9,Tier 2,reg,11.866522


### Rename the aggregated columns

In [34]:
sales_data.groupby(['Outlet_Location_Type', 'Item_Fat_Content'], as_index=False).agg(Mean_Weight=('Item_Weight',np.mean), MRP_MEDIAN=('Item_MRP', np.median))

# We rename the resultant aggregated columns to new names 'Mean_Weight' and 'MRP_Median'.

Unnamed: 0,Outlet_Location_Type,Item_Fat_Content,Mean_Weight,MRP_MEDIAN
0,Tier 1,LF,12.239574,119.5769
1,Tier 1,Low Fat,12.83765,143.3154
2,Tier 1,Regular,12.529066,139.4838
3,Tier 1,low fat,12.146333,144.676
4,Tier 1,reg,11.404,105.4964
5,Tier 2,LF,12.067206,128.2678
6,Tier 2,Low Fat,13.20995,142.447
7,Tier 2,Regular,12.340594,129.6994
8,Tier 2,low fat,11.52375,158.0275
9,Tier 2,reg,11.866522,104.5648


In [35]:
sales_data.groupby('Item_Type', as_index=False).agg(Mean_Weight=('Item_Weight',np.mean))

Unnamed: 0,Item_Type,Mean_Weight
0,Baking Goods,12.272418
1,Breads,10.866799
2,Breakfast,13.759603
3,Canned,12.393565
4,Dairy,12.95504
5,Frozen Foods,12.101543
6,Fruits and Vegetables,13.146659
7,Hard Drinks,11.844417
8,Health and Hygiene,13.216929
9,Household,13.270504


## Transformation

Transformation allows us to perform some computation on the groups as a whole and then return the combined DataFrame. This is done using the transform() function.

We will try to compute the null values in the **Item_Weight** column using the transform() function.

In [36]:
sample_1 = sales_data.groupby('Outlet_Location_Type', as_index=False)
sample_1['Item_Weight'].transform(lambda x:x.fillna(x.mean()))

Unnamed: 0,Item_Weight
0,20.750000
1,8.300000
2,14.600000
3,7.315000
4,12.546964
...,...
5676,10.500000
5677,7.600000
5678,10.000000
5679,15.300000


In [37]:
gender_data = {'Gender': ['m','f','f','m','f','m','m', 'f', 'm', 'f', 'f'], 'Height':[172,171,169,173,170,175,178, 145, 171,150, 160]}

In [38]:
data_df_1 = pd.DataFrame(gender_data)

In [39]:
data_df_1

Unnamed: 0,Gender,Height
0,m,172
1,f,171
2,f,169
3,m,173
4,f,170
5,m,175
6,m,178
7,f,145
8,m,171
9,f,150


In [40]:
grouped_data = data_df_1.groupby('Gender')
grouped_data['Height'].transform(lambda h: (h.max()-h.min()))

## Filtration

Filtration allows us to discard certain values based on computation and return only a subset of the group. 
We can do this using the **filter()** function in Pandas.

In [41]:
sales_data.shape

(5681, 11)

If i wanted only those groups of **Item_Weight** with standard deviation < 3.
We use filter() function to do the job.

In [42]:
grouped = sales_data.groupby('Item_Weight')
df_filter = grouped.filter(lambda item: item['Item_Weight'].std()<3)
df_filter

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,FDW14,8.300,reg,0.038428,Dairy,87.3198,OUT017,2007,,Tier 2,Supermarket Type1
2,NCN55,14.600,Low Fat,0.099575,Others,241.7538,OUT010,1998,,Tier 3,Grocery Store
3,FDQ58,7.315,Low Fat,0.015388,Snack Foods,155.0340,OUT017,2007,,Tier 2,Supermarket Type1
5,FDH56,9.800,Regular,0.063817,Fruits and Vegetables,117.1492,OUT046,1997,Small,Tier 1,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1
5677,FDD47,7.600,Regular,0.142991,Starchy Foods,169.1448,OUT018,2009,Medium,Tier 3,Supermarket Type2
5678,NCO17,10.000,Low Fat,0.073529,Health and Hygiene,118.7440,OUT045,2002,,Tier 2,Supermarket Type1
5679,FDJ26,15.300,Regular,0.000000,Canned,214.6218,OUT017,2007,,Tier 2,Supermarket Type1


In [43]:
df_filter.shape

(4692, 11)

In [44]:
sales_data.shape

(5681, 11)

- GroupBy has conveniently returned a DataFrame with only those groups that have Item_Weight less than 3 standard deviations.
- You can see the difference in shape of original dataframe and filtered dataframe

In [45]:
df = pd.DataFrame(gender_data)
df

Unnamed: 0,Gender,Height
0,m,172
1,f,171
2,f,169
3,m,173
4,f,170
5,m,175
6,m,178
7,f,145
8,m,171
9,f,150


In [46]:
grouped = sales_data.groupby('Item_Visibility')

In [47]:
filtered_df = grouped.filter(lambda f: f['Item_Visibility'].std() < .2)

# Here Item_Viisbility with value < .2 are removed and then stored the resulty in a dataframe.

In [48]:
filtered_df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
56,FDA09,13.350,Regular,0.0,Snack Foods,179.6660,OUT013,1987,High,Tier 3,Supermarket Type1
90,FDN39,19.350,Regular,0.0,Meat,165.7816,OUT045,2002,,Tier 2,Supermarket Type1
109,FDU14,17.750,Low Fat,0.0,Dairy,249.7750,OUT018,2009,Medium,Tier 3,Supermarket Type2
125,FDZ36,,Regular,0.0,Baking Goods,186.4240,OUT027,1985,Medium,Tier 3,Supermarket Type3
140,DRF48,5.730,Low Fat,0.0,Soft Drinks,188.3898,OUT010,1998,,Tier 3,Grocery Store
...,...,...,...,...,...,...,...,...,...,...,...
5643,FDQ19,,Regular,0.0,Fruits and Vegetables,244.3512,OUT019,1985,Small,Tier 1,Grocery Store
5648,FDK26,,Regular,0.0,Canned,187.5240,OUT027,1985,Medium,Tier 3,Supermarket Type3
5654,FDQ57,7.275,Low Fat,0.0,Snack Foods,144.5760,OUT013,1987,High,Tier 3,Supermarket Type1
5661,DRG37,,Low Fat,0.0,Soft Drinks,155.7972,OUT027,1985,Medium,Tier 3,Supermarket Type3


In [49]:
filtered_df.shape

(457, 11)

In [50]:
sales_data.shape

(5681, 11)

## Applying our own functions

- Pandas’ apply() function applies a function along an axis of the DataFrame. 
- When using it with the GroupBy function, we can apply any function to the grouped result

In [55]:
grouped = sales_data.groupby(['Outlet_Establishment_Year'])
grouped['Item_MRP'].apply(lambda x: x/x.sum())

# First group data by Outlet_Establishment_Year
# Second select Item_MRP and apply lambda function
# Lambda function which takes each value of Item_MRP and divide it by sum of all values

### Example - 2 using groupby and apply

In [62]:
grouped = sales_data.groupby(['Outlet_Establishment_Year'], as_index=False)
grouped['Item_Type'].apply(lambda z: z.str.upper())

In [66]:
transformed_data = sales_data.dropna(axis=0)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDW58,20.750,Low Fat,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1
5,FDH56,9.800,Regular,0.063817,Fruits and Vegetables,117.1492,OUT046,1997,Small,Tier 1,Supermarket Type1
6,FDL48,19.350,Regular,0.082602,Baking Goods,50.1034,OUT018,2009,Medium,Tier 3,Supermarket Type2
13,FDU11,4.785,Low Fat,0.092738,Breads,122.3098,OUT049,1999,Medium,Tier 1,Supermarket Type1
14,DRL59,16.750,LF,0.021206,Hard Drinks,52.0298,OUT013,1987,High,Tier 3,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
5673,FDF46,7.070,Low Fat,0.094053,Snack Foods,116.0834,OUT018,2009,Medium,Tier 3,Supermarket Type2
5674,DRL35,15.700,Low Fat,0.030704,Hard Drinks,43.2770,OUT046,1997,Small,Tier 1,Supermarket Type1
5675,FDW46,13.000,Regular,0.070411,Snack Foods,63.4484,OUT049,1999,Medium,Tier 1,Supermarket Type1
5676,FDB58,10.500,Regular,0.013496,Snack Foods,141.3154,OUT046,1997,Small,Tier 1,Supermarket Type1


### Example - 3 GroupBy and Apply()

In [80]:
grouped = transformed_data.groupby('Outlet_Size', as_index=False)
grouped['Item_Weight'].apply(lambda data: data - data.mean())


# First group data by Outlet_Size
# Second select Item_wEIGHT and apply lambda function
# Lambda function which takes each value of Item_Weight and divide it by sum of all values