# Groupby-Transform Function

In this notebook, we demonstrate how to use pandas' groupby-transform to add a sector-average price column to a DataFrame.

## 1. Sample Data Preparation

In [1]:
from IPython.display import display_html
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the random sample data is reproducible.
np.random.seed(1)

# Sector code -> sector name. Codes share a leading digit per group
# (1xx, 2xx, 3xx); -1 is the catch-all 'Misc' entry.
MORNINGSTAR_SECTOR_CODES = {
    -1: 'Misc',
    # 1xx
    101: 'Basic Materials',
    102: 'Consumer Cyclical',
    103: 'Financial Services',
    104: 'Real Estate',
    # 2xx
    205: 'Consumer Defensive',
    206: 'Healthcare',
    207: 'Utilities',
    # 3xx
    308: 'Communication Services',
    309: 'Energy',
    310: 'Industrials',
    311: 'Technology',
}
# Sector names only, in the same order as the mapping above.
MORNINGSTAR_SECTOR_NAMES = [*MORNINGSTAR_SECTOR_CODES.values()]

# Letters from which the sample ticker symbols are sliced.
ALPHABETS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

def create_random_data(n=10, n_sectors=5):
    """Build a random sample DataFrame of symbols, sectors and prices.

    Parameters
    ----------
    n : int
        Number of rows to generate.
    n_sectors : int
        Number of distinct sector names drawn (without replacement) from
        MORNINGSTAR_SECTOR_NAMES. Each row is then assigned one of these
        names at random, so a drawn sector may end up with no rows.

    Returns
    -------
    tuple
        ``(df, sectors_to_pick)`` where ``df`` has the columns 'symbol',
        'sector' and 'price', and ``sectors_to_pick`` is the array of the
        ``n_sectors`` candidate sector names.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n_sectors`` exceeds the number
        of available sector names (sampling is without replacement).
    """
    # Draws from a normal distribution with mean 100 and standard deviation
    # 30, rounded to 2 decimals; abs() folds rare negative draws to positive.
    prices = np.abs(np.around(np.random.randn(n) * 30.0 + 100, 2))
    sectors_to_pick = np.random.choice(
        MORNINGSTAR_SECTOR_NAMES, n_sectors, replace=False)
    sectors = np.random.choice(sectors_to_pick, n)

    # Three-letter window sliding over the alphabet: 'ABC', 'BCD', ...
    # The window wraps around ('YZA', 'ZAB') so rows past 'XYZ' still get a
    # full three-letter symbol instead of a truncated or empty one.
    # Symbols repeat once n exceeds the alphabet length.
    width = len(ALPHABETS)
    wrapped = ALPHABETS + ALPHABETS[:2]
    symbols = [wrapped[i % width:i % width + 3] for i in range(n)]

    df = pd.DataFrame({'symbol': symbols, 'sector': sectors, 'price': prices})
    return (df, sectors_to_pick)

# Ten sample rows spread over three randomly drawn sectors; `sectors` keeps
# the drawn sector names for the per-sector views further down.
df, sectors = create_random_data(n=10, n_sectors=3)
df.head(10)

Unnamed: 0,symbol,sector,price
0,ABC,Consumer Defensive,148.73
1,BCD,Consumer Defensive,81.65
2,CDE,Technology,84.15
3,DEF,Technology,67.81
4,EFG,Consumer Defensive,125.96
5,FGH,Technology,30.95
6,GHI,Technology,152.34
7,HIJ,Consumer Defensive,77.16
8,IJK,Financial Services,109.57
9,JKL,Consumer Defensive,92.52


## 2. Viewing sample data by sector

In [2]:
# Render one HTML table per sector and show them side by side.
html_str = ''.join(df[df['sector'] == sector].to_html() for sector in sectors)
# Style only the opening <table> tags: replacing the bare word 'table' would
# also rewrite the closing tags and any cell text that contains "table".
display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

Unnamed: 0,symbol,sector,price
2,CDE,Misc,84.15
3,DEF,Misc,67.81
5,FGH,Misc,30.95
6,GHI,Misc,152.34

Unnamed: 0,symbol,sector,price
0,ABC,Healthcare,148.73
1,BCD,Healthcare,81.65
4,EFG,Healthcare,125.96
7,HIJ,Healthcare,77.16
9,JKL,Healthcare,92.52

Unnamed: 0,symbol,sector,price
8,IJK,Real Estate,109.57


## 3. Applying the groupby-transform function

In [3]:
def mean(x):
    """Return the arithmetic mean of a pandas Series, ignoring NaN values."""
    # Series.mean() skips NaN in both the sum and the count, unlike
    # x.sum() / x.shape[0], which would still count NaN rows in the divisor.
    return x.mean()


# transform() broadcasts each group's result back onto the rows of that
# group, so the output aligns with df's index and can be stored as a column.
df['sector_avg_price'] = df.groupby('sector')['price'].transform(mean)
df

Unnamed: 0,symbol,sector,price,sector_avg_price
0,ABC,Healthcare,148.73,105.204
1,BCD,Healthcare,81.65,105.204
2,CDE,Misc,84.15,83.8125
3,DEF,Misc,67.81,83.8125
4,EFG,Healthcare,125.96,105.204
5,FGH,Misc,30.95,83.8125
6,GHI,Misc,152.34,83.8125
7,HIJ,Healthcare,77.16,105.204
8,IJK,Real Estate,109.57,109.57
9,JKL,Healthcare,92.52,105.204


## 4. Assert the groupby-transformed DataFrame

In [4]:
def assert_transform(df):
    """Check that 'sector_avg_price' holds each row's sector mean price.

    The expected value is recomputed independently of transform(): the mean
    'price' per 'sector' is aggregated once and mapped back onto every row.
    All rows are checked, regardless of the DataFrame's index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'sector', 'price' and 'sector_avg_price'.

    Returns
    -------
    bool
        True when every row matches (an empty frame passes trivially).

    Raises
    ------
    AssertionError
        With the message "Wrong mean" when any row deviates.
    """
    sector_means = df.groupby('sector')['price'].mean()
    expected = df['sector'].map(sector_means)
    # Compare with a tolerance: two different summation paths can disagree in
    # the last bits of a float, which exact == would report as a failure.
    matches = np.allclose(
        df['sector_avg_price'].to_numpy(dtype=float),
        expected.to_numpy(dtype=float),
    )
    assert matches, "Wrong mean"
    return True
    
# Raises AssertionError("Wrong mean") if the check fails; otherwise the cell
# displays the returned True.
assert_transform(df)

True

## 5. Viewing output data by sector

In [5]:
# Per-sector mean price as a two-column frame ('sector', 'price'), using the
# built-in aggregation so this cell does not rely on a helper from another cell.
df1 = df.groupby('sector')['price'].mean().reset_index()
html_str = ''.join(df1[df1['sector'] == sector].to_html() for sector in sectors)
# Style only the opening <table> tags: replacing the bare word 'table' would
# also rewrite the closing tags and any cell text that contains "table".
display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

Unnamed: 0,sector,price
1,Misc,83.8125

Unnamed: 0,sector,price
0,Healthcare,105.204

Unnamed: 0,sector,price
2,Real Estate,109.57


In [6]:
# One HTML table per sector, now including 'sector_avg_price', side by side.
html_str = ''.join(df[df['sector'] == sector].to_html() for sector in sectors)
# Style only the opening <table> tags: replacing the bare word 'table' would
# also rewrite the closing tags and any cell text that contains "table".
display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)

Unnamed: 0,symbol,sector,price,sector_avg_price
2,CDE,Misc,84.15,83.8125
3,DEF,Misc,67.81,83.8125
5,FGH,Misc,30.95,83.8125
6,GHI,Misc,152.34,83.8125

Unnamed: 0,symbol,sector,price,sector_avg_price
0,ABC,Healthcare,148.73,105.204
1,BCD,Healthcare,81.65,105.204
4,EFG,Healthcare,125.96,105.204
7,HIJ,Healthcare,77.16,105.204
9,JKL,Healthcare,92.52,105.204

Unnamed: 0,symbol,sector,price,sector_avg_price
8,IJK,Real Estate,109.57,109.57
