# FDNote5W.ipynb

Prepared by Inmoo Lee for the Financial Databases class at KAIST

inmool@kaist.ac.kr

For portfolio return calculations using SQL

Input files used

    - 2020SP500Constituents_2025_Short.xlsx
    - return_data.ft
    - note4data.xlsx
    - fbhrs.ft
    - Note3w_RHistory2025_Short.xlsx
    

In [27]:
import os #import a package called os

# os.getcwd()  #get the current working directory
# path='D:\\####'#Change this to your directory
# os.chdir(path) # change the working directory

In [28]:
import numpy as np
import pandas as pd
from datetime import datetime as dt

# Calculate portfolio returns using SQL

Equally-weighted vs. value-weighted returns


In [29]:
#Calculate the equally-weighted average return in each month
from pandasql import sqldf
def pysqldf(q):
 return sqldf(q, globals())

In [30]:
#Read the data
df=pd.read_feather('./return_data.ft')

In [31]:


from pandas.tseries.offsets import MonthEnd

## As discussed before, we can find the the month end any # of months before/after
### you have to convert the floating 64 format Date to integer and then to string format
### to use pd_to_datetime

df.loc[:,'date1']=pd.to_datetime(round(df.loc[:,'Date']).astype(int).astype(str),format='%Y%m%d')+MonthEnd(0)
###find the date 12 months after (+12) and before (-12) the current date
df.loc[:,'fmonth12']=df.loc[:,'date1']+MonthEnd(+12)
df.loc[:,'bmonth12']=df.loc[:,'date1']+MonthEnd(-12)
print(df[['date1','fmonth12','bmonth12']].head())

       date1   fmonth12   bmonth12
0 2001-01-31 2002-01-31 2000-01-31
1 2001-02-28 2002-02-28 2000-02-29
2 2001-03-31 2002-03-31 2000-03-31
3 2001-04-30 2002-04-30 2000-04-30
4 2001-05-31 2002-05-31 2000-05-31


In [32]:
###find yyyymm to be used as an time indicator (year+month)
### since date1 is a datetime variable, dt.year and dt.month
### can be used to get year and month from a date.
df.loc[:,'yyyymm']=df.loc[:,'date1'].dt.year*100+df.date1.dt.month
print(df[['yyyymm']].head())

   yyyymm
0  200101
1  200102
2  200103
3  200104
4  200105


## Portfolio return calculation

In [33]:
#Let's import the market cap information 
marketc=pd.read_excel('./note4data.xlsx', sheet_name="marketcap", header=0)


In [34]:
from pandas.tseries.offsets import MonthEnd

# convert "Date" to a datetime variable recognized in Pandas and call it "rdate"
# make a time indicator variable (yyyymm), which is same the one as used above
marketc['rdate']=pd.to_datetime(marketc['Date'].astype(int).astype(str),format='%Y%m%d')+MonthEnd(0)
marketc['yyyymm']=marketc.rdate.dt.year*100+marketc.rdate.dt.month
print(marketc.head())

       Date    ID    Price    Return    Numsh      rdate  yyyymm
0  20010131  MSFT  61.0625  0.407781  5335391 2001-01-31  200101
1  20010228  MSFT  59.0000 -0.033777  5335391 2001-02-28  200102
2  20010330  MSFT  54.6875 -0.073093  5336000 2001-03-31  200103
3  20010430  MSFT  67.7500  0.238857  5382102 2001-04-30  200104
4  20010531  MSFT  69.1800  0.021107  5382102 2001-05-31  200105


#### Find lagged market cap values and ids
Here, by using *groupby(['ID'])*, lagged values are *correctly* identified for each id

It is also important to note that the file should be **sorted by (ID and Date)** before lagged values are identified.


In [35]:
print(marketc.columns)
# Calculate the market capitalization by multiplying Price and Numsh
marketc['marketcap']=marketc.Price * marketc.Numsh

# Sort the marketcap data by ID and Date
# This is important to use .sort_values() before using .shift()
# After sorting, reset the index to avoid confusion
marketcap=marketc.sort_values(['ID','Date'],ascending=True).reset_index(drop=True)
# Check whether the sorting is correct
print(marketcap.head())
##Here, it is important to use .shirt(1) with groupby(['ID']) 
### to correctly get the lagged market cap for each ID
### Otherwise, it may use the other ID's market cap as its lagged value

## shift(1) means the previous row in the group
## If you want to use the next row, use shift(-1)
marketcap['lmc']=marketcap.groupby(['ID'])['marketcap'].shift(1)#lagged marketcap
marketcap['lid']=marketcap.groupby(['ID'])['ID'].shift(1)

Index(['Date', 'ID', 'Price', 'Return', 'Numsh', 'rdate', 'yyyymm'], dtype='object')
       Date   ID   Price    Return    Numsh      rdate  yyyymm     marketcap
0  20010131  IBM  112.00  0.317647  1754380 2001-01-31  200101  1.964906e+08
1  20010228  IBM   99.90 -0.106875  1754380 2001-02-28  200102  1.752626e+08
2  20010330  IBM   96.18 -0.037237  1760804 2001-03-31  200103  1.693541e+08
3  20010430  IBM  115.14  0.197130  1760804 2001-04-30  200104  2.027390e+08
4  20010531  IBM  111.80 -0.027792  1737418 2001-05-31  200105  1.942433e+08


In [36]:
# Check whether the lagged market cap is correct
print(marketcap[['ID','Date','marketcap','lmc','lid']].head())
print(marketcap.loc[160:180,['ID','Date','marketcap','lmc','lid']])

    ID      Date     marketcap           lmc  lid
0  IBM  20010131  1.964906e+08           NaN  NaN
1  IBM  20010228  1.752626e+08  1.964906e+08  IBM
2  IBM  20010330  1.693541e+08  1.752626e+08  IBM
3  IBM  20010430  2.027390e+08  1.693541e+08  IBM
4  IBM  20010531  1.942433e+08  2.027390e+08  IBM
       ID      Date     marketcap           lmc   lid
160   IBM  20140530  1.866061e+08  1.988636e+08   IBM
161   IBM  20140630  1.834784e+08  1.866061e+08   IBM
162   IBM  20140731  1.912085e+08  1.834784e+08   IBM
163   IBM  20140829  1.918369e+08  1.912085e+08   IBM
164   IBM  20140930  1.893729e+08  1.918369e+08   IBM
165   IBM  20141031  1.627001e+08  1.893729e+08   IBM
166   IBM  20141128  1.604932e+08  1.627001e+08   IBM
167   IBM  20141231  1.587811e+08  1.604932e+08   IBM
168  MSFT  20010131  3.257923e+08           NaN   NaN
169  MSFT  20010228  3.147881e+08  3.257923e+08  MSFT
170  MSFT  20010330  2.918125e+08  3.147881e+08  MSFT
171  MSFT  20010430  3.646374e+08  2.918125e+08  MSF

In [37]:
# Read fbhrs.ft, which we created last week.
fbhrs=pd.read_feather('./fbhrs.ft')

In [38]:
# /*add market cap information to bhr file*/
# Use the SQL to combine two dataframes.
# Note that two dataframes are merged by matching ids and yyyymm values
query='''select a.*,b.marketcap,b.lmc,b.rdate
            from fbhrs as a
            left join marketcap as b
            where a.id = b.ID and a.yyyymm = b.yyyymm
            order by a.id, a.yyyymm'''
#Run the query and store the result in bhrmc
bhrmc=pysqldf(query)

print(bhrmc.head())
print(bhrmc.describe())
print(bhrmc['bhr1y'])
print(bhrmc['bhr1y'].tail(20))

    id  yyyymm    logsum  nummonths     bhr1y     marketcap           lmc  \
0  IBM  200101  0.357970         12  0.430423  1.964906e+08           NaN   
1  IBM  200102 -0.032225         12 -0.031711  1.752626e+08  1.964906e+08   
2  IBM  200103 -0.012692         12 -0.012612  1.693541e+08  1.752626e+08   
3  IBM  200104  0.083456         12  0.087038  2.027390e+08  1.693541e+08   
4  IBM  200105 -0.312906         12 -0.268681  1.942433e+08  2.027390e+08   

                        rdate  
0  2001-01-31 00:00:00.000000  
1  2001-02-28 00:00:00.000000  
2  2001-03-31 00:00:00.000000  
3  2001-04-30 00:00:00.000000  
4  2001-05-31 00:00:00.000000  
              yyyymm      logsum   nummonths       bhr1y     marketcap  \
count     468.000000  468.000000  468.000000  435.000000  4.680000e+02   
mean   200706.500000    0.049146   11.576923    0.065536  2.158384e+08   
std       374.582071    0.174003    1.752472    0.186702  5.313461e+07   
min    200101.000000   -0.626890    1.000000   -0

## Value-weighted returns

$R_{vw}=\sum_{i=1}^{n} w_i \times R_i$ where $w_i= \frac{MC_i}{\sum_{i=1}^{n} MC_i} $

Therefore, $R_{vw}=\sum_{i=1}^{n} \frac{MC_i}{\sum_{i=1}^{n} MC_i} \times R_i = \frac{\sum_{i=1}^{n} MC_i \times R_i}{\sum_{i=1}^{n} MC_i}$

One thing you have to be aware of is the fact that the market cap used here should be the market cap at the **beginning of the month (lmc)**, not at the end of the month.  

In [39]:
# /*calculate equally-weighted and value-weighted returns of three stocks */
# Note that sum are calculated in each group specified in "group by"(i.e., in each month (yyyymm)).
# In other words, equally- and value-weighted returns are calculated in each yyyymm

query='''select a.yyyymm, sum(a.bhr1y)/count(a.bhr1y) as ewbhr1y,
                sum(a.bhr1y * a.lmc) / sum(a.lmc) as vwbhr1y, 
                count(a.bhr1y) as numstock
            from bhrmc as a
            group by a.yyyymm
            order by a.yyyymm'''

# Run the query and store the result in portret
portret=pysqldf(query)

# Check the first 20 rows and the last few rows of portret
print(portret.head(20))
print(portret.tail())

    yyyymm   ewbhr1y   vwbhr1y  numstock
0   200101  0.349003       NaN         3
1   200102  0.024466  0.030362         3
2   200103  0.073645  0.068677         3
3   200104  0.136714  0.137508         3
4   200105 -0.137322 -0.147868         3
5   200106 -0.163099 -0.175687         3
6   200107 -0.158866 -0.173322         3
7   200108 -0.239212 -0.236591         3
8   200109 -0.087263 -0.085107         3
9   200110 -0.168423 -0.148427         3
10  200111 -0.099295 -0.087645         3
11  200112 -0.120632 -0.111027         3
12  200201 -0.230607 -0.221988         3
13  200202 -0.241252 -0.239652         3
14  200203 -0.201661 -0.201076         3
15  200204 -0.193554 -0.187987         3
16  200205  0.005709  0.002081         3
17  200206  0.017000  0.001020         3
18  200207  0.025652 -0.004228         3
19  200208  0.137165  0.130813         3
     yyyymm  ewbhr1y  vwbhr1y  numstock
151  201308      NaN      NaN         0
152  201309      NaN      NaN         0
153  201310      Na

In [40]:
# /*Compare average equally- and value-weighted returns*/
# The following calcualte simple averages of monthly equally- and value-
# weighted returns calculated above across all year/month

query='''select avg(a.ewbhr1y) as avgew, avg(a.vwbhr1y) as avgvw
            from portret as a'''
summary=pysqldf(query)
print('Average using SQL   :\n', summary.head())
print('Average using mean   :\n', portret[['ewbhr1y','vwbhr1y']].mean())
print(portret.describe())

Average using SQL   :
       avgew     avgvw
0  0.065536  0.054611
Average using mean   :
 ewbhr1y    0.065536
vwbhr1y    0.054611
dtype: float64
             yyyymm     ewbhr1y     vwbhr1y    numstock
count     156.00000  145.000000  144.000000  156.000000
mean   200706.50000    0.065536    0.054611    2.788462
std       375.38676    0.141365    0.135933    0.770501
min    200101.00000   -0.241252   -0.262408    0.000000
25%    200403.75000   -0.019622   -0.021618    3.000000
50%    200706.50000    0.076353    0.061526    3.000000
75%    201009.25000    0.142813    0.137663    3.000000
max    201312.00000    0.446937    0.413614    3.000000


## Form portfolios based on market cap

Check which company has larger cap during the sample period

In [41]:
#Find the average lagged market cap for each id
# and call it avgmarketcap
# Note that avgmarketcap is the average of lagged market cap (lmc) for each ID acorr time

query='''select b.id, avg(b.lmc) as avgmarketcap 
            from marketcap as b group by b.id'''

summc=pysqldf(query)
print('Average calculated using sql: \n',summc)
print('Average calculated using mean()/groupby(): \n',marketcap.groupby('ID').lmc.mean())

Average calculated using sql: 
      ID  avgmarketcap
0   IBM  1.671088e+08
1  MSFT  2.710105e+08
2   WMT  2.186112e+08
Average calculated using mean()/groupby(): 
 ID
IBM     1.671088e+08
MSFT    2.710105e+08
WMT     2.186112e+08
Name: lmc, dtype: float64


#### Find out the size group using only the market caps in Jan. 2001

In [42]:
# Assign firms into a size portfolio based on the market cap on 20010131

# First, find out the median of the market cap in Jan. 2001 and call it med
bhrmc['med']=np.median(bhrmc[bhrmc['yyyymm']==200101]['marketcap'])# calcualte the median market cap in Jan 2001

# Check the median value
print(bhrmc.med.describe())
print(bhrmc.head(200))
#check the median market cap in Jan 2001 to see whether med is correct
print(bhrmc.loc[bhrmc.yyyymm==200101,['marketcap']].describe())

# Check the median market cap and market cap of each firm in Jan 2001
print(bhrmc[bhrmc.yyyymm==200101][['med','marketcap','id']])

count    4.680000e+02
mean     2.536879e+08
std      2.028727e-06
min      2.536879e+08
25%      2.536879e+08
50%      2.536879e+08
75%      2.536879e+08
max      2.536879e+08
Name: med, dtype: float64
       id  yyyymm    logsum  nummonths     bhr1y     marketcap           lmc  \
0     IBM  200101  0.357970         12  0.430423  1.964906e+08           NaN   
1     IBM  200102 -0.032225         12 -0.031711  1.752626e+08  1.964906e+08   
2     IBM  200103 -0.012692         12 -0.012612  1.693541e+08  1.752626e+08   
3     IBM  200104  0.083456         12  0.087038  2.027390e+08  1.693541e+08   
4     IBM  200105 -0.312906         12 -0.268681  1.942433e+08  2.027390e+08   
..    ...     ...       ...        ...       ...           ...           ...   
195  MSFT  200404  0.083891         12  0.087510  2.820653e+08  2.686706e+08   
196  MSFT  200405  0.082571         12  0.086075  2.831448e+08  2.820653e+08   
197  MSFT  200406  0.101417         12  0.106738  3.102187e+08  2.831448e+08  

In [43]:
#Assign firms into a size portfolio based on the market cap on 20010131

# Use SQL for the portfolio assignment
# find the list of ids available in Jan. 2001 and add a size indicator (isize)
# based on the median market cap in Jan. 2001 found above ("med") (1=large, 2=small)

# Note that "case when" is used to assign a value based on a condition
# If the condition is true, it returns 1, otherwise it returns 2

query='''select a.id, 
                case when a.marketcap >= a.med then 1 else 2 end as isize
            from bhrmc as a
            where a.yyyymm = 200101
            order by a.id'''

# Run the query and store the result in isize
isize=pysqldf(query)

# Check the columns and the first few rows of isize
print(isize.columns)
print(isize)

Index(['id', 'isize'], dtype='object')
     id  isize
0   IBM      2
1  MSFT      1
2   WMT      1


In [44]:
# Combine the above result with the bhrmc dataframe by matching ids
# (adding isize to bhrmc file)

query='''select a.*,b.isize
            from bhrmc as a
            left join isize as b
            on a.id = b.id
            order by a.id, a.yyyymm'''
bhrmc2=pysqldf(query)

# Confirm that isize is added to bhrmc2
print(bhrmc2)

      id  yyyymm    logsum  nummonths     bhr1y     marketcap           lmc  \
0    IBM  200101  0.357970         12  0.430423  1.964906e+08           NaN   
1    IBM  200102 -0.032225         12 -0.031711  1.752626e+08  1.964906e+08   
2    IBM  200103 -0.012692         12 -0.012612  1.693541e+08  1.752626e+08   
3    IBM  200104  0.083456         12  0.087038  2.027390e+08  1.693541e+08   
4    IBM  200105 -0.312906         12 -0.268681  1.942433e+08  2.027390e+08   
..   ...     ...       ...        ...       ...           ...           ...   
463  WMT  201308  0.021951          5       NaN  2.391320e+08  2.553843e+08   
464  WMT  201309  0.081285          4       NaN  2.407735e+08  2.391320e+08   
465  WMT  201310  0.067946          3       NaN  2.489821e+08  2.407735e+08   
466  WMT  201311  0.030918          2       NaN  2.628018e+08  2.489821e+08   
467  WMT  201312 -0.023102          1       NaN  2.546229e+08  2.628018e+08   

                          rdate          med  isize

### Calculate summary statistics for each isize group

In [45]:
# Use SQL to summarize the average, maximum, minimum, and count of bhr1y
# grouped by isize (size portfolio)

query='''select a.isize,avg(a.bhr1y) as avg, max(a.bhr1y) as max, min(a.bhr1y) as min,
                count(a.bhr1y) as num
            from bhrmc2 as a
            group by a.isize'''
summary=pysqldf(query)
print(summary)

#Alternativley, you can use the following code to summarize the average, maximum, minimum, and count of bhr1y
# grouped by isize (size portfolio)
print(bhrmc2.groupby('isize')['bhr1y'].describe())

   isize       avg       max       min  num
0      1  0.054897  0.811814 -0.465749  290
1      2  0.086814  0.586113 -0.360123  145
       count      mean       std       min       25%       50%       75%  \
isize                                                                      
1      290.0  0.054897  0.176339 -0.465749 -0.045086  0.048799  0.133958   
2      145.0  0.086814  0.204856 -0.360123 -0.047152  0.072456  0.232000   

            max  
isize            
1      0.811814  
2      0.586113  


# Calendar-time portfolio formation and return calculation

### Calendar time portfolio is a portfolio formed in each month
#### In each month, the portfolio is composed of firms that had an event within the past 24 mont

A firm is included **only once** even if it had multiple events within the past 24 months.

In [46]:
#Read the data

events=pd.read_excel('./note4data.xlsx', sheet_name="Events", header=0)
print(events.dtypes)
print(events)

ID      object
Date     int64
dtype: object
      ID      Date
0   MSFT  20040105
1   MSFT  20050510
2   MSFT  20081025
3    IBM  20020108
4    IBM  20060207
5    IBM  20090604
6    IBM  20121017
7    WMT  20011202
8    WMT  20030920
9    WMT  20060320
10   WMT  20070205
11   WMT  20081105
12   WMT  20110503
13   WMT  20120224


In [47]:
#/*combine the event data with the return data*/
# Match by ids only

query='''select a.*,b.rdate,b.return,b.lmc,b.marketcap
            from events as a
            left join marketcap as b 
            on a.id=b.id
            order by a.id, a.date,b.date'''

eventret=pysqldf(query)

print(eventret.head())
# Check the number of rows and columns for eventret and two input dataframes
print("Number of rows and columns in eventret:", eventret.shape)
print("Number of rows and columns in events:", events.shape)
print("Number of rows and columns in marketcap:", marketcap.shape)

#This is not a Cartesian product, but a join
# Therefore, the number of rows in eventret is NOT the product of
# the number of rows in events and marketcap

print(events.shape[0]*marketcap.shape[0])

    ID      Date                       rdate    Return           lmc  \
0  IBM  20020108  2001-01-31 00:00:00.000000  0.317647           NaN   
1  IBM  20020108  2001-02-28 00:00:00.000000 -0.106875  1.964906e+08   
2  IBM  20020108  2001-03-31 00:00:00.000000 -0.037237  1.752626e+08   
3  IBM  20020108  2001-04-30 00:00:00.000000  0.197130  1.693541e+08   
4  IBM  20020108  2001-05-31 00:00:00.000000 -0.027792  2.027390e+08   

      marketcap  
0  1.964906e+08  
1  1.752626e+08  
2  1.693541e+08  
3  2.027390e+08  
4  1.942433e+08  
Number of rows and columns in eventret: (2352, 6)
Number of rows and columns in events: (14, 2)
Number of rows and columns in marketcap: (504, 10)
7056


In [48]:
## convert the date format of Date in eventret
## to a datetime variable and call it "edate" (event date)

## In addition, call the return date as "rd" (return date)

## Note that yyyymm indicates year and month of return date (not event date)
eventret['edate']=pd.to_datetime(eventret['Date'].astype(int).astype(str),format='%Y%m%d')

eventret['rd']=pd.to_datetime(eventret['rdate'])

##yyyymm is the year-month of returns
eventret['yyyymm']=eventret.rd.dt.year*100+eventret.rd.dt.month
print(eventret.head())

    ID      Date                       rdate    Return           lmc  \
0  IBM  20020108  2001-01-31 00:00:00.000000  0.317647           NaN   
1  IBM  20020108  2001-02-28 00:00:00.000000 -0.106875  1.964906e+08   
2  IBM  20020108  2001-03-31 00:00:00.000000 -0.037237  1.752626e+08   
3  IBM  20020108  2001-04-30 00:00:00.000000  0.197130  1.693541e+08   
4  IBM  20020108  2001-05-31 00:00:00.000000 -0.027792  2.027390e+08   

      marketcap      edate         rd  yyyymm  
0  1.964906e+08 2002-01-08 2001-01-31  200101  
1  1.752626e+08 2002-01-08 2001-02-28  200102  
2  1.693541e+08 2002-01-08 2001-03-31  200103  
3  2.027390e+08 2002-01-08 2001-04-30  200104  
4  1.942433e+08 2002-01-08 2001-05-31  200105  


In [49]:
from dateutil.relativedelta import *

## You can use the "relativedelta" fundtion to find out
## the date corresponding to a certain number of months 
#  plus (or minus) a certain date.  
# (MonthEnd() is not used here since event date is not necessarily the end of a month)

##Find out the date 24 months after the event date

## "e24" is the date 24 months after the event date
## eyyyymm and e24yyyymm are year and months of event date and 24 months after the corresponding event date.


#appy.(lambda x: f(x)) is used to run f(x) function 
# for each observation of a column (here, eventret.edate)


eventret['e24']=eventret.edate.apply(lambda x: x+relativedelta(months=+24))
#check whether e24 is correct
print(eventret[['edate','e24']])

# eyyyymm and e24yyyymm are year and months of event date and 24 months after the corresponding event date.
eventret['eyyyymm']=eventret.edate.dt.year*100+eventret.edate.dt.month
eventret['e24yyyymm']=eventret.e24.dt.year*100+eventret.e24.dt.month

#Check the output of eventret
print(eventret.head())
print(eventret.tail())
print(eventret[['ID','edate','yyyymm','e24','rd','eyyyymm','e24yyyymm']])

          edate        e24
0    2002-01-08 2004-01-08
1    2002-01-08 2004-01-08
2    2002-01-08 2004-01-08
3    2002-01-08 2004-01-08
4    2002-01-08 2004-01-08
...         ...        ...
2347 2012-02-24 2014-02-24
2348 2012-02-24 2014-02-24
2349 2012-02-24 2014-02-24
2350 2012-02-24 2014-02-24
2351 2012-02-24 2014-02-24

[2352 rows x 2 columns]
    ID      Date                       rdate    Return           lmc  \
0  IBM  20020108  2001-01-31 00:00:00.000000  0.317647           NaN   
1  IBM  20020108  2001-02-28 00:00:00.000000 -0.106875  1.964906e+08   
2  IBM  20020108  2001-03-31 00:00:00.000000 -0.037237  1.752626e+08   
3  IBM  20020108  2001-04-30 00:00:00.000000  0.197130  1.693541e+08   
4  IBM  20020108  2001-05-31 00:00:00.000000 -0.027792  2.027390e+08   

      marketcap      edate         rd  yyyymm        e24  eyyyymm  e24yyyymm  
0  1.964906e+08 2002-01-08 2001-01-31  200101 2004-01-08   200201     200401  
1  1.752626e+08 2002-01-08 2001-02-28  200102 2004-01-08   2

In [50]:
## Compare the results with those using MonthEnd()
eventret['e24a']=eventret.edate+MonthEnd(+24)
print(eventret[['edate','e24','e24a']])

          edate        e24       e24a
0    2002-01-08 2004-01-08 2003-12-31
1    2002-01-08 2004-01-08 2003-12-31
2    2002-01-08 2004-01-08 2003-12-31
3    2002-01-08 2004-01-08 2003-12-31
4    2002-01-08 2004-01-08 2003-12-31
...         ...        ...        ...
2347 2012-02-24 2014-02-24 2014-01-31
2348 2012-02-24 2014-02-24 2014-01-31
2349 2012-02-24 2014-02-24 2014-01-31
2350 2012-02-24 2014-02-24 2014-01-31
2351 2012-02-24 2014-02-24 2014-01-31

[2352 rows x 3 columns]


## Calculate calendar time portfolio return

The portfolios are composed of firms with the events that occurred within the past 24 months

Find out the list of the stocks that satisfy the condition in each month

In each month, id will be listed if it satisfy the condition in where i.e., return date is within the 24-month window starting from the month after event month.

#### **distinct** is used in the "select" statement to select only unique observations (prevent same values to be selected multiple times.

In [51]:
#Save the eventret dataframe to an Excel file
# This will create an Excel file named 'eventret.xlsx' in the current directory
eventret.to_excel('./eventret.xlsx')

In [52]:
print(eventret.head())

    ID      Date                       rdate    Return           lmc  \
0  IBM  20020108  2001-01-31 00:00:00.000000  0.317647           NaN   
1  IBM  20020108  2001-02-28 00:00:00.000000 -0.106875  1.964906e+08   
2  IBM  20020108  2001-03-31 00:00:00.000000 -0.037237  1.752626e+08   
3  IBM  20020108  2001-04-30 00:00:00.000000  0.197130  1.693541e+08   
4  IBM  20020108  2001-05-31 00:00:00.000000 -0.027792  2.027390e+08   

      marketcap      edate         rd  yyyymm        e24  eyyyymm  e24yyyymm  \
0  1.964906e+08 2002-01-08 2001-01-31  200101 2004-01-08   200201     200401   
1  1.752626e+08 2002-01-08 2001-02-28  200102 2004-01-08   200201     200401   
2  1.693541e+08 2002-01-08 2001-03-31  200103 2004-01-08   200201     200401   
3  2.027390e+08 2002-01-08 2001-04-30  200104 2004-01-08   200201     200401   
4  1.942433e+08 2002-01-08 2001-05-31  200105 2004-01-08   200201     200401   

        e24a  
0 2003-12-31  
1 2003-12-31  
2 2003-12-31  
3 2003-12-31  
4 2003-12-3

In [None]:
##Notice the use of "distinct" here
## If not used, it can list the same yyyymm id row mutiple times

## Note that the conditions in the "where" statement dictates which observations to include
## based on yyyymm, event yyyymm (eyyyymm) and 24 months after the event yyyymm (e24yyyymm)

print(eventret.columns)

query='''select distinct a.yyyymm, a.id
                  from eventret as a
                  where  a.yyyymm > a.eyyyymm and a.yyyymm <= a.e24yyyymm
                  order by a.yyyymm,a.id''' # a.yyyymm > a.eyyyymm 해준 것은 forward looking 안하기 위함. 
                  # distinct 는 event가 24m 윈도우 내에서 여러 번 발생하더라도 1 번만 남기기 위함. 

portdat=pysqldf(query)
print(portdat.head())
print(portdat.iloc[20:50,])

Index(['ID', 'Date', 'rdate', 'Return', 'lmc', 'marketcap', 'edate', 'rd',
       'yyyymm', 'e24', 'eyyyymm', 'e24yyyymm', 'e24a'],
      dtype='object')
   yyyymm   ID
0  200201  WMT
1  200202  IBM
2  200202  WMT
3  200203  IBM
4  200203  WMT
    yyyymm    ID
20  200211   WMT
21  200212   IBM
22  200212   WMT
23  200301   IBM
24  200301   WMT
25  200302   IBM
26  200302   WMT
27  200303   IBM
28  200303   WMT
29  200304   IBM
30  200304   WMT
31  200305   IBM
32  200305   WMT
33  200306   IBM
34  200306   WMT
35  200307   IBM
36  200307   WMT
37  200308   IBM
38  200308   WMT
39  200309   IBM
40  200309   WMT
41  200310   IBM
42  200310   WMT
43  200311   IBM
44  200311   WMT
45  200312   IBM
46  200312   WMT
47  200401   IBM
48  200401   WMT
49  200402  MSFT


In [54]:
### We are now ready to calculate the calendar time portfolio returns
### We will calculate the equally- and value-weighted returns in each month

## calcualte the equally- and value-weighted calendar time portfolio returns in each month
## after joining market cap information.

query='''select a.yyyymm, sum(b.return)/count(b.return) as ewret,
                  sum(b.return*b.lmc)/sum(b.lmc) as vwret, count(b.return) as numstock
                from portdat as a 
                left join marketcap as b
                on  (a.id=b.id and a.yyyymm=b.yyyymm)
                group by a.yyyymm
                order by a.yyyymm'''

calret=pysqldf(query)
print(calret.head())
print(calret.tail())
print(calret.describe())

   yyyymm     ewret     vwret  numstock
0  200201  0.042224  0.042224         1
1  200202 -0.027707 -0.016640         2
2  200203  0.024844  0.016387         2
3  200204 -0.141680 -0.130681         2
4  200205 -0.034617 -0.033792         2
     yyyymm     ewret     vwret  numstock
149  201406 -0.016761 -0.016761         1
150  201407  0.057373  0.057373         1
151  201408  0.009026  0.009026         1
152  201409 -0.012845 -0.012845         1
153  201410 -0.133962 -0.133962         1
              yyyymm       ewret       vwret    numstock
count     154.000000  154.000000  154.000000  154.000000
mean   200798.642857    0.007749    0.007113    1.909091
std       371.362985    0.049153    0.047607    0.689450
min    200201.000000   -0.152166   -0.142266    1.000000
25%    200503.250000   -0.016479   -0.016731    1.000000
50%    200805.500000    0.008188    0.007264    2.000000
75%    201107.750000    0.034472    0.036668    2.000000
max    201410.000000    0.220664    0.170629    3.00

# P/B portfolio formation and return calculation

Input files used

    - 2020SP500Constituents_2025_Short.xlsx

We will make 3 dataframes (return, market cap and MB ratios) out of the input file and then combine them to calculate portfolio returns

2020SP500Constituents_2025.xlsx file includes the list of firms included in S&P 500 as of 2020 and other information of these firms (returns, market capitalization and market-to-book equity ratio) retrieved from Bloomberg in Excel as will be discussed in Note6W.  You will find that the information can be retrieved directly from Bloomberg using API as will be discussed in Note6W (and shown in FDNote6W2025.ipynb).

#### First, Make Return Dataframe

In [55]:
#########################################################
########read the return data
#########################################################
returns=pd.read_excel('./2020SP500Constituents_2025_Short.xlsx', sheet_name="Returns", header=0)
print(returns.columns[:5])
print(returns.iloc[:5,:5])

#discard the first three columns
returns=returns.iloc[:,3:].copy()#The first one is row and the second one is column
print(returns.columns[:5])
print(returns.iloc[:5,:5])


Index(['CUST_TRR_RETURN_HOLDING_PER', 'Date', 'Bdate', 'Edate',
       'WEC UN Equity'],
      dtype='object')
   CUST_TRR_RETURN_HOLDING_PER       Date      Bdate      Edate  WEC UN Equity
0                          NaN 2020-07-01 2020-06-30 2020-07-31       8.682259
1                          NaN 2020-08-01 2020-07-31 2020-08-31      -0.555573
2                          NaN 2020-09-01 2020-08-31 2020-09-30       2.997449
3                          NaN 2020-10-01 2020-09-30 2020-10-30       3.766770
4                          NaN 2020-11-01 2020-10-30 2020-11-30      -4.986378
Index(['Edate', 'WEC UN Equity', 'NWS UW Equity', 'LYB UN Equity',
       'AXP UN Equity'],
      dtype='object')
       Edate  WEC UN Equity  NWS UW Equity  LYB UN Equity  AXP UN Equity
0 2020-07-31       8.682259       6.778243      -4.869142      -1.526615
1 2020-08-31      -0.555573      18.103450       6.348149       8.861980
2 2020-09-30       2.997449      -6.623806       7.651191      -1.319027
3 2020-10

In [56]:
## select only those with available "Edate" information and rename "Edate" as "date"
ret0=returns.dropna(subset=['Edate']).copy()#drop all rows with Edate=NaN

#Rename
# it is important to use inplace=True to change the original dataframe
ret0.rename(columns={'Edate': 'date'}, inplace=True)

##check
print(ret0.shape)
print(ret0.columns)
print(ret0.columns.values[:5])

(54, 506)
Index(['date', 'WEC UN Equity', 'NWS UW Equity', 'LYB UN Equity',
       'AXP UN Equity', 'VZ UN Equity', 'AVGO UW Equity', 'BA UN Equity',
       'CAT UN Equity', 'JPM UN Equity',
       ...
       'NCLH UN Equity', 'MS UN Equity', 'APH UN Equity', 'PXD UN Equity',
       'DVN UN Equity', 'FTI UN Equity', 'WYNN UW Equity', 'APA UW Equity',
       'ALB UN Equity', 'VNT UN Equity'],
      dtype='object', length=506)
['date' 'WEC UN Equity' 'NWS UW Equity' 'LYB UN Equity' 'AXP UN Equity']


In [57]:
##sort by date
ret0sort=ret0.sort_values(['date'])

print(ret0sort.iloc[:5,:5])

        date  WEC UN Equity  NWS UW Equity  LYB UN Equity  AXP UN Equity
0 2020-07-31       8.682259       6.778243      -4.869142      -1.526615
1 2020-08-31      -0.555573      18.103450       6.348149       8.861980
2 2020-09-30       2.997449      -6.623806       7.651191      -1.319027
3 2020-10-30       3.766770      -6.866953      -2.894027      -8.618539
4 2020-11-30      -4.986378      36.789550      25.810100      29.975890


#### Trasnpose the data to calculate portfolio returns

In the current format, returns are across different columns.
To calculate portfolio returns, it is easy to have returns in one column, not across different columns

In [58]:
##transpose the data (column names are called "id" and values are called "ret")
# pd.melt() is used to reshape the DataFrame
# It converts the DataFrame from wide format to long format

#id_vars=['date'] specifies the columns to keep as identifiers
#var_name='id' specifies the name of the new column that will contain the former column names
#value_name='ret' specifies the name of the new column that will contain the former values

returns0=pd.melt(ret0sort,id_vars=['date'],var_name='id',value_name='ret')

#check the first few rows and the rows with ret=-99
print(returns0.head())
print (returns0.loc[returns0['ret']==-99])

        date             id       ret
0 2020-07-31  WEC UN Equity  8.682259
1 2020-08-31  WEC UN Equity -0.555573
2 2020-09-30  WEC UN Equity  2.997449
3 2020-10-30  WEC UN Equity  3.766770
4 2020-11-30  WEC UN Equity -4.986378
Empty DataFrame
Columns: [date, id, ret]
Index: []


In [59]:
# Print the row at index 352
# This will show the data for the specific date and id at that index
print(returns0.iloc[352,:])

date    2022-11-30 00:00:00
id             BA UN Equity
ret                25.52102
Name: 352, dtype: object


In [60]:
###Replace the "ret" column values of -99 as missing returns
returns0.loc[returns0['ret']==-99,'ret']=np.nan
### convert the return in % to decimal by dividing the return values by 100
returns0['ret']=returns0['ret']/100.0

#check
print(returns0.iloc[352,:])
print(returns0.head())
print(returns0.ret.describe())

date    2022-11-30 00:00:00
id             BA UN Equity
ret                 0.25521
Name: 352, dtype: object
        date             id       ret
0 2020-07-31  WEC UN Equity  0.086823
1 2020-08-31  WEC UN Equity -0.005556
2 2020-09-30  WEC UN Equity  0.029974
3 2020-10-30  WEC UN Equity  0.037668
4 2020-11-30  WEC UN Equity -0.049864
count    1080.000000
mean        0.017066
std         0.087469
min        -0.244431
25%        -0.037846
50%         0.012920
75%         0.063766
max         0.459312
Name: ret, dtype: float64


In [61]:
#use query to check summary statistics of monthly returns

query='''
        select avg(a.ret) as avg, count(a.ret) as num,
            sum(a.ret) as sum, min(a.ret) as min, max(a.ret) as max
        from returns0 as a
        where a.ret not null
    '''
print(pysqldf(query))

        avg   num        sum       min       max
0  0.017066  1080  18.431564 -0.244431  0.459312


#### Second, Make Market Cap Dataframe

In [62]:
## Read the market capitalization information 
## and drop the observations without Bdate and rename Bdate as date.
## Bdat is the date of market cap calculation in the "MarketCap" worksheet (Edate is a month after Bdate)

mc=pd.read_excel('./2020SP500Constituents_2025_Short.xlsx', sheet_name="MarketCap", header=0)
print(mc.columns[:5])
mc=mc.iloc[:,2:]#skip the first two columns
print(mc.columns[:5])

Index(['CUR_MKT_CAP', 'Date', 'Bdate', 'Edate', 'WEC UN Equity'], dtype='object')
Index(['Bdate', 'Edate', 'WEC UN Equity', 'NWS UW Equity', 'LYB UN Equity'], dtype='object')


In [63]:
mc=mc.drop(['Edate'],axis=1) #drop the Edate column

##In the data, market cap is the market cap on Bdate
mc.rename(columns={'Bdate': 'date'}, inplace=True)#rename Bdate column as date
mc=mc.dropna(subset=['date'])#if date is missing, drop
mcsort=mc.sort_values(['date'])#sort by date

In [64]:
print(mcsort.iloc[:5,:5])

        date  WEC UN Equity  NWS UW Equity  LYB UN Equity  AXP UN Equity
0 2020-06-30     27647.8366      6997.4238     21931.1744     76633.2867
1 2020-07-31     30048.2934      7493.5412     20871.6320     75137.6358
2 2020-08-31     29676.0807      8890.0126     21859.7963     81796.3183
3 2020-09-30     30565.6061      8244.5225     23532.3311     80717.4024
4 2020-10-30     31716.9421      7732.3846     22856.7600     73466.6260


In [65]:
#pd.melt: This function is useful to massage a DataFrame into a format where one or more
# columns are identifier variables (id_vars), while all other columns, 
#considered measured variables (value_vars), are “unpivoted” to the row axis, 
#leaving just two non-identifier columns, ‘variable’ and ‘value’.

mc0=pd.melt(mcsort,id_vars=['date'],var_name='id',value_name='mcap')#change the format
mc0.loc[mc0['mcap']==0.0,'mcap']=np.nan
mc0sort=mc0.sort_values(['id','date']).reset_index(drop=True)


#check
print(mc.iloc[:5,:5])
#print(mc.columns.values[:5])
#print(mc.shape)

print(mc0sort.iloc[:5,])
#print(mc0.loc[mc0['mcap']==0.0])

        date  WEC UN Equity  NWS UW Equity  LYB UN Equity  AXP UN Equity
0 2020-06-30     27647.8366      6997.4238     21931.1744     76633.2867
1 2020-07-31     30048.2934      7493.5412     20871.6320     75137.6358
2 2020-08-31     29676.0807      8890.0126     21859.7963     81796.3183
3 2020-09-30     30565.6061      8244.5225     23532.3311     80717.4024
4 2020-10-30     31716.9421      7732.3846     22856.7600     73466.6260
        date                  id  mcap
0 2020-06-30  2078185D UN Equity   NaN
1 2020-07-31  2078185D UN Equity   NaN
2 2020-08-31  2078185D UN Equity   NaN
3 2020-09-30  2078185D UN Equity   NaN
4 2020-10-30  2078185D UN Equity   NaN


#### Third, Make M/B Dataframe

In [66]:
#########for m/b  ###################################################################################

##In the MB worksheet, Edate is the date of market cap to book value information.

mb=pd.read_excel('./2020SP500Constituents_2025_Short.xlsx', sheet_name="MB", header=0)
print(mb.columns[:5])
mb=mb.iloc[:,2:]#skip the first two columns
print(mb.columns[:5])

Index(['MARKET_CAPITALIZATION_TO_BV', 'Bdate', 'Edate', 'WEC UN Equity',
       'NWS UW Equity'],
      dtype='object')
Index(['Edate', 'WEC UN Equity', 'NWS UW Equity', 'LYB UN Equity',
       'AXP UN Equity'],
      dtype='object')


In [67]:
## Process the MB data as we did for other data

mb.rename(columns={'Edate': 'date'}, inplace=True)#rename Edate columns
mb=mb.dropna(subset=['date'])#drop if date is missing
mbsort=mb.sort_values(['date'])# sort by date
mb0=pd.melt(mbsort,id_vars=['date'],var_name='id',value_name='mb')#change the format
mb0['year']=mb0['date'].dt.year#get year out of date and make it "year" column

#check
#print(mb.columns.values[:5])
print(mb.shape)
print(mb0.head())

(5, 506)
        date             id      mb  year
0 2020-06-30  WEC UN Equity  2.6626  2020
1 2021-06-30  WEC UN Equity  2.5905  2021
2 2022-06-30  WEC UN Equity  2.8117  2022
3 2023-06-30  WEC UN Equity  2.3828  2023
4 2024-06-28  WEC UN Equity  2.0457  2024


#### Drop rows with missing mb values

In [68]:
print(mb0.shape)
mb1=mb0.dropna(subset=['mb'])#drop those with missing mb
print(mb1.shape)
print(mb1.groupby(['date'])['mb'].describe())

(2525, 4)
(100, 4)
            count      mean         std       min       25%      50%  \
date                                                                   
2020-06-30   20.0  -2.52474   19.859317  -77.2004  1.179375  2.50415   
2021-06-30   20.0  13.13700   42.965612  -10.7934  1.847675  3.39870   
2022-06-30   20.0  -5.20962   38.242032 -164.9399  1.545100  2.18875   
2023-06-30   20.0  46.55105  192.228118  -12.1895  1.604475  2.34635   
2024-06-28   20.0  12.49511   43.154660  -37.4117  1.880250  2.35075   

                 75%       max  
date                            
2020-06-30  4.993675   11.7642  
2021-06-30  7.156900  193.9717  
2022-06-30  6.050300   18.4707  
2023-06-30  6.369175  862.7349  
2024-06-28  6.680075  187.5567  


##### Find the cutoff points for MB quintiles (5 groups)


In [69]:
## First, find out the percentile values of MB ratios in each month
mb2 = mb1.groupby(['date'])['mb'].describe(percentiles=[.2, .4, .6,.8]).reset_index()
print(mb2.head())

# Second, rename the columns for easier access
mb2 = mb2[['date','20%','40%','60%','80%']]\
.rename(columns={'20%':'quint20','40%':'quint40','60%':'quint60','80%':'quint80'})

print(mb2.columns)
print(mb2.head())

        date  count      mean         std       min      20%      40%  \
0 2020-06-30   20.0  -2.52474   19.859317  -77.2004  1.02410  1.67098   
1 2021-06-30   20.0  13.13700   42.965612  -10.7934  1.79258  2.92506   
2 2022-06-30   20.0  -5.20962   38.242032 -164.9399  1.26804  1.97252   
3 2023-06-30   20.0  46.55105  192.228118  -12.1895  1.48664  2.06024   
4 2024-06-28   20.0  12.49511   43.154660  -37.4117  1.83690  2.03278   

       50%      60%      80%       max  
0  2.50415  3.24376  5.27006   11.7642  
1  3.39870  4.31894  7.83904  193.9717  
2  2.18875  2.61136  6.27686   18.4707  
3  2.34635  4.24056  7.56532  862.7349  
4  2.35075  3.81054  9.50956  187.5567  
Index(['date', 'quint20', 'quint40', 'quint60', 'quint80'], dtype='object')
        date  quint20  quint40  quint60  quint80
0 2020-06-30  1.02410  1.67098  3.24376  5.27006
1 2021-06-30  1.79258  2.92506  4.31894  7.83904
2 2022-06-30  1.26804  1.97252  2.61136  6.27686
3 2023-06-30  1.48664  2.06024  4.24056  7.

Add quintile cutoff points and divide firms in to 5 groups based on mb;


In [70]:
# add cutoff points to the original mb data
# Note that columns to be included are selected from mb0 to avoid
# date column being duplicated in the result (both a and b have the date column)
query='''select a.id,a.mb,a.year,b.*
              from mb0 as a 
              left join mb2 as b 
              on a.date = b.date'''
              
mb3=pysqldf(query)
print(mb3.columns)
print(mb3.head())

Index(['id', 'mb', 'year', 'date', 'quint20', 'quint40', 'quint60', 'quint80'], dtype='object')
              id      mb  year                        date  quint20  quint40  \
0  WEC UN Equity  2.6626  2020  2020-06-30 00:00:00.000000  1.02410  1.67098   
1  WEC UN Equity  2.5905  2021  2021-06-30 00:00:00.000000  1.79258  2.92506   
2  WEC UN Equity  2.8117  2022  2022-06-30 00:00:00.000000  1.26804  1.97252   
3  WEC UN Equity  2.3828  2023  2023-06-30 00:00:00.000000  1.48664  2.06024   
4  WEC UN Equity  2.0457  2024  2024-06-28 00:00:00.000000  1.83690  2.03278   

   quint60  quint80  
0  3.24376  5.27006  
1  4.31894  7.83904  
2  2.61136  6.27686  
3  4.24056  7.56532  
4  3.81054  9.50956  


### Use SQL to form portfolios and calculate returns of the portfolio composed of all stocks in each portfolio


In [71]:
#Find out which MB portfolios each stock belongs to in each year

# Assgin firms into 5 groups based on mb using SQL;
# Note that "case when" is used to assign a value based on a condition
# new column "pmb" is the portfolio based on mb

query='''select a.*,
                case when a.mb <= a.quint20 then 1 else 
                    case when a.mb <= a.quint40 then 2 else
                        case when a.mb <= a.quint60 then 3 else
                            case when a.mb <= a.quint80 then 4 else 5
                            end
                        end
                    end
                end as pmb
            from mb3 as a'''
mb3a=pysqldf(query)
print(mb3a.head())

              id      mb  year                        date  quint20  quint40  \
0  WEC UN Equity  2.6626  2020  2020-06-30 00:00:00.000000  1.02410  1.67098   
1  WEC UN Equity  2.5905  2021  2021-06-30 00:00:00.000000  1.79258  2.92506   
2  WEC UN Equity  2.8117  2022  2022-06-30 00:00:00.000000  1.26804  1.97252   
3  WEC UN Equity  2.3828  2023  2023-06-30 00:00:00.000000  1.48664  2.06024   
4  WEC UN Equity  2.0457  2024  2024-06-28 00:00:00.000000  1.83690  2.03278   

   quint60  quint80  pmb  
0  3.24376  5.27006    3  
1  4.31894  7.83904    2  
2  2.61136  6.27686    4  
3  4.24056  7.56532    3  
4  3.81054  9.50956    3  


# Combine 3 dataframes

Add new variables

To combine data for each month, we create year and month columns

In [72]:
#/*combine the data*/
mb5=mb3a[mb3a['mb'].notnull()].copy()# Get rid of ones with missing mb values

print(mb5.date)
#the following change the format of date to a simmpler datetime format
mb5['date']=pd.to_datetime((mb5['date']).astype(str))
print(mb5.date)

0     2020-06-30 00:00:00.000000
1     2021-06-30 00:00:00.000000
2     2022-06-30 00:00:00.000000
3     2023-06-30 00:00:00.000000
4     2024-06-28 00:00:00.000000
                 ...            
95    2020-06-30 00:00:00.000000
96    2021-06-30 00:00:00.000000
97    2022-06-30 00:00:00.000000
98    2023-06-30 00:00:00.000000
99    2024-06-28 00:00:00.000000
Name: date, Length: 100, dtype: object
0    2020-06-30
1    2021-06-30
2    2022-06-30
3    2023-06-30
4    2024-06-28
        ...    
95   2020-06-30
96   2021-06-30
97   2022-06-30
98   2023-06-30
99   2024-06-28
Name: date, Length: 100, dtype: datetime64[ns]


In [73]:
# Add year and month columns

returns0['year']=returns0['date'].dt.year
mc0sort['year']=mc0sort['date'].dt.year
mb5['year']=mb5['date'].dt.year

returns0['month']=returns0['date'].dt.month
mc0sort['month']=mc0sort['date'].dt.month
mb5['month']=mb5['date'].dt.month

print(returns0.head())
print(mb5.head())
print(mc0.head())

        date             id       ret  year  month
0 2020-07-31  WEC UN Equity  0.086823  2020      7
1 2020-08-31  WEC UN Equity -0.005556  2020      8
2 2020-09-30  WEC UN Equity  0.029974  2020      9
3 2020-10-30  WEC UN Equity  0.037668  2020     10
4 2020-11-30  WEC UN Equity -0.049864  2020     11
              id      mb  year       date  quint20  quint40  quint60  quint80  \
0  WEC UN Equity  2.6626  2020 2020-06-30  1.02410  1.67098  3.24376  5.27006   
1  WEC UN Equity  2.5905  2021 2021-06-30  1.79258  2.92506  4.31894  7.83904   
2  WEC UN Equity  2.8117  2022 2022-06-30  1.26804  1.97252  2.61136  6.27686   
3  WEC UN Equity  2.3828  2023 2023-06-30  1.48664  2.06024  4.24056  7.56532   
4  WEC UN Equity  2.0457  2024 2024-06-28  1.83690  2.03278  3.81054  9.50956   

   pmb  month  
0    3      6  
1    2      6  
2    4      6  
3    3      6  
4    3      6  
        date             id        mcap
0 2020-06-30  WEC UN Equity  27647.8366
1 2020-07-31  WEC UN Equity  30

### Combine three data sets 

- yyyymm is the year-month of return
- mcapym is the year-month of market cap to be used for weights in vw
- mbym is the year-month of mb portfolio

mb portfolio is formed at the end of June and this is used from July of the year till June of the following year in portfolio formation.

Note the conditions used in join "on"

- When you combine the return datafreame and the market cap dataframe, make sure that the market cap is the market cap at one month before the return month.
    - ((a.year-b.year)*12+(a.month-b.month)) =1: To make sure that lagged market cap is indeed the market cap one month prior to the return month: 
        - b (mc0sort)' year/month is the year/month for market cap and 
        - a (returns0)'s year/month is the year/month for returns
- When you combine the return dataframe and the MB portfolio dataframe, make sure that returns are included from July of the MB portfolio formation year and June of the following year (Portfolios are formed in June of each year)
    - ((a.year-c.year)*12+(a.month-c.month)) between 1 and 12: To combine June BM of year t with returns from July of year t to June of year t+1: 
        - c (mb5)'s year/month is the year/month for market-to-book ratio and 
        - a (returns0)'s year/month is the year/month for returns (returns are included from one month after untill twelve months after the market-to-book calculation month.

Check a simpler way used to retrive M/B ratios and other information using Bloomber API in the next week's notebook, FDNote6W2025.ipynb

In [74]:
### Combine the returns, market cap, and mb dataframes using SQL
# Note the conditions used in join "on" as explained above
# The conditions make sure that the mcap is the market cap one month prior to the return month
# and that returns are included from July of the MB portfolio formation year and June of the following

query='''select a.date,a.year*100+a.month as yyyymm,a.id
            ,a.ret,b.mcap,c.mb,c.pmb,
            b.year*100+b.month as mcapym,
            c.year*100+c.month as mbym
         from returns0 as a
         left join mc0sort as b on ((a.year-b.year)*12+(a.month-b.month)) =1 and a.id=b.id 
         left join mb5 as c on a.id=c.id and ((a.year-c.year)*12+(a.month-c.month)) between 1 and 12
         order by a.id,a.date'''
              
data=pysqldf(query)
print(data.head())
print(data[data.ret.notnull()].head())#print only those with non-missing ret
print(data.columns)

                         date  yyyymm                  id  ret  mcap  mb  pmb  \
0  2020-07-31 00:00:00.000000  202007  2078185D UN Equity  NaN   NaN NaN  NaN   
1  2020-08-31 00:00:00.000000  202008  2078185D UN Equity  NaN   NaN NaN  NaN   
2  2020-09-30 00:00:00.000000  202009  2078185D UN Equity  NaN   NaN NaN  NaN   
3  2020-10-30 00:00:00.000000  202010  2078185D UN Equity  NaN   NaN NaN  NaN   
4  2020-11-30 00:00:00.000000  202011  2078185D UN Equity  NaN   NaN NaN  NaN   

     mcapym  mbym  
0  202006.0   NaN  
1  202007.0   NaN  
2  202008.0   NaN  
3  202009.0   NaN  
4  202010.0   NaN  
                           date  yyyymm              id       ret  \
594  2020-07-31 00:00:00.000000  202007  ABBV UN Equity -0.021769   
595  2020-08-31 00:00:00.000000  202008  ABBV UN Equity  0.009061   
596  2020-09-30 00:00:00.000000  202009  ABBV UN Equity -0.085413   
597  2020-10-30 00:00:00.000000  202010  ABBV UN Equity -0.015108   
598  2020-11-30 00:00:00.000000  202011  ABBV UN

In [75]:
###check whether mcapym is the previous month (market cap), 
## and mbym (year/month of MB calculation) changes in July
print(data.loc[data.pmb.notnull(),['date','mcapym','mb','pmb','mbym','id','yyyymm','mcapym','mbym']].reset_index(drop=True).head(50))
#############################################################################             

                          date    mcapym       mb  pmb      mbym  \
0   2020-07-31 00:00:00.000000  202006.0  11.7642  5.0  202006.0   
1   2020-08-31 00:00:00.000000  202007.0  11.7642  5.0  202006.0   
2   2020-09-30 00:00:00.000000  202008.0  11.7642  5.0  202006.0   
3   2020-10-30 00:00:00.000000  202009.0  11.7642  5.0  202006.0   
4   2020-11-30 00:00:00.000000  202010.0  11.7642  5.0  202006.0   
5   2020-12-31 00:00:00.000000  202011.0  11.7642  5.0  202006.0   
6   2021-01-29 00:00:00.000000  202012.0  11.7642  5.0  202006.0   
7   2021-02-26 00:00:00.000000  202101.0  11.7642  5.0  202006.0   
8   2021-03-31 00:00:00.000000  202102.0  11.7642  5.0  202006.0   
9   2021-04-30 00:00:00.000000  202103.0  11.7642  5.0  202006.0   
10  2021-05-31 00:00:00.000000  202104.0  11.7642  5.0  202006.0   
11  2021-06-30 00:00:00.000000  202105.0  11.7642  5.0  202006.0   
12  2021-07-30 00:00:00.000000  202106.0  15.8284  5.0  202106.0   
13  2021-08-31 00:00:00.000000  202107.0  15.828

#### Calculate the value-weighted and equally-weighted returns 
using only thoe with available return and portfolio information


In [76]:
#/*calculate returns of the portfolio*/
##Note that in the data, mcap is the market cap at the end of the month prior to the return month
##Therefore, we do not need to use the lagged market cap.

#Include only thoes with non-missing return, pmb and market cap
#An additional condition is that pmb is not 0

data1=data.loc[(data.ret.notnull() & data.pmb.notnull() &\
                data.pmb!=0 & data.mcap.notnull()),:].reset_index(drop=True).copy()
# Check the number of rows and columns before and after filtering
print("Before filtering:", data.shape)
print("After filtering:", data1.shape)
print(data1.head())

Before filtering: (27270, 9)
After filtering: (1080, 9)
                         date  yyyymm              id       ret         mcap  \
0  2020-07-31 00:00:00.000000  202007  ABBV UN Equity -0.021769  173027.9187   
1  2020-08-31 00:00:00.000000  202008  ABBV UN Equity  0.009061  167265.0210   
2  2020-09-30 00:00:00.000000  202009  ABBV UN Equity -0.085413  169018.0800   
3  2020-10-30 00:00:00.000000  202010  ABBV UN Equity -0.015108  154581.7440   
4  2020-11-30 00:00:00.000000  202011  ABBV UN Equity  0.228907  150187.3092   

        mb  pmb    mcapym      mbym  
0  11.7642  5.0  202006.0  202006.0  
1  11.7642  5.0  202007.0  202006.0  
2  11.7642  5.0  202008.0  202006.0  
3  11.7642  5.0  202009.0  202006.0  
4  11.7642  5.0  202010.0  202006.0  


In [77]:
# We are now ready to calculate the portfolio returns
# Note that the portfolio returns are calculated in each month (yyyymm)

# calculate equally- and value-weighted returns 
# of each pmb portfolio in each month

query='''select a.yyyymm, a.pmb, sum(a.ret)/count(a.ret) as ewret,
             sum(a.ret*a.mcap)/sum(a.mcap) as vwret,count(a.ret) as numstock
         from data1 as a
         group by a.yyyymm, pmb
         order by a.yyyymm, pmb'''              
ret=pysqldf(query)
print(ret.head(30))

    yyyymm  pmb     ewret     vwret  numstock
0   202007  1.0 -0.000460  0.006232         4
1   202007  2.0 -0.054581 -0.021719         4
2   202007  3.0 -0.006112  0.014459         4
3   202007  4.0  0.054087  0.043299         4
4   202007  5.0  0.016782  0.016136         4
5   202008  1.0  0.113550  0.081012         4
6   202008  2.0 -0.006106  0.009459         4
7   202008  3.0  0.057520  0.096705         4
8   202008  4.0  0.055447  0.049175         4
9   202008  5.0  0.031494  0.044261         4
10  202009  1.0 -0.036222 -0.025361         4
11  202009  2.0 -0.108780 -0.092676         4
12  202009  3.0  0.007918 -0.035875         4
13  202009  4.0  0.012708  0.008863         4
14  202009  5.0 -0.018630 -0.011561         4
15  202010  1.0 -0.072201 -0.060021         4
16  202010  2.0 -0.039204 -0.010159         4
17  202010  3.0  0.044233  0.017343         4
18  202010  4.0  0.006273 -0.020993         4
19  202010  5.0 -0.038540 -0.028743         4
20  202011  1.0  0.272093  0.14277

In [78]:
##########################################################
###Calculate the average returns of each PMB group
###Note that below is the average of average returns
query='''select a.pmb, avg(ewret),avg(vwret),count(vwret) as num
             from ret as a
             group by pmb'''
avg=sqldf(query,locals())
print(avg)

   pmb  avg(ewret)  avg(vwret)  num
0  1.0    0.021794    0.019540   53
1  2.0    0.013375    0.015622   53
2  3.0    0.012337    0.010520   53
3  4.0    0.019331    0.018327   53
4  5.0    0.018494    0.023007   53


In [79]:
print(ret.groupby('pmb')[['ewret','vwret']].mean())
print(ret.groupby('pmb')[['ewret','vwret']].describe())

##########################################################

        ewret     vwret
pmb                    
1.0  0.021794  0.019540
2.0  0.013375  0.015622
3.0  0.012337  0.010520
4.0  0.019331  0.018327
5.0  0.018494  0.023007
    ewret                                                              \
    count      mean       std       min       25%       50%       75%   
pmb                                                                     
1.0  54.0  0.021794  0.074545 -0.128001 -0.020856  0.011632  0.075191   
2.0  54.0  0.013375  0.071947 -0.125322 -0.023934  0.013320  0.043417   
3.0  54.0  0.012337  0.062882 -0.137074 -0.022572  0.009289  0.051360   
4.0  54.0  0.019331  0.067631 -0.140069 -0.018847  0.009939  0.070006   
5.0  54.0  0.018494  0.050911 -0.078579 -0.018253  0.017289  0.050614   

              vwret                                                    \
          max count      mean       std       min       25%       50%   
pmb                                                                     
1.0  0.272093  53.0  0.01954

# Appendix 

Below are just for reference

- How to calculate returns using price information
- How to calcualte value-weighted returns using a function
- How to form MB portfolios using Python


## Appendix 1: Calculate returns using price information

In [80]:
########read the company id in the first row#########################################################
LowPEHead=pd.read_excel('./Note3w_RHistory2025_Short.xlsx', sheet_name="RHistory", skiprows=0,nrows=1,header=None)
print(LowPEHead.iloc[:,0:5].head())

    0          1          2          3          4
0 NaN  001060.KS  000640.KS  008260.KS  003300.KS


In [81]:
########read the price data                 #########################################################
####Skip the first 3 rows
LowPE=pd.read_excel('./Note3w_RHistory2025_Short.xlsx', sheet_name="RHistory", skiprows=3,header=None)
print(LowPE.iloc[:,0:5].head())

           0             1           2     3            4
0 2021-01-31  28462.707971  109223.325  2945   9766.35055
1 2021-02-28  26937.920044  112621.384  3220  10186.91110
2 2021-03-31  27399.976991  104854.392  3610  11915.88225
3 2021-04-30  26937.920044  119902.939  5800  12710.27440
4 2021-05-31  26799.302960  121359.250  5390  13037.37705


In [82]:
#change the column headings
##replace column names with the one in LowPEHead
LowPE.columns=LowPEHead.iloc[0,:]
print(LowPE.columns)

Index([        nan, '001060.KS', '000640.KS', '008260.KS', '003300.KS',
       '000240.KS', '006390.KS', '000050.KS', '005810.KS', '005610.KS',
       '000370.KS', '004560.KS', '009160.KS', '001250.KS', '000120.KS',
       '072710.KS', '002350.KS', '004250.KS', '004450.KS', '005390.KS',
       '015890.KS', '020000.KS', '019180.KS', '011070.KS', '053690.KS',
       '100220.KS', '102260.KS', '014830.KS', '079430.KS', '086280.KS',
       '089470.KS', '093050.KS', '011210.KS', '058860.KS', '129260.KS',
       '013870.KS', '264900.KS', '300720.KS', '402340.KS'],
      dtype='object', name=0)


In [83]:
##Rename the first column as date
LowPE.rename(columns={ LowPE.columns[0]: "date"}, inplace=True)
print(LowPE.shape)
print(LowPE.iloc[:,0:5].head())

(36, 39)
0       date     001060.KS   000640.KS  008260.KS    003300.KS
0 2021-01-31  28462.707971  109223.325       2945   9766.35055
1 2021-02-28  26937.920044  112621.384       3220  10186.91110
2 2021-03-31  27399.976991  104854.392       3610  11915.88225
3 2021-04-30  26937.920044  119902.939       5800  12710.27440
4 2021-05-31  26799.302960  121359.250       5390  13037.37705


In [84]:
print(LowPE.shape)

(36, 39)


In [85]:
###eliminate columns with all NA
LowPE.dropna(axis=1,how='all',inplace=True)
print(LowPE.shape)
print(LowPE.columns)
print(LowPE.head())
print(LowPE.describe().T)# by using .T you can transpose the output


(36, 11)
Index(['date', '001060.KS', '000640.KS', '008260.KS', '003300.KS', '000240.KS',
       '006390.KS', '000050.KS', '005810.KS', '005610.KS', '000370.KS'],
      dtype='object', name=0)
0       date     001060.KS   000640.KS  008260.KS    003300.KS  000240.KS  \
0 2021-01-31  28462.707971  109223.325       2945   9766.35055      15850   
1 2021-02-28  26937.920044  112621.384       3220  10186.91110      17250   
2 2021-03-31  27399.976991  104854.392       3610  11915.88225      18500   
3 2021-04-30  26937.920044  119902.939       5800  12710.27440      18600   
4 2021-05-31  26799.302960  121359.250       5390  13037.37705      20350   

0  006390.KS  000050.KS     005810.KS  005610.KS  000370.KS  
0      36500      12550  17888.906778      70000       3380  
1      38700      12050  16333.341500      71200       3780  
2      44000      13200  15300.007650      72500       4880  
3      41250      13150  18866.676100      70300       4775  
4      41400      15000  19300.0096

In [86]:
######### transpose the data #######################################################################
trans=pd.melt(LowPE,id_vars=['date'],var_name='id',value_name='price')
print(trans.head())
trans.sort_values(['id','date'],inplace=True)
trans.reset_index(drop=True,inplace=True)
print(trans.head())

        date         id         price
0 2021-01-31  001060.KS  28462.707971
1 2021-02-28  001060.KS  26937.920044
2 2021-03-31  001060.KS  27399.976991
3 2021-04-30  001060.KS  26937.920044
4 2021-05-31  001060.KS  26799.302960
        date         id    price
0 2021-01-31  000050.KS  12550.0
1 2021-02-28  000050.KS  12050.0
2 2021-03-31  000050.KS  13200.0
3 2021-04-30  000050.KS  13150.0
4 2021-05-31  000050.KS  15000.0


### One can create new columns as follows

In [87]:
#lagged values for each id
trans[['lprice','ldate']]=trans.groupby(['id'])[['price','date']].shift(1)
trans.reset_index(drop=True,inplace=True)# reset the index and drop old index values.  In addition, replace the original
print(trans.head(50))

         date         id    price   lprice      ldate
0  2021-01-31  000050.KS  12550.0      NaN        NaT
1  2021-02-28  000050.KS  12050.0  12550.0 2021-01-31
2  2021-03-31  000050.KS  13200.0  12050.0 2021-02-28
3  2021-04-30  000050.KS  13150.0  13200.0 2021-03-31
4  2021-05-31  000050.KS  15000.0  13150.0 2021-04-30
5  2021-06-30  000050.KS  14250.0  15000.0 2021-05-31
6  2021-07-31  000050.KS  13600.0  14250.0 2021-06-30
7  2021-08-31  000050.KS  13650.0  13600.0 2021-07-31
8  2021-09-30  000050.KS  13800.0  13650.0 2021-08-31
9  2021-10-31  000050.KS  13300.0  13800.0 2021-09-30
10 2021-11-30  000050.KS  13000.0  13300.0 2021-10-31
11 2021-12-31  000050.KS  13750.0  13000.0 2021-11-30
12 2022-01-31  000050.KS  14000.0  13750.0 2021-12-31
13 2022-02-28  000050.KS  14650.0  14000.0 2022-01-31
14 2022-03-31  000050.KS  15950.0  14650.0 2022-02-28
15 2022-04-30  000050.KS  15900.0  15950.0 2022-03-31
16 2022-05-31  000050.KS  15350.0  15900.0 2022-04-30
17 2022-06-30  000050.KS  12

In [88]:
#calculate returns
trans['return']=(trans['price']-trans['lprice'])/trans['lprice']
#create aYearMonth columns
trans['yyyymm']=trans['date'].dt.year*100+trans['date'].dt.month

### One should be careful not to mistakenly calcualted returns using prices of different companies
- It can happen when ID changes
- In addition, returns may not represent the return over the intended time interval (e.g., a month) when there are missing values

In [89]:
#  Find the differences in the number of months between the lagged observation and the current one
# and eliminate those with more than one month difference

trans['diffm']=(trans['date'].dt.year-trans['ldate'].dt.year)*12+\
                (trans['date'].dt.month-trans['ldate'].dt.month)
print(trans.head())
print(trans.dtypes)#types of each column values

        date         id    price   lprice      ldate    return  yyyymm  diffm
0 2021-01-31  000050.KS  12550.0      NaN        NaT       NaN  202101    NaN
1 2021-02-28  000050.KS  12050.0  12550.0 2021-01-31 -0.039841  202102    1.0
2 2021-03-31  000050.KS  13200.0  12050.0 2021-02-28  0.095436  202103    1.0
3 2021-04-30  000050.KS  13150.0  13200.0 2021-03-31 -0.003788  202104    1.0
4 2021-05-31  000050.KS  15000.0  13150.0 2021-04-30  0.140684  202105    1.0
date      datetime64[ns]
id                object
price            float64
lprice           float64
ldate     datetime64[ns]
return           float64
yyyymm             int32
diffm            float64
dtype: object


#### If the gap between the current and the previous month is greater than one month, set the return as missing

In [90]:
print(trans['return'].describe())

count    350.000000
mean       0.005756
std        0.108299
min       -0.315927
25%       -0.050392
50%       -0.002451
75%        0.052006
max        0.831897
Name: return, dtype: float64


In [91]:
trans.loc[(trans['diffm']!=1.0),'return']=np.nan
print(trans['return'].describe())

count    350.000000
mean       0.005756
std        0.108299
min       -0.315927
25%       -0.050392
50%       -0.002451
75%        0.052006
max        0.831897
Name: return, dtype: float64


## Use "groupby" to correctly calculate returns using prices

Below, you will find that **ret** includes wrong returns when ID changes to a different one while **ret1** and **ret2** correctly calculate returns even when ID changes (missing in those cases) by using **groupby(['id'])**

In [92]:
#Calculate returns in a simpler way
#Problem with a new ID
trans['ret']=(trans['price']-trans['price'].shift(1))/trans['price'].shift(1)
print(trans.ret.describe())

count    359.000000
mean       0.085739
std        1.405548
min       -0.811821
25%       -0.053413
50%       -0.003135
75%        0.053752
max       25.968722
Name: ret, dtype: float64


In [93]:
#The above code does not work correctly for the first observation of each ID
#because it uses the previous observation of the same ID
#Therefore, we need to use groupby to calculate the returns for each ID separately
#This will ensure that the return is calculated based on the previous observation of the same ID

#Correct one
#############################################
trans['ret1']=(trans['price']-trans.groupby(['id'])['price'].shift(1))\
                /trans.groupby(['id'])['price'].shift(1)
print(trans.ret1.describe())

count    350.000000
mean       0.005756
std        0.108299
min       -0.315927
25%       -0.050392
50%       -0.002451
75%        0.052006
max        0.831897
Name: ret1, dtype: float64


#### One can use ".pct_change()" to find the return with "groupby"

In [94]:

#trans['ret2']=trans.price.pct_change(fill_method=None)
trans['ret2']=trans.groupby(['id']).price.pct_change()
print(trans.ret2.describe())
print((trans.ret2-trans.ret1).describe())

print(trans[['id','date','ret','ret1','ret2']].head(50))

count    350.000000
mean       0.005756
std        0.108299
min       -0.315927
25%       -0.050392
50%       -0.002451
75%        0.052006
max        0.831897
Name: ret2, dtype: float64
count    3.500000e+02
mean    -3.035766e-19
std      4.697233e-17
min     -1.110223e-16
25%     -3.382711e-17
50%      0.000000e+00
75%      3.035766e-17
max      1.110223e-16
dtype: float64
           id       date       ret      ret1      ret2
0   000050.KS 2021-01-31       NaN       NaN       NaN
1   000050.KS 2021-02-28 -0.039841 -0.039841 -0.039841
2   000050.KS 2021-03-31  0.095436  0.095436  0.095436
3   000050.KS 2021-04-30 -0.003788 -0.003788 -0.003788
4   000050.KS 2021-05-31  0.140684  0.140684  0.140684
5   000050.KS 2021-06-30 -0.050000 -0.050000 -0.050000
6   000050.KS 2021-07-31 -0.045614 -0.045614 -0.045614
7   000050.KS 2021-08-31  0.003676  0.003676  0.003676
8   000050.KS 2021-09-30  0.010989  0.010989  0.010989
9   000050.KS 2021-10-31 -0.036232 -0.036232 -0.036232
10  000050.KS 202

## Appendix 2: How to calculate value-weighted returns using a function


In [95]:
####Using pandas##########################################
# One can define a custom function to calculate value weighted return
#df is a dataframe, avg_name is the column name to be used for average calculation
#weight_name is the column name to be used as the weight in value-weighted average

def wavg(df, avg_name, weight_name):
    df = df[df[avg_name].notna() & df[weight_name].notna()].copy()  # Filter out NaN values
    if df.empty:
        return np.nan
    d = df[avg_name]
    w = df[weight_name]
    
    try:
        w_sum = w.sum()
        if w_sum == 0 or np.isnan(w_sum) or np.isinf(w_sum):
            return np.nan
        else:
            return (d * w).sum() / w_sum
    except ZeroDivisionError:
        return np.nan

# Calculate Weighted Average Returns 
# to use the custom function defined above for each group of yyyymm and pmb
# one can use the apply method with a lambda function

data1.groupby(['yyyymm', 'pmb'])[['ret', 'mcap']].apply(lambda x: wavg(x, 'ret', 'mcap')).reset_index()
data1['icount']=np.where(data1['ret'].notnull(),1,0)#icount is set to be 1 if ret is not null.  Otherwise, set to be zero
print(data1.icount.describe())

count    1080.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: icount, dtype: float64


In [96]:
ret['vwret1']= data1.groupby(['yyyymm','pmb'])[['ret','mcap']].apply(wavg, 'ret','mcap').reset_index().iloc[:,2]

# Equally-weighted returns are calculated by using the same function
# where the weight is same for all (i.e., 1) in each group

ret['ewret1'] = data1.groupby(['yyyymm','pmb'])[['ret','icount']].apply(wavg, 'ret','icount').reset_index().iloc[:,2]
print(ret.head(20))

###Compare the results calculated using SQL above
print((ret.ewret-ret.ewret1).describe())
print((ret.vwret-ret.vwret1).describe())

    yyyymm  pmb     ewret     vwret  numstock    vwret1    ewret1
0   202007  1.0 -0.000460  0.006232         4  0.006232 -0.000460
1   202007  2.0 -0.054581 -0.021719         4 -0.021719 -0.054581
2   202007  3.0 -0.006112  0.014459         4  0.014459 -0.006112
3   202007  4.0  0.054087  0.043299         4  0.043299  0.054087
4   202007  5.0  0.016782  0.016136         4  0.016136  0.016782
5   202008  1.0  0.113550  0.081012         4  0.081012  0.113550
6   202008  2.0 -0.006106  0.009459         4  0.009459 -0.006106
7   202008  3.0  0.057520  0.096705         4  0.096705  0.057520
8   202008  4.0  0.055447  0.049175         4  0.049175  0.055447
9   202008  5.0  0.031494  0.044261         4  0.044261  0.031494
10  202009  1.0 -0.036222 -0.025361         4 -0.025361 -0.036222
11  202009  2.0 -0.108780 -0.092676         4 -0.092676 -0.108780
12  202009  3.0  0.007918 -0.035875         4 -0.035875  0.007918
13  202009  4.0  0.012708  0.008863         4  0.008863  0.012708
14  202009

## Appendix 3: The following are alternative ways to find out the MB portfolios in Python

In [97]:
#The following are alternative ways to find the portfolios in Python
#assign MB portfolio value - define a function

# Define a function to assign quintiles based on the mb value
def quintile(x):#note that the input is a dataframe
    
    if x['mb'] <= x['quint20']: return 1
    elif x['mb'] <= x['quint40']: return 2
    elif x['mb'] <= x['quint60']: return 3
    elif x['mb'] <= x['quint80']: return 4
    else: return 5
mb3['pmb']=mb3.apply(quintile, axis=1)#apply the function and call the result as pmb
mb3['pmb']=np.where(mb3['mb'].isnull(),0,mb3['pmb'])#If mb is null assign 0

#check
print(mb3.columns)
print(mb3.head())
#print(mb3[mb3['mb'].isnull()].head())
#print(mb3[mb3['mb'].notnull()].head())
#print(mb3.mb.describe())
#print(mb3[mb3.pmb>0].pmb.describe())
print(mb3.groupby(['pmb']).mb.describe())

Index(['id', 'mb', 'year', 'date', 'quint20', 'quint40', 'quint60', 'quint80',
       'pmb'],
      dtype='object')
              id      mb  year                        date  quint20  quint40  \
0  WEC UN Equity  2.6626  2020  2020-06-30 00:00:00.000000  1.02410  1.67098   
1  WEC UN Equity  2.5905  2021  2021-06-30 00:00:00.000000  1.79258  2.92506   
2  WEC UN Equity  2.8117  2022  2022-06-30 00:00:00.000000  1.26804  1.97252   
3  WEC UN Equity  2.3828  2023  2023-06-30 00:00:00.000000  1.48664  2.06024   
4  WEC UN Equity  2.0457  2024  2024-06-28 00:00:00.000000  1.83690  2.03278   

   quint60  quint80  pmb  
0  3.24376  5.27006    3  
1  4.31894  7.83904    2  
2  2.61136  6.27686    4  
3  4.24056  7.56532    3  
4  3.81054  9.50956    3  
     count       mean         std       min        25%       50%        75%  \
pmb                                                                           
0      0.0        NaN         NaN       NaN        NaN       NaN        NaN   
1   

In [98]:
######################################################################
# An easier way using pandas' qcut to form deciles in each year
######################################################################
print(mb3.shape)
mb4=mb3[mb3['mb'].notnull()].copy()# Get rid of ones with missing mb values
print(mb4.shape)

#lambda "arguments" : "expression"
#The "expression" with "argument" is executed and the result is returned:


#".transform(lambda x: f(x))" is similar to .apply(lambda x: f(x)) but keep the original shape
# while apply does not
mb4.loc[:,'mbquint']=mb4.groupby(['year'])['mb'].transform(
                     lambda x: pd.qcut(x, 5, labels=range(1,6)))

print(mb3.columns)
print(mb4.columns)

(2525, 9)
(100, 9)
Index(['id', 'mb', 'year', 'date', 'quint20', 'quint40', 'quint60', 'quint80',
       'pmb'],
      dtype='object')
Index(['id', 'mb', 'year', 'date', 'quint20', 'quint40', 'quint60', 'quint80',
       'pmb', 'mbquint'],
      dtype='object')


In [99]:
print(mb4[['pmb','mbquint']])
print(mb4.dtypes)


    pmb mbquint
0     3       3
1     2       2
2     4       4
3     3       3
4     3       3
..  ...     ...
95    1       1
96    5       5
97    1       1
98    5       5
99    5       5

[100 rows x 2 columns]
id           object
mb          float64
year          int64
date         object
quint20     float64
quint40     float64
quint60     float64
quint80     float64
pmb           int64
mbquint    category
dtype: object
