**TableⅤ: Fama-MacBeth Regressions**

For this table, dataset required: at_df_yret.csv, sic.csv, table_V.xlsx
- Features = LogEmissions, LogME, LogB/M, I/K, roe, TAN, Lev
- y = company monthly excess return
- Fixed Effect: Industry (sic)

In [None]:
from statsmodels.regression.linear_model import OLS
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import mstats

In [None]:
data = pd.read_excel("table_V.xlsx")
sic = pd.read_csv("sic.csv")
ret = pd.read_csv('at_df_yret.csv')

In [None]:
data

Unnamed: 0,gvkey,YEAR,p_at,logemissions,Unnamed: 4,bm,roe,IK,TAN,Lev,eq,LogME,LogB_M
0,4819,1991,113.556179,2.055211,,1.04000,0.06700,0.207058,0.593225,0.53775,71.017,1.851362,0.017033
1,5109,1991,890.784850,2.949773,,1.24900,0.00450,0.155321,0.249599,0.42025,140.834,2.148708,0.096562
2,5275,1991,344.646944,2.537374,,1.27225,0.03500,0.128585,1.038297,0.76975,54.910,1.739651,0.104572
3,7281,1991,414.392345,2.617412,,0.51550,0.11825,0.309295,0.552865,0.24300,26.751,1.427340,-0.287771
4,8958,1991,1038.636391,3.016464,,0.57300,0.18225,0.389263,0.291795,0.39575,30.675,1.486785,-0.241845
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14600,7921,2022,9.049658,0.956632,,0.18700,0.22100,0.112850,0.402235,0.39900,2294.375,3.360664,-0.728158
14601,10618,2022,0.000000,,,0.12700,0.34100,0.221491,0.386654,0.62000,1351.701,3.130881,-0.896196
14602,20232,2022,7.049857,0.848180,,0.13600,0.27800,0.203297,0.214927,0.48600,4161.000,3.619198,-0.866461
14603,126554,2022,4.832372,0.684160,,0.13100,0.23100,0.232800,0.330863,0.49600,5305.000,3.724685,-0.882729


In [None]:
data = data.drop(columns=['Unnamed: 4'])
data = data.dropna()
print(data.info())
print(sic.info())
print(ret.info())

<class 'pandas.core.frame.DataFrame'>
Index: 12774 entries, 0 to 14604
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gvkey         12774 non-null  int64  
 1   YEAR          12774 non-null  int64  
 2   p_at          12774 non-null  float64
 3   logemissions  12774 non-null  float64
 4   bm            12774 non-null  float64
 5   roe           12774 non-null  float64
 6   IK            12774 non-null  float64
 7   TAN           12774 non-null  float64
 8   Lev           12774 non-null  float64
 9   eq            12774 non-null  float64
 10  LogME         12774 non-null  float64
 11  LogB_M        12774 non-null  float64
dtypes: float64(10), int64(2)
memory usage: 1.3 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14605 entries, 0 to 14604
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gvkey               14605 

In [None]:
# merge sic, year return, and other company-level features
data = data.merge(sic[['fyear','sic','gvkey']], left_on=['YEAR','gvkey'], right_on=['fyear','gvkey'], how='inner')
data = data.drop(columns=['YEAR'])
data = data.merge(ret[['gvkey','fyear','yret','rf','ret_prem']],on=['gvkey','fyear'], how='inner')
data_merge = data.drop(columns=['p_at','bm','eq','yret','rf'])
data_merge

Unnamed: 0,gvkey,logemissions,roe,IK,TAN,Lev,LogME,LogB_M,fyear,sic,ret_prem
0,109084,1.853148,0.21125,0.254015,0.824835,0.20575,2.194786,0.075547,1998,2070,-22.006991
1,105365,2.357612,-0.04600,0.281339,2.165262,0.22250,3.038122,-0.260625,1998,6798,1173.139650
2,110533,2.790533,0.18400,0.375512,1.444591,0.59850,2.172906,-0.238636,1998,3590,-108.025093
3,110566,0.481130,0.14950,0.202412,0.481943,0.62125,2.478711,0.283866,1998,2810,-78.435404
4,110685,-1.605135,0.13600,0.190240,0.303811,0.71925,2.477084,-0.472370,1998,3812,95.095466
...,...,...,...,...,...,...,...,...,...,...,...
1224,166482,2.061344,0.09900,0.166972,0.481765,0.55300,2.825621,-0.326058,2022,3490,-16.834232
1225,179657,0.822771,0.15400,0.124351,0.288331,0.71400,3.034588,-0.291579,2022,3530,-5.143472
1226,187740,0.118276,0.66800,0.177675,0.183526,0.46400,2.925698,-0.634512,2022,3674,19.385286
1227,126554,0.684160,0.23100,0.232800,0.330863,0.49600,3.724685,-0.882729,2022,3826,10.374597


In [None]:
# Winsorize and Normalize
features = list(data_merge.columns)
features.remove('fyear')
features.remove('gvkey')
features.remove('sic')
for col in list(features):
    data_merge[col] = mstats.winsorize(data_merge[col], limits=[0.01, 0.01])
    data_merge[col] = (data_merge[col] - data_merge[col].mean()) / data_merge[col].std()
data_merge

Unnamed: 0,gvkey,logemissions,roe,IK,TAN,Lev,LogME,LogB_M,fyear,sic,ret_prem
0,109084,0.417081,0.578367,1.014236,-0.205510,-1.937495,-1.076085,1.130266,1998,2070,-0.805249
1,105365,0.720542,-0.490997,1.314039,0.776038,-1.848906,0.347550,0.102550,1998,6798,4.501489
2,110533,0.980965,0.465091,2.347347,0.248315,0.139727,-1.113020,0.169774,1998,3590,-2.108661
3,110566,-0.408256,0.321678,0.448018,-0.456598,0.260050,-0.596792,1.767120,1998,2810,-1.660296
4,110685,-1.663249,0.265559,0.314464,-0.587038,0.778364,-0.599538,-0.544776,1998,3812,0.969177
...,...,...,...,...,...,...,...,...,...,...,...
1224,166482,0.542322,0.111754,0.059158,-0.456729,-0.100919,-0.011173,-0.097484,2022,3490,-0.726868
1225,179657,-0.202742,0.340384,-0.408497,-0.598373,0.750597,0.341586,0.007922,2022,3530,-0.549720
1226,187740,-0.626531,2.477033,0.176595,-0.675118,-0.571632,0.157767,-1.040461,2022,3674,-0.178042
1227,126554,-0.286123,0.660465,0.781451,-0.567229,-0.402387,1.506539,-1.799286,2022,3826,-0.314579


In [48]:
# Create dummy variables for SIC, treating SIC codes as categorical variables
df_with_dummies = pd.get_dummies(data_merge, columns=['sic'], drop_first=True,dtype=int)
df_with_dummies.head()

Unnamed: 0,gvkey,logemissions,roe,IK,TAN,Lev,LogME,LogB_M,fyear,ret_prem,...,sic_4581,sic_4911,sic_4923,sic_4955,sic_5010,sic_5172,sic_6411,sic_6798,sic_7359,sic_8731
0,109084,0.417081,0.578367,1.014236,-0.20551,-1.937495,-1.076085,1.130266,1998,-0.805249,...,0,0,0,0,0,0,0,0,0,0
1,105365,0.720542,-0.490997,1.314039,0.776038,-1.848906,0.34755,0.10255,1998,4.501489,...,0,0,0,0,0,0,0,1,0,0
2,110533,0.980965,0.465091,2.347347,0.248315,0.139727,-1.11302,0.169774,1998,-2.108661,...,0,0,0,0,0,0,0,0,0,0
3,110566,-0.408256,0.321678,0.448018,-0.456598,0.26005,-0.596792,1.76712,1998,-1.660296,...,0,0,0,0,0,0,0,0,0,0
4,110685,-1.663249,0.265559,0.314464,-0.587038,0.778364,-0.599538,-0.544776,1998,0.969177,...,0,0,0,0,0,0,0,0,0,0


In [61]:
# Perform Annual Cross-Sectional Regressions for 2 specifications
results1 = []
results2 = []
NW1 = []
NW2 = []

for year in df_with_dummies['fyear'].unique():
  year_data = df_with_dummies[df_with_dummies['fyear'] == year].copy()
  X = year_data.drop(columns=['ret_prem','fyear','gvkey'])
  df_with_dummies1 = X.drop(columns=['TAN','Lev'])
  df_with_dummies2 = X
  X1 = sm.add_constant(df_with_dummies1)
  X2 = sm.add_constant(df_with_dummies2)
  y = year_data['ret_prem']
  # regression for each specification
  model1 = sm.OLS(y, X1).fit(cov_type='HAC', cov_kwds={'maxlags': 4})
  results1.append(model1.params)
  model2 = sm.OLS(y, X2).fit(cov_type='HAC', cov_kwds={'maxlags': 4})
  results2.append(model2.params)
  NW1.append(model1.bse)
  NW2.append(model2.bse)


In [62]:
# Compute the Mean Coefficients and Newey-West t-stats
results1 = pd.DataFrame(results1)
results2 = pd.DataFrame(results2)
mean_params1 = results1.mean()
mean_params2 = results2.mean()
NW1 = pd.DataFrame(NW1)
NW2 = pd.DataFrame(NW2)
mean_se1 = NW1.mean()
mean_se2 = NW2.mean()
t1 = mean_params1/mean_se1
t2 = mean_params2/mean_se2

In [65]:
summary1 = pd.DataFrame({
    'Mean Coefficient': mean_params1,
    't-Statistics': t1
})
print(summary1)

              Mean Coefficient  t-Statistics
const                 0.269628  9.030843e-01
logemissions         -0.150289 -5.221682e-01
roe                  -0.152882 -8.020526e-01
IK                    0.103607  5.277732e-01
LogME                -0.088824 -5.016951e-01
...                        ...           ...
sic_5172              0.091893  4.311039e-01
sic_6411              0.179778  9.415174e+14
sic_6798              0.016183  1.591732e-01
sic_7359             -0.550012 -1.348927e+00
sic_8731             -0.723441 -1.160824e+00

[73 rows x 2 columns]


In [66]:
summary2 = pd.DataFrame({
    'Mean Coefficient': mean_params2,
    't-Statistics': t2
})
print(summary2)

              Mean Coefficient  t-Statistics
const                 0.374692  2.904376e+00
logemissions         -0.389460 -2.898646e+00
roe                  -0.094533 -7.200738e-01
IK                    0.114373  8.037289e-01
TAN                  -0.792713 -3.210419e+00
...                        ...           ...
sic_5172             -0.032285 -1.409111e-01
sic_6411              0.171896  6.512052e+14
sic_6798              2.292019  1.175261e+02
sic_7359             -0.393413 -8.478799e+00
sic_8731             -1.111264 -9.607350e+00

[75 rows x 2 columns]


In [None]:
# def fama_macbeth_regression(data):
#     # Unique time periods
#     time_periods = data['time'].unique()
#     results = []

#     for period in time_periods:
#         period_data = data[data['time'] == period]

In [None]:
# # Dependent and independent variables
# y = data['logemissions']
# X = data[[ 'eq','LogME', 'LogB_M', 'IK', 'roe', 'TAN', 'Lev']]
# names=data['gvkey']
# time=data['YEAR']

In [None]:
# df=[]
# for t in time:
#     for i in names:
#         df.append([t,i,X,y])

In [None]:
# cc=pd.DataFrame(df)
# cc2=pd.DataFrame(df,columns)

In [None]:
# ids=data['gvkey'].unique()
# results_df = pd.DataFrame()
# df2=data.loc[:,['gvkey','YEAR','logemissions', 'eq','LogME', 'LogB_M', 'IK', 'roe', 'TAN', 'Lev']]
# results_df = pd.DataFrame()
# df2.head()
# df2.dropna()


In [None]:
# # 将当前公司的结果存储，例如存储R²
#     results_df = results_df.append({
#         'CompanyID': company_id,
#         'R_squared': results.rsquared,
#         'Coefficients': results.params
#     }, ignore_index=True)

In [None]:
# df2.replace([np.inf, -np.inf], np.nan, inplace=True)
# y = df2[['logemissions']]
# X = df2[['eq', 'LogME', 'LogB_M', 'IK', 'roe', 'TAN', 'Lev']]
# X = sm.add_constant(X)
# model = sm.OLS(y, X).fit()

In [None]:
# results_df = pd.DataFrame()
# for gvkey in data['gvkey'].unique():
#     company_data = df2[df2['gvkey'] == gvkey]
#     for year in company_data['YEAR'].unique():
#         yearly_data = company_data[company_data['YEAR'] == year]
#         # Dependent and independent variables
#         y = yearly_data['logemissions']
#         X = yearly_data[['eq', 'LogME', 'LogB_M', 'IK', 'roe', 'TAN', 'Lev']]
#         X = sm.add_constant(X)  # Add a constant to the model
#                 # Regression model


In [None]:
# model = sm.OLS(y, X).fit()
# results_df = results_df.append({'CompanyID': gvkey,'Year': year,'R_squared': model.rsquared, 'Coefficients': model.params}, ignore_index=True)
