<a href="https://colab.research.google.com/github/julianikulski/director-experience/blob/main/analysis/regression_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Regression analysis
The datasets created by the `exploratory data analysis` notebook are used in this notebook to conduct fixed effects regression analysis.

In [1]:
# connecting to Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# install the linearmodels package
!pip install linearmodels &> /dev/null


In [3]:
import pandas as pd
import numpy as np

import linearmodels as lm
from linearmodels.panel.data import PanelData
from sklearn.linear_model import HuberRegressor
import statsmodels.api as sm



In [4]:
# read in the datasets
env_df = pd.read_csv('/content/drive/My Drive/director-csr/env_score_dataset.csv')
soc_df = pd.read_csv('/content/drive/My Drive/director-csr/soc_score_dataset.csv')
env_df.drop(columns=['Unnamed: 0'], inplace=True)
soc_df.drop(columns=['Unnamed: 0'], inplace=True)
env_df.head()


Unnamed: 0,ticker,comp_name,isin,env_score,year_score,independent_dirs,board_size,roa,leverage,slack,board_tenure,control_year,supersector,environmental_exp,environmental_comm_exp,ceo_duality,banks,basic resources,chemicals,construction and materials,consumer products and services,energy,financial services,"food, beverage and tobacco",health care,industrial goods and services,insurance,media,"personal care, drug and grocery stores",real estate,retailers,technology,telecommunications,travel and leisure,utilities,2013,2014,2015,2016
0,mmm,3m co,us88579y1010,83.891263,2012,90.0,10.0,14.65,16.88,20.07,9.38,2011,industrial goods and services,0.4,0.0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,abt,abbott laboratories,us0028241000,52.133087,2012,88.24,13.0,9.2,25.27,23.09,8.91,2011,health care,0.461538,0.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,atvi,activision blizzard inc,us00507v1098,0.0,2012,23.08,12.0,8.02,0.0,35.54,7.29,2011,consumer products and services,0.090909,0.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,adbe,adobe inc,us00724f1012,80.082004,2012,90.0,9.0,10.23,16.84,34.79,13.23,2011,technology,0.111111,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,aap,advance auto parts inc,us00751y1064,7.411255,2012,81.82,10.0,11.72,11.82,10.37,5.36,2011,retailers,0.181818,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [5]:
# get the names of the industries
industries = list(env_df['supersector'].unique())
industries


['industrial goods and services',
 'health care',
 'consumer products and services',
 'technology',
 'retailers',
 'utilities',
 'insurance',
 'real estate',
 'chemicals',
 'financial services',
 'food, beverage and tobacco',
 'travel and leisure',
 'personal care, drug and grocery stores',
 'automobiles and parts',
 'telecommunications',
 'basic resources',
 'energy',
 'banks',
 'media',
 'construction and materials']

In [6]:
def prep_data(df, dep_var, ind_var, effect):
    '''
    Function to split the data into X and y and create indices
    to prepare for estimation
    Args: df = dataframe
          dep_var = str; dependent variable name
          ind_var = str; independent variable name
          effect = str; name of effect
    Returns: X, y = dataframes
    '''
    if 'comm' in ind_var:
        var_to_be_removed = ind_var.replace('_comm', '')
    else:
        var_to_be_removed = ind_var.replace('exp', '')
        var_to_be_removed = var_to_be_removed+'comm_exp'

    if effect == 'industry':
        index_col = 'supersector'
        effect_to_be_removed = 'ticker'
    elif effect == 'entity':
        index_col = 'ticker'
        effect_to_be_removed = 'supersector'

    # define the variables
    df.rename(columns={'year_score':'time'}, inplace=True)
    new_df = df.set_index([index_col, 'time']).copy()
    y = new_df[[dep_var]].copy()
    # add all columns to be dropped together
    dummies = list(new_df.columns)[14:]
    dummies.extend(['comp_name', 'isin', dep_var,
                    var_to_be_removed, 'control_year'])
    dummies.append(effect_to_be_removed)
    X = new_df.drop(columns=dummies) 

    return X, y


In [7]:
# define X and y
X_soc_exp, y_soc_exp = prep_data(soc_df, 'soc_score', 'social_exp', 'industry')
# add a constant
X_soc_exp = sm.add_constant(X_soc_exp)
X_soc_comm, y_soc_comm = prep_data(soc_df, 'soc_score', 'social_comm_exp', 'industry')
# add a constant
X_soc_comm = sm.add_constant(X_soc_comm)
X_env_exp, y_env_exp = prep_data(env_df, 'env_score', 'environmental_exp', 'industry')
# add a constant
X_env_exp = sm.add_constant(X_env_exp)
X_env_comm, y_env_comm = prep_data(env_df, 'env_score', 'environmental_comm_exp', 'industry')
# add a constant
X_env_comm = sm.add_constant(X_env_comm)


In [8]:
# shapes of X and y
print(X_env_comm.shape)
print(y_env_comm.shape)


(2201, 9)
(2201, 1)


An explanation of how the covariance type `clustered` is calculated by linearmodels is given here: https://github.com/bashtage/linearmodels/blob/5c2b663314d6bd337eb601afb378927468fc9a5e/linearmodels/panel/covariance.py

In [9]:
# fixed effects regression 
# time and entity
mod = lm.PanelOLS(y_env_comm, X_env_comm, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,env_score,R-squared:,0.1803
Estimator:,PanelOLS,R-squared (Between):,-0.1899
No. Observations:,2201,R-squared (Within):,0.1813
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1174
Time:,14:38:02,Log-likelihood,-1.009e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,59.648
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-24.091,11.133,-2.1639,0.0306,-45.923,-2.2584
independent_dirs,0.5152,0.0959,5.3741,0.0000,0.3272,0.7031
board_size,3.1347,0.6649,4.7147,0.0000,1.8308,4.4386
roa,0.1423,0.1253,1.1356,0.2562,-0.1034,0.3880
leverage,-0.0403,0.0700,-0.5757,0.5649,-0.1776,0.0970
slack,-0.2806,0.0924,-3.0370,0.0024,-0.4618,-0.0994
board_tenure,-0.2136,0.3330,-0.6413,0.5214,-0.8666,0.4395
environmental_comm_exp,38.997,11.832,3.2959,0.0010,15.794,62.200
ceo_duality,-5.0408,1.9273,-2.6155,0.0090,-8.8203,-1.2612


In [10]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_env_comm.shape[0] - 1) / (X_env_comm.shape[0] - (X_env_comm.shape[1]-1) -1))
adjusted_r2


0.18192336269500148

In [11]:
# fixed effects regression 
# time and entity
mod = lm.PanelOLS(y_env_exp, X_env_exp, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,env_score,R-squared:,0.1648
Estimator:,PanelOLS,R-squared (Between):,-0.3203
No. Observations:,2201,R-squared (Within):,0.1653
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.0849
Time:,14:38:03,Log-likelihood,-1.011e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,53.485
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-28.111,10.629,-2.6447,0.0082,-48.956,-7.2663
independent_dirs,0.5261,0.0902,5.8328,0.0000,0.3492,0.7030
board_size,3.2760,0.6501,5.0394,0.0000,2.0012,4.5508
roa,0.1420,0.1256,1.1311,0.2581,-0.1042,0.3883
leverage,-0.0432,0.0701,-0.6156,0.5382,-0.1807,0.0944
slack,-0.2948,0.0958,-3.0762,0.0021,-0.4827,-0.1069
board_tenure,-0.2234,0.3375,-0.6619,0.5081,-0.8853,0.4385
environmental_exp,18.163,6.4685,2.8079,0.0050,5.4775,30.848
ceo_duality,-4.0083,2.0397,-1.9651,0.0495,-8.0082,-0.0083


In [12]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_env_exp.shape[0] - 1) / (X_env_exp.shape[0] - (X_env_exp.shape[1]-1) -1))
adjusted_r2


0.16595216537008417

In [13]:
# fixed effects regression 
# time and entity
mod = lm.PanelOLS(y_soc_exp, X_soc_exp, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,soc_score,R-squared:,0.1881
Estimator:,PanelOLS,R-squared (Between):,-0.1703
No. Observations:,2201,R-squared (Within):,0.1874
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1323
Time:,14:38:03,Log-likelihood,-9377.4
Cov. Estimator:,Clustered,,
,,F-statistic:,62.833
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-3.5981,6.8497,-0.5253,0.5994,-17.031,9.8346
independent_dirs,0.4488,0.0645,6.9595,0.0000,0.3223,0.5753
board_size,2.0490,0.3018,6.7904,0.0000,1.4573,2.6408
roa,0.1896,0.0940,2.0172,0.0438,0.0053,0.3740
leverage,-0.0120,0.0556,-0.2151,0.8297,-0.1210,0.0971
slack,-0.1657,0.0654,-2.5326,0.0114,-0.2941,-0.0374
board_tenure,-0.3523,0.1733,-2.0325,0.0422,-0.6923,-0.0124
social_exp,17.966,3.8292,4.6919,0.0000,10.457,25.475
ceo_duality,-2.8959,1.4486,-1.9991,0.0457,-5.7366,-0.0552


In [14]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_soc_exp.shape[0] - 1) / (X_soc_exp.shape[0] - (X_soc_exp.shape[1]-1) -1))
adjusted_r2


0.18812297005401551

In [15]:
# fixed effects regression 
# time and entity
mod = lm.PanelOLS(y_soc_comm, X_soc_comm, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,soc_score,R-squared:,0.2070
Estimator:,PanelOLS,R-squared (Between):,-0.1166
No. Observations:,2201,R-squared (Within):,0.2066
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1582
Time:,14:38:03,Log-likelihood,-9351.4
Cov. Estimator:,Clustered,,
,,F-statistic:,70.792
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,2.1283,7.1177,0.2990,0.7650,-11.830,16.087
independent_dirs,0.4346,0.0625,6.9574,0.0000,0.3121,0.5571
board_size,1.9678,0.3315,5.9356,0.0000,1.3177,2.6180
roa,0.2002,0.0876,2.2866,0.0223,0.0285,0.3719
leverage,-0.0145,0.0546,-0.2655,0.7907,-0.1215,0.0925
slack,-0.1525,0.0652,-2.3372,0.0195,-0.2804,-0.0245
board_tenure,-0.3758,0.1724,-2.1797,0.0294,-0.7140,-0.0377
social_comm_exp,28.069,4.8337,5.8068,0.0000,18.589,37.548
ceo_duality,-3.6763,1.3072,-2.8123,0.0050,-6.2399,-1.1128


In [16]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_soc_comm.shape[0] - 1) / (X_soc_comm.shape[0] - (X_soc_comm.shape[1]-1) - 1))
adjusted_r2


0.2073961000641916

# Robustness check



## Removed outliers

Outliers were truncated at the 1st and 99th percentile

In [17]:
# read in the datasets without outliers
env_df = pd.read_csv('/content/drive/My Drive/director-csr/no_outliers_env.csv')
soc_df = pd.read_csv('/content/drive/My Drive/director-csr/no_outliers_soc.csv')
env_df.drop(columns=['Unnamed: 0'], inplace=True)
soc_df.drop(columns=['Unnamed: 0'], inplace=True)
env_df.head()


Unnamed: 0,env_score,independent_dirs,board_size,roa,leverage,slack,board_tenure,environmental_exp,environmental_comm_exp,ticker,comp_name,isin,year_score,control_year,supersector,ceo_duality
0,83.891263,90.0,10.0,14.65,16.88,20.07,9.38,0.4,0.0,mmm,3m co,us88579y1010,2012,2011,industrial goods and services,0
1,52.133087,88.24,13.0,9.2,25.27,23.09,8.91,0.461538,0.0,abt,abbott laboratories,us0028241000,2012,2011,health care,0
2,80.082004,90.0,9.0,10.23,16.84,34.79,13.23,0.111111,0.0,adbe,adobe inc,us00724f1012,2012,2011,technology,0
3,7.411255,81.82,10.0,11.72,11.82,10.37,5.36,0.181818,0.0,aap,advance auto parts inc,us00751y1064,2012,2011,retailers,1
4,57.98028,87.5,13.0,4.85,34.79,21.86,7.41,0.2,0.0,aep,,us0255371017,2012,2011,utilities,0


In [18]:
# get the names of the industries
industries = list(env_df['supersector'].unique())
industries


['industrial goods and services',
 'health care',
 'technology',
 'retailers',
 'utilities',
 'insurance',
 'real estate',
 'chemicals',
 'financial services',
 'food, beverage and tobacco',
 'travel and leisure',
 'telecommunications',
 'basic resources',
 'energy',
 'banks',
 'automobiles and parts',
 'personal care, drug and grocery stores',
 'media',
 'consumer products and services',
 'construction and materials']

In [19]:
# define X and y
X_soc_exp, y_soc_exp = prep_data(soc_df, 'soc_score', 'social_exp', 'industry')
# add a constant
X_soc_exp = sm.add_constant(X_soc_exp)
X_soc_comm, y_soc_comm = prep_data(soc_df, 'soc_score', 'social_comm_exp', 'industry')
# add a constant
X_soc_comm = sm.add_constant(X_soc_comm)
X_env_exp, y_env_exp = prep_data(env_df, 'env_score', 'environmental_exp', 'industry')
# add a constant
X_env_exp = sm.add_constant(X_env_exp)
X_env_comm, y_env_comm = prep_data(env_df, 'env_score', 'environmental_comm_exp', 'industry')
# add a constant
X_env_comm = sm.add_constant(X_env_comm)


In [20]:
# fixed effects regression removed outliers
# time and entity
mod = lm.PanelOLS(y_env_comm, X_env_comm, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,env_score,R-squared:,0.1382
Estimator:,PanelOLS,R-squared (Between):,-0.0539
No. Observations:,1397,R-squared (Within):,0.1354
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.0813
Time:,14:38:04,Log-likelihood,-6201.5
Cov. Estimator:,Clustered,,
,,F-statistic:,27.363
Entities:,20,P-value,0.0000
Avg Obs:,69.850,Distribution:,"F(8,1365)"
Min Obs:,18.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-2.9711,11.012,-0.2698,0.7874,-24.574,18.632
independent_dirs,0.2826,0.0993,2.8443,0.0045,0.0877,0.4774
board_size,3.2033,0.5263,6.0860,0.0000,2.1708,4.2359
roa,0.3230,0.1661,1.9452,0.0520,-0.0027,0.6488
leverage,-0.0994,0.0686,-1.4494,0.1474,-0.2339,0.0351
slack,-0.1718,0.0834,-2.0609,0.0395,-0.3354,-0.0083
board_tenure,-0.1962,0.3848,-0.5099,0.6102,-0.9512,0.5587
environmental_comm_exp,40.941,8.2236,4.9784,0.0000,24.808,57.073
ceo_duality,-2.5803,2.3257,-1.1095,0.2674,-7.1427,1.9821


In [21]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_env_comm.shape[0] - 1) / (X_env_comm.shape[0] - (X_env_comm.shape[1]-1) -1))
adjusted_r2


0.13614097545612383

In [22]:
# fixed effects regression removed outliers
# time and entity
mod = lm.PanelOLS(y_env_exp, X_env_exp, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,env_score,R-squared:,0.1094
Estimator:,PanelOLS,R-squared (Between):,-0.1561
No. Observations:,1397,R-squared (Within):,0.1059
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.0429
Time:,14:38:04,Log-likelihood,-6224.4
Cov. Estimator:,Clustered,,
,,F-statistic:,20.964
Entities:,20,P-value,0.0000
Avg Obs:,69.850,Distribution:,"F(8,1365)"
Min Obs:,18.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-9.5774,11.953,-0.8013,0.4231,-33.025,13.871
independent_dirs,0.3317,0.0948,3.4981,0.0005,0.1457,0.5178
board_size,3.3442,0.5251,6.3684,0.0000,2.3141,4.3743
roa,0.3464,0.1747,1.9830,0.0476,0.0037,0.6890
leverage,-0.0934,0.0754,-1.2391,0.2155,-0.2412,0.0545
slack,-0.1923,0.0878,-2.1914,0.0286,-0.3645,-0.0202
board_tenure,-0.1933,0.4049,-0.4774,0.6332,-0.9875,0.6010
environmental_exp,11.451,8.6479,1.3241,0.1857,-5.5138,28.415
ceo_duality,-1.5401,2.4557,-0.6272,0.5307,-6.3574,3.2772


In [23]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_env_exp.shape[0] - 1) / (X_env_exp.shape[0] - (X_env_exp.shape[1]-1) -1))
adjusted_r2


0.1064997817464292

In [24]:
# fixed effects regression removed outliers
# time and entity
mod = lm.PanelOLS(y_soc_exp, X_soc_exp, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,soc_score,R-squared:,0.1419
Estimator:,PanelOLS,R-squared (Between):,-0.1062
No. Observations:,1700,R-squared (Within):,0.1397
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.0978
Time:,14:38:04,Log-likelihood,-7146.4
Cov. Estimator:,Clustered,,
,,F-statistic:,34.472
Entities:,20,P-value,0.0000
Avg Obs:,85.000,Distribution:,"F(8,1668)"
Min Obs:,25.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-5.5604,7.5898,-0.7326,0.4639,-20.447,9.3261
independent_dirs,0.4394,0.0521,8.4356,0.0000,0.3372,0.5416
board_size,2.1672,0.3138,6.9056,0.0000,1.5516,2.7827
roa,0.2458,0.1518,1.6195,0.1055,-0.0519,0.5435
leverage,-0.0103,0.0651,-0.1584,0.8742,-0.1380,0.1174
slack,-0.1327,0.0738,-1.7979,0.0724,-0.2775,0.0121
board_tenure,-0.2108,0.1990,-1.0591,0.2897,-0.6012,0.1796
social_exp,16.257,3.5046,4.6387,0.0000,9.3827,23.130
ceo_duality,-3.7797,1.4325,-2.6386,0.0084,-6.5893,-0.9701


In [25]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_soc_exp.shape[0] - 1) / (X_soc_exp.shape[0] - (X_soc_exp.shape[1]-1) -1))
adjusted_r2


0.14040824730003063

In [26]:
# fixed effects regression removed outliers
# time and entity
mod = lm.PanelOLS(y_soc_comm, X_soc_comm, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,soc_score,R-squared:,0.1722
Estimator:,PanelOLS,R-squared (Between):,-0.0765
No. Observations:,1700,R-squared (Within):,0.1714
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1260
Time:,14:38:04,Log-likelihood,-7115.7
Cov. Estimator:,Clustered,,
,,F-statistic:,43.381
Entities:,20,P-value,0.0000
Avg Obs:,85.000,Distribution:,"F(8,1668)"
Min Obs:,25.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,2.9918,6.8353,0.4377,0.6617,-10.415,16.399
independent_dirs,0.3929,0.0553,7.1009,0.0000,0.2844,0.5015
board_size,2.0250,0.2924,6.9247,0.0000,1.4514,2.5985
roa,0.2468,0.1303,1.8948,0.0583,-0.0087,0.5023
leverage,-0.0116,0.0557,-0.2084,0.8350,-0.1208,0.0976
slack,-0.1114,0.0740,-1.5047,0.1326,-0.2565,0.0338
board_tenure,-0.2194,0.1871,-1.1724,0.2412,-0.5864,0.1476
social_comm_exp,30.982,4.5369,6.8290,0.0000,22.084,39.881
ceo_duality,-4.1980,1.2175,-3.4479,0.0006,-6.5861,-1.8099


In [27]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_soc_comm.shape[0] - 1) / (X_soc_comm.shape[0] - (X_soc_comm.shape[1]-1) - 1))
adjusted_r2


0.17221394072831087

## Winsorized outliers
Outliers were winsorized at the 1st and 99th percentile

In [28]:
# read in the datasets without outliers
env_df = pd.read_csv('/content/drive/My Drive/director-csr/winsorized_env.csv')
soc_df = pd.read_csv('/content/drive/My Drive/director-csr/winsorized_soc.csv')
env_df.drop(columns=['Unnamed: 0'], inplace=True)
soc_df.drop(columns=['Unnamed: 0'], inplace=True)
env_df.head()


Unnamed: 0,ticker,comp_name,isin,env_score,year_score,independent_dirs,board_size,roa,leverage,slack,board_tenure,control_year,supersector,environmental_exp,environmental_comm_exp,ceo_duality
0,mmm,3m co,us88579y1010,83.891263,2012,90.0,10.0,14.65,16.88,20.07,9.38,2011,industrial goods and services,0.4,0.0,0
1,abt,abbott laboratories,us0028241000,52.133087,2012,88.24,13.0,9.2,25.27,23.09,8.91,2011,health care,0.461538,0.0,0
2,atvi,activision blizzard inc,us00507v1098,0.0,2012,50.0,12.0,8.02,0.0,35.54,7.29,2011,consumer products and services,0.090909,0.0,1
3,adbe,adobe inc,us00724f1012,80.082004,2012,90.0,9.0,10.23,16.84,34.79,13.23,2011,technology,0.111111,0.0,0
4,aap,advance auto parts inc,us00751y1064,7.411255,2012,81.82,10.0,11.72,11.82,10.37,5.36,2011,retailers,0.181818,0.0,1


In [29]:
# get the names of the industries
industries = list(env_df['supersector'].unique())
industries


['industrial goods and services',
 'health care',
 'consumer products and services',
 'technology',
 'retailers',
 'utilities',
 'insurance',
 'real estate',
 'chemicals',
 'financial services',
 'food, beverage and tobacco',
 'travel and leisure',
 'personal care, drug and grocery stores',
 'automobiles and parts',
 'telecommunications',
 'basic resources',
 'energy',
 'banks',
 'media',
 'construction and materials']

In [30]:
# define X and y
X_soc_exp, y_soc_exp = prep_data(soc_df, 'soc_score', 'social_exp', 'industry')
# add a constant
X_soc_exp = sm.add_constant(X_soc_exp)
X_soc_comm, y_soc_comm = prep_data(soc_df, 'soc_score', 'social_comm_exp', 'industry')
# add a constant
X_soc_comm = sm.add_constant(X_soc_comm)
X_env_exp, y_env_exp = prep_data(env_df, 'env_score', 'environmental_exp', 'industry')
# add a constant
X_env_exp = sm.add_constant(X_env_exp)
X_env_comm, y_env_comm = prep_data(env_df, 'env_score', 'environmental_comm_exp', 'industry')
# add a constant
X_env_comm = sm.add_constant(X_env_comm)


In [31]:
# fixed effects regression winsorized outliers
# time and entity
mod = lm.PanelOLS(y_env_comm, X_env_comm, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,env_score,R-squared:,0.2071
Estimator:,PanelOLS,R-squared (Between):,-0.2380
No. Observations:,2201,R-squared (Within):,0.2078
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1329
Time:,14:38:05,Log-likelihood,-1.005e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,70.835
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-33.789,10.517,-3.2129,0.0013,-54.413,-13.165
independent_dirs,0.5155,0.0990,5.2097,0.0000,0.3215,0.7096
board_size,3.9717,0.4322,9.1903,0.0000,3.1242,4.8192
roa,0.2142,0.1510,1.4182,0.1563,-0.0820,0.5103
leverage,-0.0400,0.0683,-0.5864,0.5576,-0.1739,0.0939
slack,-0.3187,0.0816,-3.9078,0.0001,-0.4787,-0.1588
board_tenure,-0.1268,0.3586,-0.3536,0.7237,-0.8301,0.5764
environmental_comm_exp,39.726,10.653,3.7291,0.0002,18.835,60.616
ceo_duality,-5.0586,1.9753,-2.5610,0.0105,-8.9322,-1.1851


In [32]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_env_comm.shape[0] - 1) / (X_env_comm.shape[0] - (X_env_comm.shape[1]-1) -1))
adjusted_r2


0.20859377786772415

In [33]:
# fixed effects regression winsorized outliers
# time and entity
mod = lm.PanelOLS(y_env_exp, X_env_exp, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,env_score,R-squared:,0.1910
Estimator:,PanelOLS,R-squared (Between):,-0.3595
No. Observations:,2201,R-squared (Within):,0.1914
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1016
Time:,14:38:05,Log-likelihood,-1.007e+04
Cov. Estimator:,Clustered,,
,,F-statistic:,63.999
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-37.769,9.8892,-3.8193,0.0001,-57.163,-18.376
independent_dirs,0.5276,0.0917,5.7562,0.0000,0.3479,0.7074
board_size,4.1402,0.4300,9.6274,0.0000,3.2968,4.9835
roa,0.2080,0.1502,1.3845,0.1663,-0.0866,0.5025
leverage,-0.0429,0.0687,-0.6244,0.5325,-0.1777,0.0919
slack,-0.3337,0.0851,-3.9234,0.0001,-0.5005,-0.1669
board_tenure,-0.1425,0.3652,-0.3903,0.6964,-0.8586,0.5736
environmental_exp,16.751,6.3346,2.6443,0.0082,4.3282,29.173
ceo_duality,-4.1150,2.0980,-1.9614,0.0500,-8.2293,-0.0006


In [34]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_env_exp.shape[0] - 1) / (X_env_exp.shape[0] - (X_env_exp.shape[1]-1) -1))
adjusted_r2


0.19210315922038892

In [35]:
# fixed effects regression winsorized outliers
# time and entity
mod = lm.PanelOLS(y_soc_exp, X_soc_exp, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,soc_score,R-squared:,0.1972
Estimator:,PanelOLS,R-squared (Between):,-0.2033
No. Observations:,2201,R-squared (Within):,0.1964
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1373
Time:,14:38:05,Log-likelihood,-9353.6
Cov. Estimator:,Clustered,,
,,F-statistic:,66.581
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-9.1662,6.6224,-1.3841,0.1665,-22.153,3.8207
independent_dirs,0.4614,0.0618,7.4635,0.0000,0.3402,0.5827
board_size,2.4224,0.2600,9.3153,0.0000,1.9124,2.9324
roa,0.2451,0.1051,2.3330,0.0197,0.0391,0.4512
leverage,-0.0132,0.0544,-0.2431,0.8079,-0.1199,0.0935
slack,-0.1646,0.0676,-2.4350,0.0150,-0.2972,-0.0320
board_tenure,-0.3202,0.1805,-1.7747,0.0761,-0.6741,0.0336
social_exp,17.439,3.7126,4.6973,0.0000,10.158,24.720
ceo_duality,-3.0381,1.4869,-2.0432,0.0412,-5.9541,-0.1221


In [36]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_soc_exp.shape[0] - 1) / (X_soc_exp.shape[0] - (X_soc_exp.shape[1]-1) -1))
adjusted_r2


0.19712251981231627

In [37]:
# fixed effects regression winsorized outliers
# time and entity
mod = lm.PanelOLS(y_soc_comm, X_soc_comm, time_effects=True, entity_effects=True)
res = mod.fit(cov_type='clustered', cluster_time=True, cluster_entity=True)
res


0,1,2,3
Dep. Variable:,soc_score,R-squared:,0.2164
Estimator:,PanelOLS,R-squared (Between):,-0.1532
No. Observations:,2201,R-squared (Within):,0.2158
Date:,"Thu, Apr 29 2021",R-squared (Overall):,0.1629
Time:,14:38:06,Log-likelihood,-9327.0
Cov. Estimator:,Clustered,,
,,F-statistic:,74.854
Entities:,20,P-value,0.0000
Avg Obs:,110.05,Distribution:,"F(8,2169)"
Min Obs:,35.000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
const,-3.4085,7.0963,-0.4803,0.6310,-17.325,10.508
independent_dirs,0.4444,0.0608,7.3136,0.0000,0.3252,0.5636
board_size,2.3290,0.3001,7.7604,0.0000,1.7404,2.9175
roa,0.2661,0.0944,2.8193,0.0049,0.0810,0.4512
leverage,-0.0147,0.0534,-0.2753,0.7831,-0.1193,0.0900
slack,-0.1523,0.0673,-2.2637,0.0237,-0.2843,-0.0204
board_tenure,-0.3358,0.1796,-1.8694,0.0617,-0.6881,0.0165
social_comm_exp,27.890,4.6367,6.0152,0.0000,18.798,36.983
ceo_duality,-3.7476,1.3362,-2.8048,0.0051,-6.3679,-1.1273


In [38]:
# calculate the adjusted 
r2 = res.rsquared_within

adjusted_r2 = (1 - (1 - r2))*((X_soc_comm.shape[0] - 1) / (X_soc_comm.shape[0] - (X_soc_comm.shape[1]-1) - 1))
adjusted_r2


0.2166070641204397