In [1]:
import numpy as np
import pandas as pd
from src.model import OLSModel

In [2]:
# Name of the outcome (dependent-variable) column passed as `y` to the models below.
Y_COLUMN = 'reputation_score_2020'

## read in data

In [3]:
# Load the cleaned model-input dataset into the notebook-level frame.
df = pd.read_csv(
    filepath_or_buffer='../data/modelinput/information_governance_clean_dataset.csv',
)

## --> ols model by theory features

In [4]:
# Columns that must never be used as regressors. The outcome column is taken
# from Y_COLUMN (defined in the configuration cell above) so its name is
# spelled out in exactly one place.
fixed_model_columns = [
    'firmhash',
    'firm',
    'sector',
    'young_firm',
    Y_COLUMN,
]

In [5]:
# Every column that is not explicitly held out becomes a regressor
# (original column order is preserved).
model_features = df.columns[~df.columns.isin(fixed_model_columns)].tolist()
model = OLSModel(df, model_features, Y_COLUMN)

In [6]:
def _significance_stars(p_value):
    """Return the significance marker for a p-value.

    '***' for p <= 0.001, '**' for p <= 0.01, '*' for p <= 0.05, otherwise ''.
    """
    if p_value <= 0.001:
        return '***'
    if p_value <= 0.01:
        return '**'
    if p_value <= 0.05:
        return '*'
    return ''


def scientific_ols_table(model):
    """Format a fitted model's coefficient table for publication.

    Parameters
    ----------
    model
        Fitted model object. It must expose ``summary()`` whose ``tables[1]``
        has an ``as_html()`` method rendering the coefficient table (columns
        ``coef``, ``std err``, ``t``, ``P>|t|``, ``[0.025``, ``0.975]``), and
        the attributes ``rsquared``, ``rsquared_adj``, ``fvalue`` and
        ``f_pvalue``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * Coefficient table: one row per term, all cells formatted as strings
          (coefficient with significance stars, SE, t statistic, 95% CI,
          p value).
        * Fit statistics: R2, adjusted R2, F (with significance stars) and the
          F-test probability, in a single 'Coefficient (b)' column.

    Notes
    -----
    Values are read back from the rendered HTML summary, so they carry the
    rounding of that rendering.
    """
    # Local import keeps the cell self-contained; pandas deprecated passing a
    # literal HTML string to read_html, so the markup is wrapped in StringIO.
    from io import StringIO

    coef_table = pd.read_html(
        StringIO(model.summary().tables[1].as_html()),
        header=0,
        index_col=0,
    )[0]

    INDEX_SWAP = {
        'composite_relational_ig_practises': 'Composite Relational IG Practises',
        'composite_formal_ig_practises': 'Composite Formal IG Practises',
        'return_on_assets': 'Return on Assets',
        'log_n_employees': 'Number of employees (LOG)',
        'csr_index': 'CSR Index',
        'logmin_n_data_breaches': 'Number of Data Breaches (LOG, min = 1)',
        'sqrt_age_in_years': 'Age (Square Root)',
    }
    output_columns = [
        'Coefficient (b)',
        'SE',
        't statistic',
        '95% CI',
        'p value',
    ]

    row_labels = []
    formatted_rows = []
    for term, row in coef_table.iterrows():
        if 'dummy' in term:
            # Dummy terms are shown indented under their group, using the
            # text after 'sector_'. Fall back to the raw term name when the
            # marker is absent instead of raising IndexError.
            pieces = term.split('sector_')
            row_labels.append(f"  {pieces[1] if len(pieces) > 1 else term}")
        else:
            # Known terms get a hand-written label; anything else is title-cased.
            row_labels.append(INDEX_SWAP.get(term, term.title()))

        p_value = row['P>|t|']
        formatted_rows.append({
            'Coefficient (b)': '{:.3f}{}'.format(row['coef'], _significance_stars(p_value)),
            'SE': '{:.2f}'.format(row['std err']),
            't statistic': '{:.2f}'.format(row['t']),
            '95% CI': '{:.2f} to {:.2f}'.format(row['[0.025'], row['0.975]']),
            'p value': '{:.3f}'.format(p_value),
        })

    # Built once after the loop; an empty coefficient table yields an empty
    # frame with the expected columns.
    coefficients = pd.DataFrame(formatted_rows, columns=output_columns, index=row_labels)

    fit_statistics = pd.DataFrame({
        'Coefficient (b)': [
            '{:.3f}'.format(model.rsquared),
            '{:.3f}'.format(model.rsquared_adj),
            '{:.3f}{}'.format(model.fvalue, _significance_stars(model.f_pvalue)),
            '{:.3f}'.format(model.f_pvalue),
        ]
    }, index=[
        'R2',
        'Adjusted R2',
        'F',
        'Probability F',
    ])

    return coefficients, fit_statistics

In [7]:
# Build the formatted coefficient table and the fit-statistics table from the model.
df_ols, df_ols_extra = scientific_ols_table(model)

In [9]:
# Display the formatted coefficient table.
df_ols

Unnamed: 0,Coefficient (b),SE,t statistic,95% CI,p value
Composite Relational IG Practises,1.665**,0.63,2.63,0.41 to 2.92,0.01
Composite Formal IG Practises,0.846*,0.39,2.19,0.08 to 1.61,0.031
Return on Assets,2.347,1.32,1.77,-0.28 to 4.97,0.079
Number of employees (LOG),0.161*,0.07,2.38,0.03 to 0.30,0.019
CSR Index,2.438,2.2,1.11,-1.94 to 6.81,0.271
"Number of Data Breaches (LOG, min = 1)",-0.036,0.09,-0.38,-0.22 to 0.15,0.705
Age (Square Root),0.015,0.03,0.59,-0.04 to 0.07,0.558
Health Care,0.058,0.15,0.39,-0.24 to 0.35,0.701
Technology,-0.070,0.19,-0.36,-0.45 to 0.31,0.716
Media,0.952***,0.29,3.3,0.38 to 1.53,0.001


In [10]:
# Display the model-level fit statistics.
df_ols_extra

Unnamed: 0,Coefficient (b)
R2,0.298
Adjusted R2,0.173
F,2.378**
Probability F,0.004


In [8]:
# Write both result tables to one workbook, each on its own sheet.
with pd.ExcelWriter(
    '../data/modeloutput/scientific_ols_table.xlsx',
    mode='w',
) as workbook:
    df_ols.to_excel(workbook, sheet_name='ols_betas')
    df_ols_extra.to_excel(workbook, sheet_name='ols')

## --> PROCESS macro for moderation effect

In [11]:
from pyprocessmacro import Process

In [12]:
# PROCESS model 1: moderation of the relational-practices effect on the
# outcome by `young_firm`. Bound to its own name so the fitted OLS model in
# `model` is not overwritten and earlier cells stay re-runnable.
relational_moderation = Process(
    data=df,
    model=1,
    x=['composite_relational_ig_practises'],
    y=Y_COLUMN,
    m=["young_firm"],
)
relational_moderation.summary()

Process successfully initialized.
Based on the Process Macro by Andrew F. Hayes, Ph.D. (www.afhayes.com)


****************************** SPECIFICATION ****************************

Model = 1

Variables:
    Cons = Cons
    x = composite_relational_ig_practises
    y = reputation_score_2020
    m = young_firm

Sample size:
113

***************************** OUTCOME MODELS ****************************

Outcome = reputation_score_2020 
OLS Regression Summary

     R²  Adj. R²    MSE      F  df1  df2  p-value
 0.0863   0.0525 0.3880 3.4318    3  109   0.0196

Coefficients

                                               coeff     se       t      p    LLCI   ULCI
Cons                                          6.0895 0.4321 14.0940 0.0000  5.2427 6.9364
composite_relational_ig_practises             1.4801 0.9649  1.5339 0.1279 -0.4111 3.3713
young_firm                                   -0.2195 0.5350 -0.4103 0.6824 -1.2681 0.8291
composite_relational_ig_practises*young_firm  0.3006 1.1536  0.

  and should_run_async(code)


In [13]:
# PROCESS model 1: moderation of the formal-practices effect on the outcome
# by `young_firm`. Bound to its own name so it does not replace whatever
# `model` currently refers to.
formal_moderation = Process(
    data=df,
    model=1,
    x=['composite_formal_ig_practises'],
    y=Y_COLUMN,
    m=["young_firm"],
)
formal_moderation.summary()

Process successfully initialized.
Based on the Process Macro by Andrew F. Hayes, Ph.D. (www.afhayes.com)


****************************** SPECIFICATION ****************************

Model = 1

Variables:
    Cons = Cons
    x = composite_formal_ig_practises
    y = reputation_score_2020
    m = young_firm

Sample size:
113

***************************** OUTCOME MODELS ****************************

Outcome = reputation_score_2020 
OLS Regression Summary

     R²  Adj. R²    MSE      F  df1  df2  p-value
 0.0484   0.0132 0.4041 1.8487    3  109   0.1426

Coefficients

                                           coeff     se       t      p    LLCI   ULCI
Cons                                      6.6341 0.1827 36.3145 0.0000  6.2761 6.9922
composite_formal_ig_practises             0.3364 0.5144  0.6540 0.5145 -0.6718 1.3446
young_firm                               -0.2537 0.2531 -1.0026 0.3183 -0.7498 0.2423
composite_formal_ig_practises*young_firm  0.6483 0.6741  0.9618 0.3383 -0.6728 1.96

  and should_run_async(code)


## --> ols model by output of feature selection

In [None]:
import json

In [None]:
# Read the stored feature-selection fits from disk.
with open('../data/modeloutput/feature_fits_20210421_002632.json', 'r') as fits_file:
    results = json.loads(fits_file.read())

In [None]:
# Select the stored fit with the highest adjusted R² (first one wins on ties).
# The running maximum starts at -inf rather than 0 so a best fit is still
# chosen when every candidate has a non-positive adjusted R².
max_score = float('-inf')
n_significant = None
best_features = None
for fit in results.values():
    if fit['rsquared_adj'] > max_score:
        max_score = fit['rsquared_adj']
        n_significant = fit['n_significant']
        best_features = fit['features']

# Fail here with a clear message instead of passing None on to the model.
if best_features is None:
    raise ValueError('no feature fit could be selected from `results`')

In [None]:
# Fit an OLS model on the best-scoring feature subset.
model = OLSModel(df, best_features, Y_COLUMN)

In [None]:
# Print the plain-text summary of the model fitted on the selected features.
print(model.summary())