In [None]:
import numpy as np
import pandas as pd
from src.model import OLSModel

In [None]:
# Name of the dependent-variable column passed to every model in this notebook.
Y_COLUMN = 'reputation_score_2020'

## read in data

In [None]:
# Load the model-input CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/modelinput/information_governance_clean_dataset.csv')

## --> OLS model by theory features

In [None]:
# Dependent-variable column handed to the models below.
Y_COLUMN = 'reputation_score_2020'

# Columns that are never used as regressors (the feature list built further
# down is "every column except these"); the target itself is one of them.
fixed_model_columns = ['firmhash', 'firm', 'sector', 'young_firm', Y_COLUMN]

In [None]:
# Regressors are all dataset columns that are not listed in fixed_model_columns,
# kept in their original column order.
model_features = [
    column
    for column in df.columns
    if column not in fixed_model_columns
]

# Build the OLS model from the frame, the regressor names and the target column.
model = OLSModel(df, model_features, Y_COLUMN)

In [None]:
def _significance_stars(p_value):
    """Return the significance marker for a p value.

    '***' for p <= 0.001, '**' for p <= 0.01, '*' for p <= 0.05, '' otherwise
    (including NaN, for which every comparison is false).
    """
    if p_value <= 0.001:
        return '***'
    if p_value <= 0.01:
        return '**'
    if p_value <= 0.05:
        return '*'
    return ''


def scientific_ols_table(model):
    """Format a fitted OLS model as two publication-style tables.

    Parameters
    ----------
    model
        Fitted model exposing ``summary()`` (whose second table can be rendered
        with ``as_html()``) and the attributes ``rsquared``, ``rsquared_adj``,
        ``fvalue`` and ``f_pvalue``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * Coefficient table: one row per term with the string columns
          'Coefficient (b)' (3 decimals plus significance stars), 'SE',
          't statistic', '95% CI' ("low to high") and 'p value'.
        * Fit statistics: rows 'R2', 'Adjusted R2', 'F' (with stars) and
          'Probability F' in a single 'Coefficient (b)' column.

    Notes
    -----
    Coefficient statistics are parsed back from the rendered summary table, so
    the stars are based on the p values as they appear in that table.
    """
    # Function-scope import keeps this cell self-contained.
    from io import StringIO

    # pandas >= 2.1 deprecates passing literal HTML to read_html; a file-like
    # wrapper is accepted by old and new versions alike.
    summary_html = model.summary().tables[1].as_html()
    dataf = pd.read_html(StringIO(summary_html), header=0, index_col=0)[0]

    INDEX_SWAP = {
        'composite_relational_ig_practises': 'Composite Relational IG Practises',
        'composite_formal_ig_practises': 'Composite Formal IG Practises',
        'return_on_assets': 'Return on Assets',
        'log_n_employees': 'Number of employees (LOG)',
        'csr_index': 'CSR Index',
        'logmin_n_data_breaches': 'Number of Data Breaches (LOG, min = 1)',
        'sqrt_age_in_years': 'Age (Square Root)',
    }

    new_indices = []
    coefs = []
    stderr = []
    tvalue = []
    confint = []
    pvalue = []
    for name, row in dataf.iterrows():
        if 'dummy' in name and 'sector_' in name:
            # Sector dummies are shown indented, labelled by the sector only.
            new_indices.append(f"  {name.split('sector_')[1]}")
        else:
            # Known terms get a curated label; anything else is title-cased.
            new_indices.append(INDEX_SWAP.get(name, name.title()))

        p_value = row['P>|t|']
        coefs.append('{:.3f}{}'.format(row['coef'], _significance_stars(p_value)))
        stderr.append('{:.2f}'.format(row['std err']))
        tvalue.append('{:.2f}'.format(row['t']))
        # Built once per row, so it is also defined (empty) for an empty table.
        confint.append('{:.2f} to {:.2f}'.format(row['[0.025'], row['0.975]']))
        pvalue.append('{:.3f}'.format(p_value))

    coef_table = pd.DataFrame(np.column_stack([
        coefs,
        stderr,
        tvalue,
        confint,
        pvalue,
    ]), columns=[
        'Coefficient (b)',
        'SE',
        't statistic',
        '95% CI',
        'p value',
    ], index=new_indices)

    fit_table = pd.DataFrame({
        'Coefficient (b)': [
            '{:.3f}'.format(model.rsquared),
            '{:.3f}'.format(model.rsquared_adj),
            '{:.3f}{}'.format(model.fvalue, _significance_stars(model.f_pvalue)),
            '{:.3f}'.format(model.f_pvalue),
        ]
    }, index=[
        'R2',
        'Adjusted R2',
        'F',
        'Probability F',
    ])

    return coef_table, fit_table

In [None]:
# Build the coefficient table (df_ols) and the fit-statistics table (df_ols_extra) from the model.
df_ols, df_ols_extra = scientific_ols_table(model)

In [None]:
# Write both OLS tables to one workbook, each on its own sheet
# (mode='w' replaces any existing file at that path).
output_path = '../data/modeloutput/scientific_ols_table.xlsx'
sheets = (
    ('ols_betas', df_ols),
    ('ols', df_ols_extra),
)
with pd.ExcelWriter(output_path, mode='w') as writer:
    for sheet_name, table in sheets:
        table.to_excel(writer, sheet_name=sheet_name)

## --> PROCESS macro for moderation effect

In [None]:
from pyprocessmacro import Process

In [None]:
# PROCESS model 1 for the moderation analysis: relational IG practises as x,
# young_firm passed as m, Y_COLUMN as the outcome.
# Note: this rebinds `model`, replacing the OLS model fitted earlier.
model = Process(
    data=df,
    model=1,
    x=['composite_relational_ig_practises'],
    y=Y_COLUMN,
    m=["young_firm"],
)
model.summary()

In [None]:
# PROCESS model 1 for the moderation analysis: formal IG practises as x,
# young_firm passed as m, Y_COLUMN as the outcome.
# Note: this rebinds `model` again.
model = Process(
    data=df,
    model=1,
    x=['composite_formal_ig_practises'],
    y=Y_COLUMN,
    m=["young_firm"],
)
model.summary()

## --> OLS model by output of feature selection

In [None]:
import json

In [None]:
# Read the stored feature-selection fits from JSON into `results`.
with open('../data/modeloutput/feature_fits_20210421_002632.json', 'r') as fits_file:
    results = json.loads(fits_file.read())

In [None]:
# Scan the stored fits and keep the one with the highest adjusted R-squared.
# The comparison is strict, so the first fit wins on ties and a fit scoring
# 0 or below is never selected (best_features then stays None).
count = 0
max_score = 0
best_features = None
for fit in results.values():
    if not fit['rsquared_adj'] > max_score:
        continue
    max_score = fit['rsquared_adj']
    n_significant = fit['n_significant']
    best_features = fit['features']

In [None]:
# Fit the OLS model on the feature subset chosen by the feature-selection scan.
# This uses OLSModel, imported at the top of the notebook and called with the
# same (frame, features, target) arguments as the theory-driven model; the
# previously referenced `_model` is not defined in any visible cell and raised
# NameError on a fresh kernel.
model = OLSModel(df, best_features, Y_COLUMN)

In [None]:
# Print the summary of the model fitted on the selected features as plain text.
print(model.summary())