In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: ggplot theme and a 12x8 default figure size.
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": [12, 8]})

In [None]:
# Columns left out of the correlation analysis (see how corr_columns is built).
fixed_model_columns = ['firmhash', 'firm', 'sector']

In [None]:
# Read the model-input CSV into the frame used by every later cell.
df_model = pd.read_csv(
    filepath_or_buffer='../data/modelinput/information_governance_clean_dataset.csv',
)

In [None]:
# Show the dtype pandas assigned to each column of the loaded frame.
df_model.dtypes

In [None]:
# Count the non-null `firm` entries in each sector, then display the counts
# with the largest sector first.
sector_firm_count = (
    df_model
    .groupby('sector')[['firm']]
    .count()
)
sector_firm_count.sort_values(by='firm', ascending=False)

In [None]:
# Horizontal box plot of the 2020 reputation score.
df_model.boxplot(column=['reputation_score_2020'], vert=False)

In [None]:
# Histogram of the log-transformed employee count.
df_model.loc[:, ['log_n_employees']].hist()

In [None]:
# Every column of the dataset except those listed in fixed_model_columns,
# in their original order.
corr_columns = list(
    filter(lambda name: name not in fixed_model_columns, df_model.columns)
)

In [None]:
# Switch the default figure size to 20x8 for the correlation matrix; this
# changes the global rcParams, so it also applies to figures created afterwards.
plt.rcParams.update({"figure.figsize": [20, 8]})
heatmap = sns.heatmap(
    data=df_model[corr_columns].corr(),
    vmin=-1,
    vmax=1,
    annot=True,
)
heatmap.set_title(label='Correlation Heatmap', fontdict={'fontsize': 12}, pad=12);

In [None]:
def scientific_correlation_table(dataf, index_renames=None):
    """Build a publication-style descriptive statistics and correlation table.

    Each row describes one column of ``dataf``: its mean, its standard
    deviation and the lower triangle (diagonal included) of the pairwise
    correlation matrix. Correlation cells hold the strings returned by
    ``generate_pcor_and_pstars``; cells above the diagonal are left empty
    (``None``).

    Parameters
    ----------
    dataf : pandas.DataFrame
        Variables to summarise, one per column. Means and standard deviations
        come from ``DataFrame.mean`` / ``DataFrame.std`` and are formatted
        with two decimals.
    index_renames : dict, optional
        Mapping from column name to the label shown in the row index.
        Columns that are missing from the mapping keep their own name.

    Returns
    -------
    pandas.DataFrame
        One row per variable, labelled ``"<n>. <name>"``, with the columns
        ``Mean``, ``S.D.`` and one numbered column (``"1."``, ``"2."``, ...)
        per variable.
    """
    variables = list(dataf.columns)
    renames = index_renames if index_renames is not None else {}

    # Summarise the frame that was passed in rather than notebook-level
    # globals, so the function works on any frame and on a fresh kernel.
    mean_values = ['{:.2f}'.format(v) for v in dataf.mean().values]
    std_values = ['{:.2f}'.format(v) for v in dataf.std().values]

    columns = ['Mean', 'S.D.']
    table_values = [mean_values, std_values]
    indices = []

    for position, mainc in enumerate(variables):
        # Only the diagonal and the cells below it are shown, so the
        # correlations above the diagonal are never computed.
        shown = [
            generate_pcor_and_pstars(dataf[corc], dataf[mainc])
            for corc in variables[position:]
        ]
        table_values.append([None] * position + shown)
        # Apply the rename directly to the column name instead of parsing it
        # back out of the formatted label.
        indices.append(f'{position + 1}. {renames.get(mainc, mainc)}')
        columns.append(f'{position + 1}.')

    return pd.DataFrame(np.column_stack(table_values), index=indices, columns=columns)


def generate_pcor_and_pstars(values1, values2):
    """Format the Pearson correlation of two samples as ``"<r><stars>"``.

    The coefficient is rendered with two decimals and followed by ``***``,
    ``**`` or ``*`` when the p-value reported by ``stats.pearsonr`` is at
    most 0.001, 0.01 or 0.05 respectively. A coefficient above 0.99999
    never receives stars.
    """
    result = stats.pearsonr(values1, values2)
    coefficient, p_value = result[0], result[1]

    stars = ''
    # Coefficients above 0.99999 are left without significance stars.
    if not coefficient > 0.99999:
        # Cut-offs are checked from strictest to loosest; the first match wins.
        for cutoff, marker in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
            if p_value <= cutoff:
                stars = marker
                break

    return f'{coefficient:.2f}{stars}'

In [None]:
# Row labels for the correlation table, keyed by dataset column name.
new_index_names = dict(
    reputation_score_2020='Corporate Reputation Score',
    composite_relational_ig_practises='Relational IG Practises Composite ',
    composite_formal_ig_practises='Formal IG Practises Composite ',
    return_on_assets='Return on Assets',
    log_n_employees='Log Number of Employees',
    csr_index='CSR Index',
    logmin_n_data_breaches='Log (min=1) Number of Data Breaches',
    sqrt_age_in_years='Square Root Age',
)

# Keep only the correlation columns whose name does not contain 'dummy'.
sc_corr_table_columns = list(
    filter(lambda name: 'dummy' not in name, corr_columns)
)

df_sc_correlation_table = scientific_correlation_table(
    dataf=df_model[sc_corr_table_columns],
    index_renames=new_index_names,
)

In [None]:
# Display the assembled correlation table as the cell output.
df_sc_correlation_table

In [None]:
# Write the correlation table to an Excel workbook, on a sheet named 'table'.
with pd.ExcelWriter(
    path='../data/modeloutput/scientific_correlation_table.xlsx',
    mode='w',
) as writer:
    df_sc_correlation_table.to_excel(excel_writer=writer, sheet_name='table')