In [None]:
import csv
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib
import matplotlib.pyplot as plt
# Apply the ggplot theme to every figure, then override the default figure size.
plt.style.use("ggplot")
plt.rcParams.update({"figure.figsize": [12, 8]})

In [None]:
# Columns that must hold a non-null, non-zero value; passed to _filter_out_nulls.
isnull_columns = [
    "revenue_in_millions",
    "profits_in_millions",
    "age_in_years",
    "return_on_assets",
    "overall_employer_rating",
]

# Column -> exclusive lower bound; _filter_on_mins keeps a row only when the
# value is strictly greater than the bound.
min_columns = dict(pp_n_sentence=1, n_employees=0)

# Columns that create_dummies expands into one indicator column per distinct value.
dummy_columns = ["pp_legislation_complied_with"]

# Columns used for the descriptive statistics, the correlation heatmap and the
# Pearson tests. The order is significant: it drives print and axis order.
corr_columns = (
    # firm-level columns
    [
        "revenue_in_millions",
        "profits_in_millions",
        "return_on_assets",
        "n_employees",
        "age_in_years",
    ]
    # `pp_`-prefixed columns; the three `dummy_*` names are the ones
    # create_dummies generates for the values 'standard', 'ccpa' and 'gdpr'
    + [
        "pp_n_sentence",
        "pp_number_of_words",
        "pp_number_of_unique_words",
        "pp_existence_of_a_transparency_report",
        "pp_contact_option",
        "dummy_pp_legislation_complied_with_standard",
        "dummy_pp_legislation_complied_with_ccpa",
        "dummy_pp_legislation_complied_with_gdpr",
        "pp_third_party_sharing",
        "pp_existence_of_a_data_protection_officer",
        "pp_iso_type",
        "pp_ambiquity_score",
        "pp_gunning_fog_index",
    ]
    # remaining columns
    + [
        "n_data_breaches",
        "overall_employer_rating",
        "reputation_score_2020",
        "reputation_score_2019",
        "reputation_score_growth",
    ]
)

def _filter_out_nulls(dataf, filterlist):
    """"""
    
    for c in filterlist:
        n_rows_before = len(dataf)
        dataf = dataf.loc[lambda x: ~x[c].isnull()]
        dataf = dataf.loc[lambda x: x[c]!=0]
        n_rows_filtered = n_rows_before - len(dataf)
        print(f'{c}: {n_rows_filtered} rows are filtered out')
        
    return dataf

def _filter_on_mins(dataf, filterdict):
    """"""
    
    for k, v in min_columns.items():
        n_rows_before = len(dataf)
        dataf = dataf.loc[lambda x: ~x[k].isnull()]
        dataf = dataf.loc[lambda x: x[k]>v]
        n_rows_filtered = n_rows_before - len(dataf)
        print(f'{k}: {n_rows_filtered} rows are filtered out')
        
    return dataf

def create_dummies(dataf, columns):
    """Add one indicator column per distinct non-missing value of each column.

    For a source column ``c`` and value ``d`` the new column is named
    ``dummy_{c}_{d}``. It holds 1 where ``c == d``, 0 where ``c`` has another
    value, and a missing value where ``c`` itself is missing.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Input frame. It is left untouched; the dummies are added to a copy, so
        passing a filtered slice of another frame is safe.
    columns : iterable of str
        Names of the columns to expand.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dataf`` with the indicator columns appended. An indicator is
        integer-typed when its source column has no missing values and
        float-typed (with NaN) otherwise.
    """
    result = dataf.copy()

    for c in columns:
        source = result[c]
        present = source.notnull()
        has_missing = not present.all()
        # dropna(): a missing value is not a category of its own, so no
        # `dummy_{c}_nan` column is generated.
        for d in source.dropna().unique():
            indicator = (present & (source == d)).astype(int)
            if has_missing:
                # Propagate missingness from the source column instead of
                # reporting a misleading 0.
                indicator = indicator.astype(float).where(present)
            result[f'dummy_{c}_{d}'] = indicator

    return result

In [None]:
# Load the full dataset; every later cell derives its frames from `dfraw`.
dfraw = pd.read_csv(
    filepath_or_buffer='../data/modelinput/information_governance_full_dataset.csv'
)

In [None]:
# Show the dtype pandas assigned to each column of the raw frame.
dfraw.dtypes

In [None]:
# Selection waterfall: number of distinct firms (by `firmhash`) left after each step.
waterfall_on_selection = {'n_firms_sample': dfraw['firmhash'].nunique()}

# Step 1 - a 2020 reputation score is present.
df1 = dfraw[dfraw['reputation_score_2020'].notnull()]
waterfall_on_selection['n_firms_reputation_2020'] = df1['firmhash'].nunique()

# Step 2 - a 2019 reputation score is present as well.
df2 = df1[df1['reputation_score_2019'].notnull()]
waterfall_on_selection['n_firms_reputation_2019'] = df2['firmhash'].nunique()

# Step 3 - the privacy-policy existence flag equals 1.
df3 = df2[df2['pp_privacy_policy_exists'] == 1]
waterfall_on_selection['n_firms_pp_existence'] = df3['firmhash'].nunique()

# Step 4 - column-level checks (non-null / non-zero, then minimum values),
# followed by the indicator columns for the categorical variables.
df4 = _filter_out_nulls(dataf=df3, filterlist=isnull_columns)
df_clean = _filter_on_mins(dataf=df4, filterdict=min_columns)
df_clean = create_dummies(dataf=df_clean, columns=dummy_columns)

# Persist the cleaned sample; QUOTE_NONNUMERIC quotes every non-numeric field.
df_clean.to_csv(
    '../data/modelinput/information_governance_clean_dataset.csv',
    index=False,
    quoting=csv.QUOTE_NONNUMERIC,
    quotechar='"',
)

waterfall_on_selection['n_firms_others_checked'] = df_clean['firmhash'].nunique()

# Last expression of the cell: display the waterfall counts.
waterfall_on_selection

In [None]:
# Bar chart of the selection waterfall: one bar per step, in insertion order.
plt.bar(
    x=list(waterfall_on_selection.keys()),
    height=list(waterfall_on_selection.values()),
    color='#333',
)
# Step names are long, so rotate the tick labels to keep them readable.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Horizontal box plot of `pp_n_sentence` on the cleaned sample.
# Earlier variant kept for reference (df4, restricted to values below 1000):
# df4.loc[lambda x: x['pp_n_sentence']<1000][['pp_n_sentence']].boxplot()
df_clean.boxplot(column=['pp_n_sentence'], vert=False)

In [None]:
# Horizontal box plot of `pp_number_of_words` on the cleaned sample.
# Earlier variant kept for reference (df4, restricted to values below 20000):
# df4.loc[lambda x: x['pp_number_of_words']<20000][['pp_number_of_words']].boxplot(vert=False)
df_clean.boxplot(column=['pp_number_of_words'], vert=False)

In [None]:
# Mean and standard deviation of every non-binary column.
# nunique() ignores missing values, so a 0/1 column that also contains NaN is
# still recognised as binary; len(unique()) counted NaN as a third value and
# reported such columns here as if they were continuous.
for c in corr_columns:
    if df_clean[c].nunique() > 2:
        mean = round(df_clean[c].mean(), 2)
        std = round(df_clean[c].std(), 2)
        print(f'{c}: mean = {mean}; std = {std}')

In [None]:
# Group sizes of every binary column, counted as non-null `firm` entries per level.
# nunique() ignores missing values, so a 0/1 column that also contains NaN is
# included; len(unique()) counted NaN as a third value and skipped such columns.
for c in corr_columns:
    if df_clean[c].nunique() == 2:
        print(df_clean.groupby(c)['firm'].count())

In [None]:
# Widen the default figure (this also applies to later cells) so the annotated
# correlation matrix stays legible.
plt.rcParams.update({"figure.figsize": [20, 8]})

# Pairwise correlations of the analysis columns, colour scale fixed to [-1, 1].
heatmap = sns.heatmap(
    data=df_clean[corr_columns].corr(),
    vmin=-1,
    vmax=1,
    annot=True,
)
# The trailing semicolon suppresses the text repr of the returned title object.
heatmap.set_title(label='Correlation Heatmap', fontdict=dict(fontsize=12), pad=12);

In [None]:
# Pearson correlation of every analysis column with the 2020 reputation score,
# flagged with the usual significance stars (*** <= 0.001, ** <= 0.01, * <= 0.05).
for c in corr_columns:
    # Use only rows where both series are populated, in line with the
    # pairwise-complete handling of DataFrame.corr() used for the heatmap.
    # Without this, a NaN in either series would yield a nan coefficient
    # (or an error, depending on the SciPy version).
    complete = df_clean[c].notnull() & df_clean['reputation_score_2020'].notnull()
    if complete.sum() < 2:
        # pearsonr needs at least two observations.
        print(f'{c} --> reputation_score_2020 => skipped: fewer than 2 complete observations')
        continue

    result = stats.pearsonr(
        df_clean.loc[complete, c],
        df_clean.loc[complete, 'reputation_score_2020'],
    )
    if result[1] <= 0.001:
        sig = '***'
    elif result[1] <= 0.01:
        sig = '**'
    elif result[1] <= 0.05:
        sig = '*'
    else:
        sig = ''
    print(f'{c} --> reputation_score_2020 => pearsonr coefficient: {round(result[0], 3)}{sig}; p = {round(result[1], 3)}')