In [None]:
import csv
import string
import re
import pandas as pd
from collections import Counter
from datetime import datetime
from src import common, string_matching

In [None]:
# Load the full Fortune table and keep the analysis sample: firms ranked in the
# top 300 that are flagged for inclusion. The helper columns used for the
# selection are dropped afterwards.
dffortune = pd.read_csv('../data/fortune/f500_full_firm_data.csv')
dffortune_sample = (
    dffortune
    .loc[lambda x: (x['ranking'] <= 300) & (x['include'] == True)]
    .drop(columns=['include', 'ranklabel'])
    .reset_index(drop=True)
)

# 2019 reputation scores entered by hand. Keys must equal the `firm` label
# exactly (note the typographic apostrophe in "Lowe’s").
manual_scores_2019 = {
    'Adobe': 7.86,
    'HCA Healthcare': 6.89,
    'Prudential Financial (U.S.)': 6.47,
    'Lowe’s': 5.76
}

for firm_name, manual_score in manual_scores_2019.items():
    is_firm = dffortune_sample['firm'] == firm_name
    if not is_firm.any():
        # Without this message a label mismatch would silently skip the override.
        print(f'manual 2019 reputation score not applied, firm not found: {firm_name}')
    dffortune_sample.loc[is_firm, 'reputation_score_2019'] = manual_score

# Relative growth of the reputation score from 2019 to 2020, computed only for
# firms where both scores are present and not the '-' placeholder.
score_columns = ['reputation_score_2020', 'reputation_score_2019']
has_both_scores = (
    dffortune_sample[score_columns].notna().all(axis=1)
    & (dffortune_sample['reputation_score_2020'] != '-')
    & (dffortune_sample['reputation_score_2019'] != '-')
)
dffortune_sample_r_growth = dffortune_sample.loc[has_both_scores, score_columns].astype(float).round(5)
dffortune_sample_r_growth['reputation_score_growth'] = (
    (dffortune_sample_r_growth['reputation_score_2020'] - dffortune_sample_r_growth['reputation_score_2019'])
    / dffortune_sample_r_growth['reputation_score_2019']
)

# Index-aligned concat: firms without both scores get NaN growth.
dffortune_sample = pd.concat([dffortune_sample, dffortune_sample_r_growth[['reputation_score_growth']]], axis=1)

### privacy policy features (alg)

In [None]:
# Algorithmically extracted privacy-policy features. The `firm` label is
# dropped and every column except the `firmhash` key gets a `pp_` prefix.
df_pp_features = (
    pd.read_csv('../data/policies/features/firm_pp_features_0.2.0.csv')
    .drop(columns=['firm'])
    .rename(columns=lambda col: col if col == 'firmhash' else f'pp_{col}')
)

### number of data breaches

In [None]:
# Data-breach matches per firm. This frame is merged on `firmhash` further
# down without a matching step, so the file presumably already carries a
# `firmhash` column — TODO confirm.
df_prc_data_breaches = pd.read_csv('../data/breaches/prc_firm_data_breach_matches.csv')

### control 1 ==> firm age and stock turnover

In [None]:
# Control 1: date of incorporation (-> firm age) and stock turnover.
df_control_1 = pd.read_csv('../data/dbs/date_of_incorporation_and_stock_turnover.csv')

# Stock turnover is stored as text with a decimal comma and the placeholders
# 'n.a.' / 'n.s.' for unavailable values. The placeholders are mapped to NaN
# explicitly: `Series.replace(value, None)` forward-fills the previous row on
# older pandas versions instead of producing a missing value.
df_control_1['stock_turnover'] = (
    df_control_1['stock_turnover']
    .str.replace(',', '.', regex=False)
    .replace(['n.a.', 'n.s.'], float('nan'))
    .astype(float)
)

df_control_1 = common.column_to_date(df_control_1, 'date_of_incorporation')
df_control_1 = df_control_1.rename(columns={'last_available_year': 'stock_turnover_last_available_year'})

# Firm age in years, rounded to one decimal. A year is taken as 365.25 days
# (the previous divisor 364.24 overstated every age). The value is relative to
# the time the cell is executed.
df_control_1['age_in_years'] = (
    (pd.Timestamp.now() - pd.to_datetime(df_control_1['date_of_incorporation'])).dt.days / 365.25
).round(1)

# match_firm_hash returns three values; the second one is reported as the
# number of unmatched firms, the third is not used here.
df_control_1, n_no_match, _unused = string_matching.match_firm_hash(dffortune_sample, df_control_1)
print(f'n no match: {n_no_match}')

### control 2 ==> roa

In [None]:
# Control 2: return on assets. Only the firm label and the ROA value are kept.
# (The frame name keeps its original spelling because the blend cell refers to it.)
df_contron_2 = pd.read_csv('../data/dbs/return_on_assets.csv')[['firm', 'return_on_assets']].copy()

# ROA is stored as text with a decimal comma; convert to float.
df_contron_2['return_on_assets'] = (
    df_contron_2['return_on_assets']
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# The second return value is reported as the number of unmatched firms; it is
# bound to a descriptive name instead of `_`, which IPython uses for the last output.
df_contron_2, n_no_match, _unused = string_matching.match_firm_hash(dffortune_sample, df_contron_2)
print(f'n no match: {n_no_match}')

### control 3 ==> csr score

In [None]:
# CSR ratings; the generic year column is renamed so it stays identifiable
# after the frames are blended.
df_csr_index = (
    pd.read_csv('../data/msci_kld/msci_kld_social_ratings.csv')
    .rename(columns={'last_available_year': 'csr_index_last_available_year'})
)

### employee ratings

In [None]:
# Employee satisfaction ratings: drop the industry/sector labels and normalise
# the column names with the project helper before matching on the firm hash.
df_employee = (
    pd.read_csv('../data/dbs/employee_satisfaction_glassdoor.csv')
    .drop(columns=['Industry', 'Sector'])
)
df_employee.columns = [common.to_clean_string(col) for col in df_employee.columns]

# The second return value is reported as the number of unmatched firms; it is
# bound to a descriptive name instead of `_`, which IPython uses for the last output.
df_employee, n_no_match, _unused = string_matching.match_firm_hash(dffortune_sample, df_employee)
print(f'n no match: {n_no_match}')

### privacy policy features (manuals)

In [None]:
# Manually collected information-governance / privacy-policy features.
df_ig_manuals = pd.read_csv('../data/dbs/information_governance_practises_manuals.csv')

# Prefer the corrected privacy-policy URL and fall back to the original one.
df_ig_manuals['privacy_policy_url'] = (
    df_ig_manuals['Privacy Policy URL corrected'].fillna(df_ig_manuals['Privacy Policy URL'])
)

# Drop bookkeeping columns ('Collector ' carries a trailing space in the file)
# and the two source URL columns, then normalise the remaining column names.
df_ig_manuals = df_ig_manuals.drop(
    columns=['Industry', 'Sector', 'Collector ', 'Comment', 'Privacy Policy URL', 'Privacy Policy URL corrected']
)
df_ig_manuals.columns = [common.to_clean_string(col) for col in df_ig_manuals.columns]

# The second return value is reported as the number of unmatched firms; it is
# bound to a descriptive name instead of `_`, which IPython uses for the last output.
df_ig_manuals, n_no_match, _unused = string_matching.match_firm_hash(dffortune_sample, df_ig_manuals)
print(f'n no match: {n_no_match}')

# Prefix every column except the `firmhash` key with `pp_`.
df_ig_manuals.columns = [col if col == 'firmhash' else f'pp_{col}' for col in df_ig_manuals.columns]

### iss proposals

In [None]:
# ISS shareholder proposals: normalise column names, parse the meeting date
# and expose the company name under the common `firm` label.
df_iss_share_pros = pd.read_csv('../data/dbs/iss_shareholder_proposals.csv')
df_iss_share_pros.columns = [common.to_clean_string(col) for col in df_iss_share_pros.columns]
df_iss_share_pros = common.column_to_date(df_iss_share_pros, 'meeting_date')
df_iss_share_pros = df_iss_share_pros.rename(columns={'company_name': 'firm'})

# Proposals per firm.
# NOTE(review): `count()` only counts rows where `other_status` is non-null; if
# that column can be empty the number of proposals is understated and
# `groupby('firm').size()` would be the safer choice — confirm against the data.
df_iss_share_pros_count = (
    df_iss_share_pros
    .groupby('firm', as_index=False)[['other_status']]
    .count()
    .rename(columns={'other_status': 'number_of_shareholder_proposals'})
)

# The second return value is reported as the number of unmatched firms; it is
# bound to a descriptive name instead of `_`, which IPython uses for the last output.
df_iss_share_pros_count, n_no_match, _unused = string_matching.match_firm_hash(dffortune_sample, df_iss_share_pros_count)
print(f'n no match: {n_no_match}')

### long-short term investors

In [None]:
# Investor holdings flagged as long-term (`longterm` == 1) or short-term (== 0).
df_investors_shortlong = pd.read_csv('../data/dbs/long_short_term_investments.csv')
df_investors_shortlong.columns = [common.to_clean_string(col) for col in df_investors_shortlong.columns]

# Split the share into one column per investor type; rows of the other type
# stay NaN and therefore do not contribute to the per-firm sums below.
df_investors_shortlong['share_shortterm_investors'] = (
    df_investors_shortlong['share_percent'].where(df_investors_shortlong['longterm'] == 0)
)
df_investors_shortlong['share_longterm_investors'] = (
    df_investors_shortlong['share_percent'].where(df_investors_shortlong['longterm'] == 1)
)

# Total short-term and long-term share per firm.
df_shortlong_summed = (
    df_investors_shortlong
    .groupby('firm', as_index=False)[['share_shortterm_investors', 'share_longterm_investors']]
    .sum()
)

# The second return value is reported as the number of unmatched firms; it is
# bound to a descriptive name instead of `_`, which IPython uses for the last output.
df_shortlong_summed, n_no_match, _unused = string_matching.match_firm_hash(dffortune_sample, df_shortlong_summed)
print(f'n no match: {n_no_match}')

In [None]:
# Blend all feature frames onto the Fortune sample. Every merge is a left join
# on `firmhash`, so each firm of the sample is kept even when a source has no
# row for it.
df_blended = (
    dffortune_sample
    .merge(df_control_1, on='firmhash', how='left')
    .merge(df_contron_2, on='firmhash', how='left')
    .merge(df_pp_features, on='firmhash', how='left')
    .merge(df_ig_manuals, on='firmhash', how='left')
    .merge(df_prc_data_breaches, on='firmhash', how='left')
    .merge(df_csr_index, on='firmhash', how='left')
    .merge(df_employee, on='firmhash', how='left')
    .merge(df_iss_share_pros_count, on='firmhash', how='left')
    .merge(df_shortlong_summed, on='firmhash', how='left')
)

In [None]:
# Display the column labels of the blended frame for a quick visual check.
df_blended.columns

### checks

In [None]:
# Normalise the free-text legislation field: trim whitespace and lower-case.
df_blended['pp_legislation_complied_with'] = (
    df_blended['pp_legislation_complied_with']
    .str.strip()
    .str.lower()
)

# Year columns hold NaN for firms without a match; -99 is used as the
# missing-value marker so the columns can be stored as integers.
for year_column in ('stock_turnover_last_available_year', 'csr_index_last_available_year'):
    df_blended[year_column] = df_blended[year_column].fillna(-99).astype(int)

**Note:** manual add-ons to the reputation scores (`manual_scores_2019`) are applied in the Fortune data cell at the top of the notebook.

### save result

In [None]:
# Write the blended dataset as the model input file: no index column, and
# every non-numeric field wrapped in double quotes.
df_blended.to_csv(
    '../data/modelinput/information_governance_full_dataset.csv',
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)