In [None]:
import csv
import pandas as pd
from src import common, features

In [None]:
# Columns handed to common._filter_out_nulls further down
# (presumably rows with a null in any of these are dropped -- confirm in src/common).
isnull_columns = [
    'reputation_score_2020', 'revenue_in_millions', 'profits_in_millions',
    'age_in_years', 'return_on_assets', 'csr_index',
]

# Column -> threshold mapping handed to common._filter_on_mins further down
# (presumably a minimum allowed value per column -- confirm in src/common).
min_columns = dict(
    pp_privacy_policy_exists=1,
    pp_n_sentence=1,
    n_employees=1,
    csr_index_last_available_year=2017,
)

def _component(kind, best_value):
    """Return the spec dict ({'type', 'best_value'}) for one component feature."""
    return {'type': kind, 'best_value': best_value}


# Specs passed to features.create_composite_variable further down.
# NOTE(review): the spellings 'continues' and 'pp_ambiquity_score' are runtime
# strings read by code/data outside this cell, so they are kept exactly as-is.
ig_feature_relational_practises = {
    'name': 'composite_relational_ig_practises',
    'features': {
        'pp_ambiquity_score': _component('continues', 0),
        'pp_gunning_fog_index': _component('continues', 0),
        'pp_contact_option': _component('binary', 1),
        'pp_existence_of_a_transparency_report': _component('binary', 1),
    },
}

ig_feature_formal_practises = {
    'name': 'composite_formal_ig_practises',
    'features': {
        'dummy_pp_legislation_complied_with_standard': _component('binary', 0),
        'pp_third_party_sharing': _component('binary', 0),
        'pp_existence_of_a_data_protection_officer': _component('binary', 1),
        'pp_iso_type': _component('binary', 1),
    },
}

# Columns passed to features.create_dummies further down.
dummy_columns = [
    'pp_legislation_complied_with',
    'sector',
]

# Candidate model variables; each one goes through the skew check loop below.
initial_model_columns = [
    'reputation_score_2020',
    'composite_relational_ig_practises', 'composite_formal_ig_practises',
    # disabled alternatives:
    #     'pca_composite_relational_ig_practises',
    #     'pca_composite_formal_ig_practises',
    'return_on_assets', 'n_employees', 'csr_index',
    'n_data_breaches', 'age_in_years',
]

# Columns that are always written to the clean output, untransformed.
fixed_model_columns = ['firmhash', 'firm', 'sector']
    
# Sector dummy columns written to the clean output, in this exact order.
# (The order intentionally differs from `sectors_to_include`; it determines
# the column order of the exported CSV.)
fixed_dummy_columns = [
    'dummy_sector_' + sector_name
    for sector_name in (
        'Health Care',
        'Technology',
        'Media',
        'Energy',
        'Financials',
        'Retailing',
        'Industrials',
        'Business Services',
        'Household Products',
        'Transportation',
        'Telecommunications',
    )
]

# Sectors used when counting the sector-restricted sample.
sectors_to_include = [
    'Business Services', 'Financials', 'Energy', 'Retailing',
    'Technology', 'Media', 'Health Care', 'Transportation',
    'Industrials', 'Household Products', 'Telecommunications',
]

In [None]:
# Load the four input tables (read in the order listed):
#   dfraw       - full information-governance dataset
#   dfwmab_2020 - reputation file for 2020
#   dfwmab_2019 - reputation file for 2019
#   dffortune   - firm-level data (provides `firmhash` and `sector`)
dfraw, dfwmab_2020, dfwmab_2019, dffortune = map(
    pd.read_csv,
    (
        '../data/modelinput/information_governance_full_dataset.csv',
        '../data/fortune/f500_reputation_2020.csv',
        '../data/fortune/f500_reputation_2019.csv',
        '../data/fortune/f500_full_firm_data.csv',
    ),
)

In [None]:
# Keep only the 2020 reputation rows whose company also appears in the 2019 file.
in_both_years = dfwmab_2020['company'].isin(dfwmab_2019['company'].unique())
dfwmab = dfwmab_2020.loc[in_both_years].copy()

# Derive the join key from the company name, then attach each firm's sector.
# Left join: firms without a match in dffortune are kept with a missing sector.
dfwmab['firmhash'] = list(map(common.__hash, dfwmab['company'].values))
sector_lookup = dffortune[['firmhash', 'sector']]
dfwmab = dfwmab.merge(sector_lookup, how='left', on='firmhash')

In [None]:
# Number of rows in the two-year reputation sample.
n_initial_firms = len(dfwmab)
print(f'initial sample size: {n_initial_firms}')

In [None]:
# Number of distinct sectors in the initial sample.
# `sector` comes from a left merge, so firms without a match carry NaN there.
# Series.unique() would report that NaN as an extra "sector"; nunique()
# ignores missing values and counts only real sectors.
n_initial_sectors = dfwmab['sector'].nunique()
print(f'initial sample n sectors: {n_initial_sectors}')

In [None]:
# Sample size after restricting to the sectors listed in `sectors_to_include`
# (displayed as the cell output).
in_included_sector = dfwmab['sector'].isin(sectors_to_include)
len(dfwmab[in_included_sector])

In [None]:
# Number of distinct sectors in the sector-restricted ("stage 2") sample.
# Fixes two problems in the previous version:
#   * it indexed dfwmab["Sector"], but the merged column is lowercase 'sector';
#   * it counted sectors on the unrestricted frame.
# NOTE(review): "stage 2" is taken to mean firms whose sector is in
# `sectors_to_include` -- confirm this matches the intended definition.
stage2_sectors = dfwmab.loc[dfwmab['sector'].isin(sectors_to_include), 'sector']
print(f'stage 2 sample n sectors: {stage2_sectors.nunique()}')

In [None]:
# Track how many unique firms remain after each selection step.
waterfall_on_selection = {'Raw sample': dfraw['firmhash'].nunique()}

# Apply the null filter, then the minimum-value filter, then add the dummy
# columns for the categorical variables in `dummy_columns`.
dffiltering = common._filter_out_nulls(dfraw, isnull_columns)
df_clean = features.create_dummies(
    common._filter_on_mins(dffiltering, min_columns),
    dummy_columns,
)

In [None]:
# Sanity check: every row should belong to a different firm.
n_rows_clean = len(df_clean)
if df_clean['firmhash'].nunique() != n_rows_clean:
    print('check duplicates!')
else:
    print(f'n firms final sample: {n_rows_clean}')

In [None]:
# Build both composite variables (relational first, then formal) using the
# 'minmax' method of features.create_composite_variable.
for composite_spec in (ig_feature_relational_practises, ig_feature_formal_practises):
    df_clean = features.create_composite_variable(df_clean, composite_spec, method='minmax')

In [None]:
# Flag firms whose age is at or below the sample median (1 = young, 0 = otherwise;
# rows with a missing age compare as False and therefore get 0).
median_age = df_clean['age_in_years'].median()
df_clean['young_firm'] = (df_clean['age_in_years'] <= median_age).astype('int64')
print(f'Median firm age: {round(median_age, 5)}')

In [None]:
# Decide, per candidate column, which version goes into the model.
#
# For every column with more than two distinct values the skew reported by
# features.data_checks is inspected. While it lies outside the open interval
# (min_skew, max_skew) the following transforms are tried in turn, each one
# applied to the ORIGINAL column: log -> sqrt -> sigmoid. The first version
# whose skew falls inside the interval is kept; if none does, the column is
# reported as not included.
final_model_columns = []
min_skew = -1
max_skew = 1


def _skew_out_of_range(check_result):
    """Return True when check_result['skew'] is <= min_skew or >= max_skew."""
    return check_result['skew'] <= min_skew or check_result['skew'] >= max_skew


for c in initial_model_columns:
    print(f'==> {c}')

    if len(df_clean[c].unique()) <= 2:
        # Fix: columns with at most two distinct values skip the skew checks.
        # Previously `include_col` / `datacheckresult` were left over from the
        # preceding iteration here (or undefined on the first one), so the
        # wrong column name could be appended or a NameError raised.
        # NOTE(review): such columns are now included unchanged -- confirm
        # that this is the intended treatment.
        final_model_columns.append(c)
        continue

    include_col = c
    datacheckresult = features.data_checks(df_clean, c, print_results=True)

    # Disabled alternative (kept for reference):
    #     if datacheckresult['skew'] <= min_skew or datacheckresult['skew'] >= max_skew:
    #         include_col = f'zscore_{c}'
    #         df_clean[include_col] = features.zscore_standardization(df_clean[c].values)
    #         datacheckresult = features.data_checks(df_clean, include_col, print_results=True)

    # 1) log transform; when the column has values <= 0 the transform is
    #    called with lowest_value_before_transform=1.
    if _skew_out_of_range(datacheckresult):
        if df_clean[c].min() <= 0:
            include_col = f'logmin_{c}'
            df_clean[include_col] = features.log_transform(df_clean[c].values, lowest_value_before_transform=1)
        else:
            include_col = f'log_{c}'
            df_clean[include_col] = features.log_transform(df_clean[c].values)
        datacheckresult = features.data_checks(df_clean, include_col, print_results=True)

    # 2) square-root transform of the original column.
    if _skew_out_of_range(datacheckresult):
        include_col = f'sqrt_{c}'
        df_clean[include_col] = features.sqrt_transform(df_clean[c].values)
        datacheckresult = features.data_checks(df_clean, include_col, print_results=True)

    # 3) sigmoid transform of the original column.
    if _skew_out_of_range(datacheckresult):
        include_col = f'sigmoid_{c}'
        df_clean[include_col] = features.sigmoid_transform(df_clean[c].values)
        datacheckresult = features.data_checks(df_clean, include_col, print_results=True)

    # Explicit in-range test (not `not _skew_out_of_range(...)`) so that a NaN
    # skew still leads to exclusion, as before.
    if min_skew < datacheckresult['skew'] < max_skew:
        final_model_columns.append(include_col)
    else:
        print(f'Not included: {c}')

In [None]:
# Columns of the clean model-input file, in output order:
# identifiers, selected model variables, sector dummies, young-firm flag.
output_col = fixed_model_columns + final_model_columns + fixed_dummy_columns + ['young_firm']

# Both files are written with the same CSV settings.
csv_options = dict(index=False, quoting=csv.QUOTE_NONNUMERIC, quotechar='"')

# Full frame, including every intermediate/transformed column.
df_clean.to_csv('../data/modelinput/information_governance_full_dataset_raw.csv', **csv_options)

# Model-ready subset.
df_clean[output_col].to_csv('../data/modelinput/information_governance_clean_dataset.csv', **csv_options)

waterfall_on_selection['With other variables available'] = df_clean['firmhash'].nunique()