In [1]:
import csv
import pandas as pd
from src import common, features

In [2]:
# Columns handed to common._filter_out_nulls in the filtering cell; the log
# printed there reports, per column, how many rows were filtered out.
isnull_columns = [
    'reputation_score_2020',
    'revenue_in_millions', 'profits_in_millions',
    'age_in_years', 'return_on_assets',
    'csr_index',
]

# Column -> threshold mapping handed to common._filter_on_mins.
# NOTE(review): presumably rows whose value is below the threshold are
# dropped -- confirm against src.common.
min_columns = dict(
    pp_privacy_policy_exists=1,
    pp_n_sentence=1,
    n_employees=1,
    csr_index_last_available_year=2017,
)

# Specs passed to features.create_composite_variable (called with
# method='minmax' further down). Each spec carries the composite's name --
# the same names appear in initial_model_columns -- and a mapping of input
# feature -> {'type', 'best_value'}.
# NOTE(review): 'continues' looks like a misspelling of 'continuous'; it is
# kept verbatim because src.features presumably matches on this exact
# string -- confirm before changing it.
ig_feature_relational_practises = dict(
    name='composite_relational_ig_practises',
    features={
        'pp_ambiquity_score': dict(type='continues', best_value=0),
        'pp_gunning_fog_index': dict(type='continues', best_value=0),
        'pp_contact_option': dict(type='binary', best_value=1),
        'pp_existence_of_a_transparency_report': dict(type='binary', best_value=1),
    },
)

# NOTE(review): the first feature is a 'dummy_' column, presumably produced by
# features.create_dummies from 'pp_legislation_complied_with'; its
# best_value of 0 is worth confirming with the analysis owner.
ig_feature_formal_practises = dict(
    name='composite_formal_ig_practises',
    features={
        'dummy_pp_legislation_complied_with_standard': dict(type='binary', best_value=0),
        'pp_third_party_sharing': dict(type='binary', best_value=0),
        'pp_existence_of_a_data_protection_officer': dict(type='binary', best_value=1),
        'pp_iso_type': dict(type='binary', best_value=1),
    },
)

# Columns passed to features.create_dummies in the filtering cell.
dummy_columns = ['pp_legislation_complied_with', 'sector']

# Candidate model variables; each one goes through the skewness screening
# loop, which decides whether the raw or a transformed version is exported.
initial_model_columns = [
    'reputation_score_2020',
    'composite_relational_ig_practises', 'composite_formal_ig_practises',
    'return_on_assets', 'n_employees', 'csr_index',
    'n_data_breaches', 'age_in_years',
]

# Columns always written to the export, ahead of the screened variables.
fixed_model_columns = ['firmhash', 'firm', 'sector']

# Sector dummy columns always written to the export.
# NOTE(review): presumably these are the names features.create_dummies
# generates for 'sector' -- confirm the prefix convention in src.features.
fixed_dummy_columns = [
    f'dummy_sector_{sector_name}'
    for sector_name in (
        'Health Care',
        'Technology',
        'Media',
        'Energy',
        'Financials',
        'Retailing',
        'Industrials',
        'Business Services',
        'Household Products',
        'Transportation',
        'Telecommunications',
    )
]

In [3]:
# Read the full input dataset; the path is relative to the notebook's
# working directory.
dfraw = pd.read_csv(
    filepath_or_buffer='../data/modelinput/information_governance_full_dataset.csv',
)

In [4]:
# Waterfall of the sample size: distinct 'firmhash' values left after each
# selection step, starting with the unfiltered data.
waterfall_on_selection = {'Raw sample': dfraw['firmhash'].nunique()}

# Step 1: null filter on isnull_columns (intermediate kept as dffiltering).
dffiltering = common._filter_out_nulls(dfraw, isnull_columns)

# Step 2: minimum-value filter on min_columns, then dummy columns for
# dummy_columns on the filtered result.
df_clean = features.create_dummies(
    common._filter_on_mins(dffiltering, min_columns),
    dummy_columns,
)

reputation_score_2020: 99 rows are filtered out
revenue_in_millions: 0 rows are filtered out
profits_in_millions: 0 rows are filtered out
age_in_years: 7 rows are filtered out
return_on_assets: 3 rows are filtered out
csr_index: 0 rows are filtered out
pp_privacy_policy_exists: 2 rows are filtered out
pp_n_sentence: 1 rows are filtered out
n_employees: 0 rows are filtered out
csr_index_last_available_year: 0 rows are filtered out


In [5]:
# Build both composite variables with the same 'minmax' method: relational
# practices first, formal practices second. Each call receives the frame
# returned by the previous one.
for _composite_spec in (ig_feature_relational_practises, ig_feature_formal_practises):
    df_clean = features.create_composite_variable(df_clean, _composite_spec, method='minmax')

In [8]:
# Flag firms at or below the median age: 1 when age_in_years <= median of
# age_in_years, otherwise 0 (integer column).
df_clean['young_firm'] = (
    df_clean['age_in_years']
    .le(df_clean['age_in_years'].median())
    .astype('int64')
)

In [11]:
# Skewness screening of the candidate model variables.
#
# For every column in initial_model_columns the loop decides which version
# (raw, log-transformed or sqrt-transformed) ends up in final_model_columns.
# A version is accepted only when its skew, as reported by
# features.data_checks, lies strictly between min_skew and max_skew.
final_model_columns = []
min_skew = -1
max_skew = 1


def _skew_out_of_bounds(check_result):
    """Return True when the reported skew is on or beyond min_skew / max_skew.

    check_result is the mapping returned by features.data_checks; only its
    'skew' entry is read.
    """
    skew = check_result['skew']
    return skew <= min_skew or skew >= max_skew


for c in initial_model_columns:
    print('==>')

    # Reset the per-column state on every iteration. Previously include_col
    # and datacheckresult were only assigned for columns with more than two
    # distinct values, so any other column silently reused the previous
    # column's result (appending that previous column a second time) or
    # raised NameError when it came first.
    include_col = c

    if len(df_clean[c].unique()) <= 2:
        # Columns with at most two distinct values are not skew-screened or
        # transformed; they are passed through unchanged.
        # NOTE(review): pass-through is the chosen fix for the stale-state
        # bug; switch to skipping the column if that was the intent.
        print(f'{c}: 2 or fewer distinct values, included without skew check')
        final_model_columns.append(include_col)
        continue

    datacheckresult = features.data_checks(df_clean, c, print_results=True)

    # First remedy: log transform. Columns containing values <= 0 use the
    # variant with lowest_value_before_transform=1 (presumably shifting the
    # data before taking the log -- confirm in src.features).
    if _skew_out_of_bounds(datacheckresult):
        if df_clean[c].min() <= 0:
            include_col = f'logmin_{c}'
            df_clean[include_col] = features.log_transform(df_clean[c].values, lowest_value_before_transform=1)
        else:
            include_col = f'log_{c}'
            df_clean[include_col] = features.log_transform(df_clean[c].values)
        datacheckresult = features.data_checks(df_clean, include_col, print_results=True)

    # Second remedy, only if the log version is still out of bounds: square
    # root of the original column, which requires non-negative values.
    if _skew_out_of_bounds(datacheckresult):
        if df_clean[c].min() >= 0:
            include_col = f'sqrt_{c}'
            df_clean[include_col] = features.sqrt_transform(df_clean[c].values)
            datacheckresult = features.data_checks(df_clean, include_col, print_results=True)

    # Keep whichever version was checked last, provided its skew is strictly
    # inside the bounds; otherwise the variable is dropped from the model.
    if datacheckresult['skew'] > min_skew and datacheckresult['skew'] < max_skew:
        final_model_columns.append(include_col)
    else:
        print(f'Not included: {c}')

==>
reputation_score_2020: mean = 6.73; std = 0.64; kurtosis = 0.28; skew = 0.36; p value normal test = 0.188
==>
composite_relational_ig_practises: mean = 0.47; std = 0.11; kurtosis = 2.4; skew = 0.73; p value normal test = 0.0***
==>
composite_formal_ig_practises: mean = 0.33; std = 0.18; kurtosis = -0.26; skew = 0.09; p value normal test = 0.852
==>
return_on_assets: mean = 0.05; std = 0.05; kurtosis = 0.96; skew = 0.15; p value normal test = 0.125
==>
n_employees: mean = 115950.83; std = 237868.43; kurtosis = 53.09; skew = 6.76; p value normal test = 0.0***
log_n_employees: mean = 11.01; std = 1.04; kurtosis = 0.58; skew = 0.46; p value normal test = 0.048*
==>
csr_index: mean = 0.04; std = 0.03; kurtosis = 0.62; skew = 0.56; p value normal test = 0.018*
==>
n_data_breaches: mean = 1.34; std = 2.06; kurtosis = 4.39; skew = 2.13; p value normal test = 0.0***
logmin_n_data_breaches: mean = 0.58; std = 0.68; kurtosis = -0.28; skew = 0.86; p value normal test = 0.002**
==>
age_in_years

In [12]:
# Export layout: identifier columns, the screened model variables, the
# sector dummies and finally the young_firm flag.
output_col = fixed_model_columns + final_model_columns + fixed_dummy_columns + ['young_firm']

# Write the selected columns without the index; QUOTE_NONNUMERIC wraps every
# non-numeric field in double quotes.
df_clean[output_col].to_csv(
    '../data/modelinput/information_governance_clean_dataset.csv',
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)

# Record how many distinct firms remain in the cleaned data.
waterfall_on_selection.update(
    {'With other variables available': df_clean['firmhash'].nunique()}
)