In [185]:
import json
from datetime import datetime
import pandas as pd
import itertools
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [21]:
# Load the cleaned modelling dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv('../data/modelinput/information_governance_clean_dataset.csv')

In [171]:
# Regression target. The commented-out alternative is kept so the target can
# be switched by toggling the two lines.
y_column = 'reputation_score_2020'
# y_column = 'reputation_score_growth'
# Candidate predictors handed to the exhaustive subset search below.
# Commented-out entries are currently excluded; every entry that is enabled
# doubles the number of models the search has to fit.
x_columns = [
    'revenue_in_millions',
    'profits_in_millions',
#     'return_on_assets',
#     'n_employees',
#     'age_in_years',
#     'pp_n_sentence',
#     'pp_number_of_words',
#     'pp_number_of_unique_words',
#     'pp_existence_of_a_transparency_report',
#     'pp_contact_option',
#     'dummy_pp_legislation_complied_with_standard',
#     'dummy_pp_legislation_complied_with_ccpa',
#     'dummy_pp_legislation_complied_with_gdpr',
#     'pp_third_party_sharing',
#     'pp_existence_of_a_data_protection_officer',
#     'pp_iso_type',
#     'pp_ambiquity_score',
#     'pp_gunning_fog_index',
#     'n_data_breaches',
]

In [173]:
def _model(dataf, x_columns, y_column):
    """Fit an OLS regression of ``y_column`` on standardized ``x_columns``.

    Every predictor is z-scored with ``StandardScaler`` (fitted on the same
    rows it transforms) and a column of ones named ``'constant'`` is appended
    to act as the intercept.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame holding both the predictor columns and the target column.
    x_columns : list of str
        Names of the predictor columns to include in the model.
    y_column : str
        Name of the target column.

    Returns
    -------
    The fitted results object returned by ``sm.OLS(...).fit()``.
    """

    scaler = StandardScaler()
    # fit_transform returns a bare array, so the rebuilt frame must be given
    # the caller's index explicitly. Without it the predictors get a fresh
    # 0..n-1 index while the target keeps dataf's index, and statsmodels
    # refuses pandas inputs whose indices differ (e.g. after filtering rows).
    df_X_train = pd.DataFrame(
        scaler.fit_transform(dataf[x_columns]),
        columns=x_columns,
        index=dataf.index,
    )
    df_X_train['constant'] = 1

    df_y_train = dataf[[y_column]]
    model = sm.OLS(df_y_train, df_X_train).fit()

    return model

def list_combination(x_columns):
    """Return every non-empty subset of ``x_columns`` as a list of lists.

    Subsets are ordered by size (singletons first, the full set last) and,
    within one size, in the order produced by ``itertools.combinations``.
    An empty input yields an empty list.
    """
    subset_sizes = range(1, len(x_columns) + 1)
    return [
        list(subset)
        for size in subset_sizes
        for subset in itertools.combinations(x_columns, size)
    ]
    

def run_feature_selection(dataf, x_columns, y_column):
    """Fit one OLS model per non-empty subset of ``x_columns`` and rank them.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame holding the predictor columns and the target column.
    x_columns : list of str
        Candidate predictors; every non-empty subset is tried, so the number
        of fits is ``2 ** len(x_columns) - 1``.
    y_column : str
        Name of the target column.

    Returns
    -------
    highest_r_adj : float
        Largest adjusted R^2 over all fits. ``0`` when no fit produced a
        comparable score (no candidates, or every score was NaN).
    best_features : list of str or None
        Feature subset that achieved ``highest_r_adj``; the first such subset
        wins ties. ``None`` when no fit produced a comparable score.
    results : dict
        Maps the running index of each subset to a dict with keys
        ``'rsquared_adj'``, ``'n_significant'`` and ``'features'``.
    """

    combinations = list_combination(x_columns)

    results = {}
    # Adjusted R^2 can be negative, so the running maximum has to start below
    # every attainable value; starting at 0 would leave best_features as None
    # whenever all candidate models score below zero.
    highest_r_adj = float('-inf')
    best_features = None
    for n, features in enumerate(combinations):
        model = _model(dataf, features, y_column)
        r_adj = model.rsquared_adj
        # A NaN score never compares greater, so degenerate fits are skipped.
        if r_adj > highest_r_adj:
            highest_r_adj = r_adj
            best_features = features
        results[n] = {
            'rsquared_adj': r_adj,
            # Counts every coefficient with p <= 0.05, including the
            # 'constant' intercept column that _model appends.
            'n_significant': len([p for p in model.pvalues.values if p <= 0.05]),
            'features': features
        }

    if best_features is None:
        # Nothing was selected: keep the historical (0, None, results) return
        # instead of leaking the -inf sentinel to callers.
        highest_r_adj = 0

    return highest_r_adj, best_features, results

In [174]:
# Exhaustive search: one model is fitted for every non-empty subset of
# x_columns (2 ** len(x_columns) - 1 fits), so the run time grows
# exponentially with the number of enabled features.
highest_r_adj, best_features, results = run_feature_selection(df, x_columns, y_column)

In [186]:
# Persist every fit to a timestamped JSON file so repeated runs do not
# overwrite each other. The integer keys of `results` are written as strings
# by json.dump.
with open(f'../data/modeloutput/feature_fits_{datetime.now().strftime("%Y%m%d_%H%M%S")}.json', 'w') as outfile:
    json.dump(results, outfile)

In [159]:
# Among the feature sets whose adjusted R^2 is at least 0.2, keep the one with
# the most features; the first set of that size wins ties.
count = 0  # not used by the visible cells; kept so the namespace is unchanged
max_lenght = 0
# Defined up front so the name exists (as None) even when no feature set
# reaches the threshold.
best_lenght_features = None
for k, v in results.items():
    if v['rsquared_adj'] >= 0.2 and len(v['features']) > max_lenght:
        # Raise the running maximum; without this update every qualifying set
        # would overwrite the previous one and the last, not the longest,
        # would be kept.
        max_lenght = len(v['features'])
        best_lenght_features = v['features']

In [161]:
# Refit both candidate models. The best-adjusted-R^2 fit gets its own name so
# it is not lost when `model` is assigned the fit on the longest qualifying
# feature set.
model_best_r_adj = _model(df, best_features, y_column)
model = _model(df, best_lenght_features, y_column)