In [None]:
import warnings

import numpy as np
import pandas as pd
import sklearn.metrics
import statsmodels.api as sm

from Py_Files import analytics
from Py_Files import metric_inventory

In [None]:
# Open modeling df
df = pd.read_csv('/Users/annelilefranc/Documents/QML Files/modeling_dataset_with_bankruptcy_labels.csv')
# Parse the fiscal period end in one vectorized call rather than row by row
# through .apply; the column is compared against split timestamps further down,
# so it has to hold real datetimes and not the raw CSV strings.
df['fiscal_end_date'] = pd.to_datetime(df['fiscal_end_date'])

In [None]:
def model_df_prep(df, test_split_date):
    """Add a percentile-bin column (``<metric>_pct``) for every inventory metric.

    Bin boundaries are estimated on the training sample only -- rows dated on
    or before ``test_split_date`` that have no missing value in any inventory
    metric -- and are then applied to every row of the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Modeling dataset. Must contain a ``fiscal_end_date`` column comparable
        to a timestamp and one column per key of
        ``metric_inventory.display_name_dict``.
    test_split_date : str, datetime-like or None
        Last date of the training sample. ``None`` estimates the boundaries on
        the whole dataset.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        A copy of ``df`` with the ``_pct`` columns added (the input frame is
        left untouched), and a dict mapping each created column name to
        ``{'category': ...}`` as listed in the metric inventory.
    """
    metrics = list(metric_inventory.display_name_dict)

    # Work on a copy: writing the _pct columns into the caller's frame made
    # successive calls with different split dates overwrite each other's
    # columns and hand back the very same object.
    out_df = df.copy()

    # in/out of sample split ('<=' matches the training window that
    # univariate_reg uses, so bins are fitted on exactly the training rows)
    if test_split_date is not None:
        fit_df = out_df[out_df['fiscal_end_date'] <= pd.to_datetime(test_split_date)]
    else:
        fit_df = out_df

    # Boundaries are estimated on complete cases only
    fit_df = fit_df[~fit_df[metrics].isna().any(axis=1)]

    # Apply percentile bins after splitting test/train dfs
    pct_vars = []
    for metric in metrics:
        boundaries = analytics.calculate_percentile_bins(fit_df, column=metric, num_bins=100)
        try:
            out_df[f'{metric}_pct'] = analytics.assign_to_bins(out_df, column=metric, boundaries=boundaries)
        except Exception as exc:
            # Best effort: a metric that cannot be binned is skipped, but the
            # reason is reported instead of being silently discarded.
            warnings.warn(f'Skipping percentile bins for {metric!r}: {exc!r}')
        else:
            pct_vars.append(metric)

    pct_vars_dict = {
        f'{metric}_pct': {'category': metric_inventory.display_name_dict[metric]['category']}
        for metric in pct_vars
    }

    return out_df, pct_vars_dict

In [None]:
def _auroc(y_true, scores):
    """Return the area under the ROC curve of ``scores`` against ``y_true``."""
    fpr, tpr, _thresholds = sklearn.metrics.roc_curve(y_true, scores)
    return sklearn.metrics.auc(fpr, tpr)


def univariate_reg(df, vars, horizon, pct, test_split_date):
    """Fit one single-variable logit per metric and report in/out-of-sample AUROC.

    For each key of ``vars`` a ``statsmodels`` Logit of ``default_{horizon}``
    on that variable plus a constant is fitted on the training rows and scored
    on both the training and the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fiscal_end_date``, ``default_{horizon}`` and a column
        for every key of ``vars``.
    vars : dict
        Maps a column name to a dict with at least a ``'category'`` entry.
        (The name shadows the ``vars`` builtin; it is kept so existing
        keyword callers keep working.)
    horizon : int or str
        Suffix selecting the label column ``default_{horizon}``.
    pct : str
        ``'No'`` winsorizes the variable at its 1st/99th percentiles; any
        other value leaves the variable as is.
    test_split_date : str, datetime-like or None
        Rows dated on or before this date form the training set, later rows
        the test set. ``None`` uses every row for both.

    Returns
    -------
    pandas.DataFrame
        One row per variable with columns ``Variable``, ``Category``,
        ``Coefficient``, ``P-value``, ``AUROC - Train`` and ``AUROC - Test``.
    """
    label_col = f'default_{horizon}'
    split_ts = None if test_split_date is None else pd.to_datetime(test_split_date)

    results_list = []
    for var in vars:
        category = vars[var]['category']

        # Keep finite, non-missing observations; one mask and one copy so the
        # column assignments below never write into a slice of ``df``.
        usable = df[var].notnull() & ~df[var].isin([np.inf, -np.inf])
        temp_df = df[usable].copy()
        temp_df['constant'] = 1

        # Create train/test dfs
        if split_ts is not None:
            df_test = temp_df[temp_df['fiscal_end_date'] > split_ts].copy()
            df_train = temp_df[temp_df['fiscal_end_date'] <= split_ts].copy()
        else:
            df_test = temp_df.copy()
            df_train = temp_df.copy()

        # if no percentile adjustment, winsorize. The bounds come from the
        # training period only, so the test rows cannot leak into the
        # preprocessing -- the same rule model_df_prep follows when it fits
        # percentile bins on the pre-split sample.
        if pct == 'No':
            lower, upper = df_train[var].quantile([0.01, 0.99])
            df_train[var] = df_train[var].clip(lower=lower, upper=upper)
            df_test[var] = df_test[var].clip(lower=lower, upper=upper)

        # Drop observations too close to default event (label -1). They are
        # removed from the test set as well: the label is not a 0/1 outcome,
        # so leaving it in would corrupt the out-of-sample ROC.
        df_train = df_train[df_train[label_col] != -1]
        df_test = df_test[df_test[label_col] != -1]

        # logit regression
        y = df_train[label_col]
        X = df_train[[var, 'constant']]
        # disp=False silences the per-fit optimizer printout
        result = sm.Logit(y, X).fit(disp=False)

        # In-sample and out-of-sample AUROC
        roc_auc = _auroc(y, result.predict(X))
        roc_auc_o = _auroc(df_test[label_col], result.predict(df_test[[var, 'constant']]))

        results_list.append({
            'Variable': var,
            'Category': category,
            'Coefficient': result.params[var],
            'P-value': result.pvalues[var],
            'AUROC - Train': roc_auc,
            'AUROC - Test': roc_auc_o
        })

    return pd.DataFrame(results_list)

In [None]:
# Run configuration: literals that used to be repeated below are named once.
OUTPUT_DIR = '/Users/annelilefranc/Documents/QML Files'
TEST_SPLIT_DATE = '2018-01-01'
HORIZONS = range(1, 6)  # selects label columns default_1 ... default_5

df_model, pct_vars_dict = model_df_prep(df, None)
df_model_split, pct_vars_dict_split = model_df_prep(df, TEST_SPLIT_DATE)

for n in HORIZONS:
    horizon = n
    print(f'Horizon: {n}')

    # with pct transformation = No
    # (full-sample run currently disabled; uses df_model / no split date)
    #results = univariate_reg(df_model, metric_inventory.display_name_dict, horizon, 'No', None)
    #results.to_csv(f'{OUTPUT_DIR}/univariate_reg_{n}y.csv', index=False)
    # with train/test split
    results = univariate_reg(df_model_split, metric_inventory.display_name_dict, horizon, 'No', TEST_SPLIT_DATE)
    results.to_csv(f'{OUTPUT_DIR}/univariate_reg_{n}y_split.csv', index=False)

    # with pct transformation = Yes
    # (full-sample run currently disabled; uses df_model / pct_vars_dict)
    #results = univariate_reg(df_model, pct_vars_dict, horizon, 'Yes', None)
    #results.to_csv(f'{OUTPUT_DIR}/univariate_reg_{n}y-pct.csv', index=False)
    # with train/test split
    results = univariate_reg(df_model_split, pct_vars_dict_split, horizon, 'Yes', TEST_SPLIT_DATE)
    results.to_csv(f'{OUTPUT_DIR}/univariate_reg_{n}y-pct_split.csv', index=False)