In [46]:
import pandas as pd
import statsmodels.api as sm
import numpy as np

In [53]:
# Load the raw extract; the outcome column `gagne_sum_t` is exposed as
# `Chronic_illnesses` before the headers are normalised.
original_data = (
    pd.read_csv('dissecting-bias-master-data/data/data_new.csv')
    .rename(columns={'gagne_sum_t': 'Chronic_illnesses'})
)

# Header convention: first character upper-case, the rest lower-case,
# dashes replaced with underscores.
original_data.columns = [
    header.capitalize().replace('-', '_') for header in original_data.columns
]


def _sex_label(female_flag):
    """Return 'Female' when the indicator equals 1, otherwise 'Male'."""
    return 'Female' if female_flag == 1 else 'Male'


original_data['Sex'] = original_data['Dem_female'].map(_sex_label)

# Collapse the one-hot age-band indicators into three coarse groups.
aged_45_to_64 = (
    (original_data['Dem_age_band_45_54_tm1'] == 1)
    | (original_data['Dem_age_band_55_64_tm1'] == 1)
)
aged_18_to_44 = (
    (original_data['Dem_age_band_18_24_tm1'] == 1)
    | (original_data['Dem_age_band_25_34_tm1'] == 1)
    | (original_data['Dem_age_band_35_44_tm1'] == 1)
)

# Rows with none of the listed band flags set keep the 'Over 65' default.
# The 18-44 label is written last, so it wins if a row has flags in both groups.
original_data['Age'] = 'Over 65'
original_data.loc[aged_45_to_64, 'Age'] = '45-64'
original_data.loc[aged_18_to_44, 'Age'] = '18-44'

# Univariate screening of the `*_tm1` feature columns.
#
# For each candidate column, the z-scored feature is used in two OLS fits
# against `Chronic_illnesses`:
#   * base model:        Chronic_illnesses ~ z_score_x
#   * interaction model: Chronic_illnesses ~ z_score_x * Sex
# One summary row per feature is collected and `df` is built once at the end.
#
# NOTE(review): the interaction term key below assumes 'Female' is the
# reference level of `Sex`, so the reported term is the Male contrast — confirm
# if the Sex labels ever change.
records = []
for column in original_data.columns:
    # Outcome and demographic labels are not candidate features.
    if column in ['Race', 'Sex', 'Age', 'Chronic_illnesses']:
        continue
    # Age-band indicators are excluded; only `_tm1` columns are screened.
    if 'Dem_age_band' in column or not column.endswith('_tm1'):
        continue

    # Compute the spread once and reuse it for both the guard and the z-score.
    column_std = original_data[column].std()
    # Skip (near-)constant columns. Written as `not (>=)` so that a NaN
    # standard deviation is skipped as well instead of reaching the model
    # fits with an all-NaN predictor.
    if not (column_std >= 1e-6):
        continue

    # Fit on a small scratch frame so the shared `original_data` frame is not
    # mutated with a temporary `z_score_x` column on every iteration.
    model_frame = original_data[['Chronic_illnesses', 'Sex']].assign(
        z_score_x=(original_data[column] - original_data[column].mean()) / column_std
    )

    base_model = sm.OLS.from_formula('Chronic_illnesses ~ z_score_x', data=model_frame).fit()
    interaction_model = sm.OLS.from_formula('Chronic_illnesses ~ z_score_x*Sex', data=model_frame).fit()

    records.append({"c":column,
                    "absolute_value_base_coef":abs(base_model.params['z_score_x']),
                    "base_p":base_model.pvalues['z_score_x'],
                    "base_R^2":base_model.rsquared,
                    "absolute_value_interaction_coef":abs(interaction_model.params['z_score_x:Sex[T.Male]']),
                    "interaction_p":interaction_model.pvalues['z_score_x:Sex[T.Male]']})
df = pd.DataFrame(records)

# Feature selection: keep the 5 features with the largest absolute base
# coefficient plus the 10 features with the largest absolute interaction
# coefficient with Sex.
N_TOP_BASE = 5
N_TOP_INTERACTION = 10
# Number of shuffled rows written to the training file; the rest go to test.
N_TRAIN_ROWS = 40000

df = df.sort_values(by='absolute_value_base_coef')[::-1]
cols_to_keep = list(df['c'].iloc[:N_TOP_BASE])
df = df.sort_values(by='absolute_value_interaction_coef')[::-1]
cols_to_keep = cols_to_keep + list(df['c'].iloc[:N_TOP_INTERACTION])

# A feature can rank highly on both criteria. De-duplicate while preserving
# order; otherwise the column selection below would produce repeated columns
# in the frame and in the exported CSV headers.
cols_to_keep = list(dict.fromkeys(['Chronic_illnesses','Race','Sex','Age'] + cols_to_keep))

# Restrict to the selected columns, shuffle with a fixed seed so the split is
# reproducible, then cut positionally into train and test.
original_data = original_data[cols_to_keep]
original_data = original_data.sample(frac=1, random_state=42)
train_data = original_data.iloc[:N_TRAIN_ROWS]
test_data = original_data.iloc[N_TRAIN_ROWS:]
train_data.to_csv('train_data.csv', index=False)
test_data.to_csv('test_data.csv', index=False)


In [52]:
# Compare two OLS specifications: one with an Age x Gagne_sum_tm1 interaction
# and one with Gagne_sum_tm1 alone. Each is fitted on the first 11100 training
# rows and scored by RMSE on the full test split.
candidate_formulas = (
    'Chronic_illnesses ~ Age * Gagne_sum_tm1',
    'Chronic_illnesses ~ Gagne_sum_tm1',
)
fit_rows = train_data.iloc[:11100]
observed = test_data['Chronic_illnesses']

for formula in candidate_formulas:
    print(formula)
    model = sm.OLS.from_formula(formula, data=fit_rows).fit()
    test_preds = model.predict(test_data)
    squared_error = (test_preds - observed) ** 2
    rmse = np.sqrt(np.mean(squared_error))
    print(model.summary())
    print(rmse)

Chronic_illnesses ~ Age * Gagne_sum_tm1
                            OLS Regression Results                            
Dep. Variable:      Chronic_illnesses   R-squared:                       0.722
Model:                            OLS   Adj. R-squared:                  0.722
Method:                 Least Squares   F-statistic:                     5766.
Date:                Fri, 20 May 2022   Prob (F-statistic):               0.00
Time:                        13:38:03   Log-Likelihood:                -16021.
No. Observations:               11100   AIC:                         3.205e+04
Df Residuals:                   11094   BIC:                         3.210e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------

In [55]:
# Screening table ordered by base-model R^2, largest first.
df.sort_values(by='base_R^2').iloc[::-1]

Unnamed: 0,c,absolute_value_base_coef,base_p,base_R^2,absolute_value_interaction_coef,interaction_p
137,Gagne_sum_tm1,1.642144,0.000000,0.714671,0.019429,4.191257e-02
10,Hypertension_elixhauser_tm1,1.008239,0.000000,0.269408,0.017423,2.593248e-01
20,Uncompdiabetes_elixhauser_tm1,0.878116,0.000000,0.204356,0.005841,7.133248e-01
50,Cre_tests_tm1,0.874872,0.000000,0.202849,0.119853,2.533673e-14
57,Sodium_tests_tm1,0.860410,0.000000,0.196198,0.030010,5.889093e-02
...,...,...,...,...,...,...
131,Trig_mean_low_tm1,0.016222,0.065106,0.000070,0.089629,5.330368e-06
134,Trig_max_low_tm1,0.007941,0.366566,0.000017,0.081461,4.199601e-05
89,Ghba1c_max_low_tm1,0.007335,0.404285,0.000014,0.007356,6.756111e-01
86,Ghba1c_mean_low_tm1,0.007335,0.404285,0.000014,0.007356,6.756111e-01


In [50]:
# Inspect the first row of the shuffled training split (all columns).
train_data.iloc[0, :]

Chronic_illnesses                     2
Race                              white
Sex                              Female
Age                               45-64
Gagne_sum_tm1                         2
Hypertension_elixhauser_tm1           1
Uncompdiabetes_elixhauser_tm1         0
Cre_tests_tm1                         0
Sodium_tests_tm1                      0
Cre_max_normal_tm1                    0
Cre_min_low_tm1                       0
Cost_other_tm1                    400.0
Cre_max_high_tm1                      0
Cre_min_high_tm1                      0
Cre_mean_high_tm1                     0
Hct_min_low_tm1                       0
Cre_mean_normal_tm1                   0
Hct_mean_low_tm1                      0
Cre_mean_low_tm1                      0
Name: 9224, dtype: object