# Multivariable Analysis

In [None]:
import matplotlib
%matplotlib inline
import shap, matplotlib.pyplot as plt

import matplotlib
import seaborn as sns
matplotlib.rcParams['figure.dpi'] = 300
sns.set(style='white')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')
import statsmodels.api as sm
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
np.random.seed(42)


def run_shap(X_train, X_test, model, model_type='linear', explainer_options=None, get_shap_values_options=None, overall=False, savefile=''):
    """Explain ``model`` on ``X_test`` with SHAP and draw a summary plot.

    Parameters
    ----------
    X_train
        Data passed as the second positional argument to the explainer
        constructor.
    X_test
        Data to explain. Must expose ``.columns`` (e.g. a pandas DataFrame);
        underscores in column names are replaced by spaces for the plot.
    model
        Model handed to the explainer constructor.
    model_type : {'linear', 'tree', 'kernel'}
        Selects ``shap.LinearExplainer``, ``shap.TreeExplainer`` or
        ``shap.KernelExplainer``.
    explainer_options : dict, optional
        Extra keyword arguments for the explainer constructor.
    get_shap_values_options : dict, optional
        Extra keyword arguments for ``explainer.shap_values``.
    overall : bool
        ``True`` draws a bar summary plot, ``False`` a dot summary plot.
    savefile : str
        When non-empty, the figure is written to this path at 300 dpi.

    Returns
    -------
    tuple
        ``(explainer, shap_values)``.

    Raises
    ------
    KeyError
        If ``model_type`` is not one of the three supported names.
    """
    # Use None sentinels instead of shared mutable ``{}`` defaults.
    explainer_options = {} if explainer_options is None else explainer_options
    get_shap_values_options = {} if get_shap_values_options is None else get_shap_values_options

    matplotlib.rcParams['figure.dpi'] = 300
    sns.set(style='white')
    np.random.seed(42)

    explainer_classes = {
        'tree': shap.TreeExplainer,
        'kernel': shap.KernelExplainer,
        'linear': shap.LinearExplainer,
    }
    explainer = explainer_classes[model_type](model, X_train, **explainer_options)
    shap_values = explainer.shap_values(X_test, **get_shap_values_options)

    # Only pick entry 1 when the explainer returned a list of arrays
    # (presumably one per class, entry 1 being the positive class -- TODO
    # confirm). A plain array is left alone: indexing its first axis would
    # select element 1 of that axis, not a per-output block.
    if (model_type == 'tree'
            and model.__class__.__name__ != 'XGBClassifier'
            and isinstance(shap_values, list)):
        shap_values = np.array(shap_values)[1, ...]

    plt.figure()
    # show=False keeps the figure alive so that it can be saved; summary_plot
    # would otherwise call plt.show() itself before savefig runs.
    shap.summary_plot(
        shap_values,
        X_test,
        feature_names=[col.replace('_', ' ') for col in X_test.columns],
        plot_type='bar' if overall else 'dot',
        max_display=30,
        show=False,
    )
    if savefile:
        plt.savefig(savefile, dpi=300)
    plt.show()
    return explainer, shap_values

def plot_shap(county):
    """Draw and save the SHAP force plot for one test-set county.

    Relies on module-level names set up elsewhere in this file:
    ``counties2``, ``counties``, ``Set``, ``explainer``, ``shap_values`` and
    ``X_test``.

    Parameters
    ----------
    county
        Entry of ``counties2``. ``list.index`` returns the first match, so a
        name that occurs more than once always maps to its first row.

    Raises
    ------
    ValueError
        If ``county`` is not in ``counties2``.
    """
    i = counties2.index(county)
    print(counties.values[Set == 'test'][i])
    shap.force_plot(
        explainer.expected_value,
        shap_values[i, :],
        np.round(X_test.iloc[i, :], 3),
        matplotlib=True,
        show=False,  # keep the figure open so it can be saved below
        figsize=(30, 4),
    )
    # The figure size is set by force_plot above; savefig has no ``figsize``
    # option, so it is not passed here.
    plt.savefig('../results/shap.lm.{}.png'.format(county), bbox_inches="tight", dpi=300)

# pandas is used throughout this section but is not imported above it.
import pandas as pd

# Load the covariate dataset and split it by the precomputed 'Set' column.
df = pd.read_csv('../data/final_dataset_covariate_model.csv')
feature_columns = [
    'Adjusted_Race_1', 'Adjusted_Race_2', 'Adjusted_Race_4', 'Hispanic',
    'Sex', 'Age', 'Any_College_2015', 'income_2015',
] + ['Region_{}'.format(i) for i in range(1, 9)]
X, y = df[feature_columns], df['Mortality'] * 1000
counties, Set = df['County'], df['Set']
X_train, y_train = X[Set == 'train'], y[Set == 'train']
X_val, y_val = X[Set == 'val'], y[Set == 'val']
X_test, y_test = X[Set == 'test'], y[Set == 'test']

# Training and validation rows are pooled for fitting. The population weights
# are stacked in the same train-then-val order so every weight lines up with
# its row; selecting them with Set != 'test' would keep dataframe order.
train_weights = np.hstack([
    df.loc[Set == 'train', 'Population'],
    df.loc[Set == 'val', 'Population'],
])
X_train = np.vstack([X_train, X_val])
y_train = np.hstack([y_train, y_val])

# Unweighted OLS with an intercept; coefficients (intercept dropped) are
# ordered by decreasing absolute value.
X_train2 = sm.add_constant(X_train)
y_train2 = y_train
result = sm.OLS(y_train2, X_train2).fit()
fit_params = pd.DataFrame(result.params[1:], index=list(X))
fit_params = fit_params.iloc[np.argsort(fit_params.abs().values.flatten())[::-1]]

# Population-weighted linear regression. ``normalize=False`` was the default
# and the argument no longer exists in recent scikit-learn, so it is omitted.
pr = LinearRegression()
pr.fit(X_train, y_train, sample_weight=train_weights)
pr.score(X_test, y_test)  # R2

# Human-readable labels for the region indicator columns.
# NOTE(review): these look like the BEA regions; the last label previously
# read 'Far East' and was changed to 'Far West' -- confirm.
region_replace = {
    'Region_1': 'New England',
    'Region_2': 'Mideast',
    'Region_3': 'Great Lakes',
    'Region_4': 'Plains',
    'Region_5': 'Southeast',
    'Region_6': 'Southwest',
    'Region_7': 'Rocky Mountains',
    'Region_8': 'Far West',
}

# Bar summary first, then the dot summary; the second call's explainer and
# SHAP values are the ones used below.
explainer, shap_values = run_shap(X_train, X_test.rename(columns=region_replace), pr, overall=True)

explainer, shap_values = run_shap(X_train, X_test.rename(columns=region_replace), pr)

# Mean absolute SHAP value per feature, rounded to one decimal.
importances = pd.DataFrame(np.round(np.abs(shap_values).mean(0), 1)).T
importances.columns = list(X_test)
importances.T.sort_values(by=0, ascending=False)

# One force plot per test-set county.
counties2 = counties[Set == 'test'].tolist()
test_county_names = set(counties2)  # constant-time membership inside the loop
noncounties = []
for county in df.loc[df['Set'] == 'test', 'County'].values:
    if county in test_county_names:
        plot_shap(county)
    else:
        print('Not a County', county)
        noncounties.append(county)

# Univariable Analysis

In [None]:
import pandas as pd

# Rebind ``df`` to the merged table read from disk.
df = pd.read_csv(filepath_or_buffer='../data/covar_model_merged.csv')

In [None]:
import statsmodels.api as sm
import numpy as np
# For every remaining column, fit an OLS of that column on 'y_pred' (no
# intercept term is added) and record the adjusted R-squared and the F-test
# p-value. The statistics are read straight from the fitted result rather than
# rendering the summary to HTML and parsing it back, so they are no longer
# rounded to the summary's display precision and no HTML parser is needed.
excluded_columns = {
    'Unnamed: 0', 'Mortality_x', 'Mortality_binned_x', 'Set_x', 'Population_x',
    'FIPS_x', 'y_true', 'y_pred', 'counties', 'County', 'state', 'Region',
}
results = []
for covar in list(df):
    if covar in excluded_columns:
        continue
    fit = sm.OLS(df[covar].values, df['y_pred'].values).fit()
    # Display name: drop the '_x' merge suffix and turn underscores into spaces.
    label = covar.replace('_x', '').replace('_', ' ')
    results.append([label, fit.rsquared_adj, fit.f_pvalue])

In [None]:
# Tabulate the per-covariate statistics, smallest p-value first.
df_res = pd.DataFrame(results, columns=['Covariate', 'Adjusted R-Squared', 'P-Value'])
df_res = df_res.sort_values(by='P-Value')
df_res.set_index('Covariate')

In [None]:
# Same statistics in their original (unsorted) order, as a raw array.
pd.DataFrame(
    data=results,
    columns=['Covariate', 'Adjusted R-Squared', 'P-Value'],
).values