# Reproduce FEAT results with a logistic regression model

- The goal here is to reproduce the FEAT model for resistant hypertension using a logistic regression 
model from sklearn. That model can then be used to calculate and visualize SHAP values. 


In [None]:
import numpy as np
import pandas as pd

# Turn off pandas' chained-assignment check (no warning, no error) for this notebook.
pd.set_option('mode.chained_assignment', None)

In [None]:
# Run configuration, set up the same way as the FEAT runs.
# These values are handed to read_data / evaluate_model further down.
target = 'res_htn_dx_ia'   # outcome column being modelled
fold = 'A'
repeat = 101
random_state = 1318
rdir = 'reproduction'

# Outcome column name -> short label for that outcome.
targets = {
    'htn_dx_ia': 'Htndx',
    'res_htn_dx_ia': 'ResHtndx',
    'htn_hypok_dx_ia': 'HtnHypoKdx',
    'HTN_heuristic': 'HtnHeuri',
    'res_HTN_heuristic': 'ResHtnHeuri',
    'hypoK_heuristic_v4': 'HtnHypoKHeuri',
}

# 'UNI_ID' followed by every outcome column, in the order listed above.
# NOTE(review): presumably the non-predictor columns to drop — confirm against read_data.
drop_cols = ['UNI_ID', *targets]

In [None]:
import sys

# The imports below are resolved through the parent directory, so it has to
# be on sys.path before they run.
sys.path.append('../')

from evaluate_model import evaluate_model, read_data
from models.Feat_boolean import clf as feat_clf, name as feat_name

# Fetch the four data pieces for the configured target / fold / repeat.
# NOTE(review): presumably train and test features and labels — confirm in read_data.
X_train, y_train, X_test, y_test = read_data(target, fold, repeat, '../')



In [None]:
# Run the project's evaluation routine on the FEAT classifier. The call
# rebinds feat_clf to the estimator it returns and also yields `results`.
# NOTE(review): presumably this is where feat_clf gets fitted — confirm in evaluate_model.
feat_clf, results = evaluate_model(
    feat_clf,
    feat_name,
    target,
    fold,
    random_state,
    rdir,
    repeat,
    data_dir='../',
)

In [None]:
# Containers keyed by model name. The training-set entries for FEAT are
# filled in right away; the test-set containers start out empty.
y_pred_train = {'feat': feat_clf.predict(X_train)}
y_pred_test = {}
y_predproba_train = {'feat': feat_clf.predict_proba(X_train)}
y_predproba_test = {}

In [None]:
# Ask the FEAT model for its representation. This is a bare expression at the
# end of the cell, so the notebook shows the returned value as the cell output.
feat_clf.get_representation()

In [None]:
# Print what FEAT reports as its model.
# NOTE(review): presumably the learned features with their weights — confirm.
print(feat_clf.get_model())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Fit a plain sklearn logistic regression on the features produced by
# feat_clf.transform, so its predictions can be compared with FEAT's own.

# Alternative that is NOT used here (cross-validated regularization strength):
# LR = LogisticRegressionCV(Cs = np.logspace(-6,3,10),
#                            penalty='l2',
#                            solver = 'liblinear')
LR = LogisticRegression(C=1.0, penalty='l2', intercept_scaling=1.0, solver='liblinear')

# Map the raw inputs through the FEAT model's transform.
XT_train = feat_clf.transform(X_train)

# Keep the fitted scaler instead of discarding it: LR is trained on
# standardized features, so any other data (e.g. X_test) has to go through
# this same scaler before LR can score it. fit(...).transform(...) yields the
# same values as fit_transform(...).
scaler = StandardScaler().fit(XT_train)
XT_train_norm = scaler.transform(XT_train)

LR.fit(XT_train_norm, y_train)
y_pred_train['LR'] = LR.predict(XT_train_norm)
y_predproba_train['LR'] = LR.predict_proba(XT_train_norm)

# Report the fitted logistic regression parameters.
print('LR trained on normalized data')
print('beta coeffs')
print(LR.coef_)
print('offset')
print(LR.intercept_)



In [None]:
# Summed absolute difference between FEAT and the sklearn LR on the training
# set: first the predictions, then the predicted probabilities.
print(np.abs(y_pred_train['feat'] - y_pred_train['LR']).sum())
print(np.abs(y_predproba_train['feat'] - y_predproba_train['LR']).sum())

**Conclusion**: The FEAT model and the sklearn LR model essentially match. Only a very small discrepancy remains in the predicted probabilities. 

# Reproduce FEAT's features

That way we can store an LR model and run it through SHAP.
The feature output of that model should be checked against `feat_clf.transform()`. 

In [None]:
# For each listed column name, print the positional index (or indices) at
# which it occurs in X_train.columns; an empty list means it is not there.
for col in [
    'sum_enc_during_htn_meds_3',
    'median_enc_during_htn_meds_4_plus',
    'sd_enc_during_htn_meds_2',
    'mean_systolic',
    'max.CALCIUM',
    're_htn_spec_sum',
]:
    print(
        'location of', col, ':',
        [pos for pos, name in enumerate(X_train.columns) if name == col],
    )

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from feat_transformer import FeatTransformer

# Two-step estimator that works directly on the raw inputs:
#   'prep' - FeatTransformer (NOTE(review): presumably rebuilds FEAT's features — confirm)
#   'est'  - L2-penalized logistic regression (liblinear solver)
ft_lr_estimator = Pipeline(steps=[
    ('prep', FeatTransformer()),
    ('est', LogisticRegression(
        C=1.0,
        penalty='l2',
        intercept_scaling=1.0,
        solver='liblinear',
    )),
])

# Fit both steps on the raw training data. As the last expression of the cell,
# the value returned by fit() is what the notebook displays.
ft_lr_estimator.fit(X_train, y_train)

In [None]:
# Record the pipeline's training-set predictions and predicted probabilities
# under the 'FT_LR' key, next to the entries of the other models.
y_pred_train['FT_LR'], y_predproba_train['FT_LR'] = (
    ft_lr_estimator.predict(X_train),
    ft_lr_estimator.predict_proba(X_train),
)

In [None]:
# Summed absolute difference between FEAT and the FeatTransformer + LR
# pipeline on the training set: predictions first, then probabilities.
print(np.abs(y_pred_train['feat'] - y_pred_train['FT_LR']).sum())
print(np.abs(y_predproba_train['feat'] - y_predproba_train['FT_LR']).sum())

In [None]:
import pickle

# Persist the fitted pipeline to the current working directory. The file name
# encodes target, fold, repeat and random_state so runs do not overwrite
# each other.
pkl_name = 'Feat_reconstruct_{}_{}_{}_{}.pkl'.format(target, fold, repeat, random_state)
with open(pkl_name, mode='wb') as of:
    pickle.dump(ft_lr_estimator, of)