# Logistic regression

In this notebook, we take the predictions from the previous models and train a logistic regression model to give a final probability of the segment containing a true drug-ae pair

Outputs used: 
- tf-idf:naive bayes, stochastic gradient descent, random forest, xgboost
- word2vec: stochastic gradient descent, xgboost
- lstm

In [1]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# Name of train and test dataset
# Change according to test/train dataset chosen

train_worksheet = 'df_2009_us'# 2009 training set
test_worksheet = 'df_testing_all' # test set - 2010, 2011, MIMIC, MIMIC_unrestricted


In [3]:
# ***************** # IMPORT TRAINING DATASET
# Change according to test/train dataset chosen

train_tfidf= pd.read_excel(r'.\results_external_tfidf.xlsx', sheet_name = train_worksheet)
train_w2v= pd.read_excel(r'.\results_external_w2v.xlsx', sheet_name = train_worksheet)
train_lstm= pd.read_excel(r'.\results_external_lstm.xlsx', sheet_name = train_worksheet)

test_tfidf= pd.read_excel(r'.\results_external_tfidf.xlsx', sheet_name = test_worksheet)
test_w2v= pd.read_excel(r'.\results_external_w2v.xlsx', sheet_name = test_worksheet)
test_lstm= pd.read_excel(r'.\results_external_lstm.xlsx', sheet_name = test_worksheet)

In [4]:
# *************** # Merge columns into single dataset
train = pd.DataFrame({'context':train_tfidf['context'].tolist(),
                        'segment_2':train_tfidf['segment_2'].tolist(),
                        'drug':train_tfidf['drug'].tolist(),
                        'ae': train_tfidf['ae'].tolist(),
                        'label':train_tfidf['label'].tolist(),
                        'prob_tfidf_nb':train_tfidf['prob_tfidf_nb'].tolist(),
                        'prob_tfidf_sgd':train_tfidf['prob_tfidf_sgd'].tolist(),
                        'prob_tfidf_rf':train_tfidf['prob_tfidf_rf'].tolist(),
                        'prob_tfidf_xgb':train_tfidf['prob_tfidf_xgb'].tolist(),
                        'prob_word2vec_sgd':train_w2v['prob_word2vec_sgd'].tolist(),
                        'prob_word2vec_xgb':train_w2v['prob_word2vec_xgb'].tolist(),
                        'pred_lstm':train_lstm['pred_lstm'].tolist(),
                        })

test = pd.DataFrame({'source':test_tfidf['source'].tolist(),
                        'context':test_tfidf['context'].tolist(),
                        'segment_2':test_tfidf['segment_2'].tolist(),
                        'drug':test_tfidf['drug'].tolist(),
                        'ae': test_tfidf['ae'].tolist(),
                        'label':test_tfidf['label'].tolist(),
                        'prob_tfidf_nb':test_tfidf['prob_tfidf_nb'].tolist(),
                        'prob_tfidf_sgd':test_tfidf['prob_tfidf_sgd'].tolist(),
                        'prob_tfidf_rf':test_tfidf['prob_tfidf_rf'].tolist(),
                        'prob_tfidf_xgb':test_tfidf['prob_tfidf_xgb'].tolist(),
                        'prob_word2vec_sgd':test_w2v['prob_word2vec_sgd'].tolist(),
                        'prob_word2vec_xgb':test_w2v['prob_word2vec_xgb'].tolist(),
                        'pred_lstm':test_lstm['pred_lstm'].tolist(),
                        })

In [5]:
train.head()

Unnamed: 0,context,segment_2,drug,ae,label,prob_tfidf_nb,prob_tfidf_sgd,prob_tfidf_rf,prob_tfidf_xgb,prob_word2vec_sgd,prob_word2vec_xgb,pred_lstm
0,"onic polyps, ogd non-erosive gastritis (5/2008...",complet heart block secondari degen diseas pro...,propanolol,complete heart block,1,0.795695,0.583734,0.8,0.663017,0.230238,0.688628,0.998728
1,bl e coli sensitive to meropenem) - developed...,develop erythema vancomycin suspect red man sy...,vancomycin,erythema,1,0.957168,0.652359,0.9,0.750375,0.555737,0.751824,0.999854
2,"referred endocrine, glipizide increased to 10 ...",glipizid increas bd metformin continu bd home ...,glipizide,hypoglycemic,1,0.940857,0.670613,1.0,0.560956,0.371404,0.722117,0.999959
3,hypotension likely secondary to hypotensive me...,hypotens like secondari hypotens med sepsi hel...,hypotensive meds,hypotension,1,0.836085,0.652899,0.9,0.629233,0.643945,0.815202,0.978179
4,"ncreased to 200mg om, 200mg afternoon, 400mg o...",lithium decreas mg sinc patient tremor,lithium,tremor,1,0.9582,0.444959,0.6,0.317999,0.301216,0.554161,0.999239


In [6]:
test.head()

Unnamed: 0,source,context,segment_2,drug,ae,label,prob_tfidf_nb,prob_tfidf_sgd,prob_tfidf_rf,prob_tfidf_xgb,prob_word2vec_sgd,prob_word2vec_xgb,pred_lstm
0,2010,-increased frequency nil symptoms. -no dysuria...,diagnos gp clinic given ciprofloxacin worsen a...,ciprofloxacin,abdominal pain,1,0.093701,0.215557,0.0,0.256562,0.090539,0.0858,0.003876
1,2010,d - to withhold anti-coagulation therapy till ...,view anaemia ogd find withhold warfarin indefi...,warfarin,anaemia,0,0.292828,0.392238,0.2,0.360358,0.311312,0.510333,0.957912
2,2010,tal swelling facial flushing ed ard 5.25am - i...,imp anaphylactoid reaction parvolex given hydr...,promethazine,anaphylactoid reaction,0,0.918596,0.534267,0.3,0.360358,0.732299,0.400538,0.992576
3,2010,"ndependent, community ambulant nkda pmhx 1 dm ...",glipizid metformin htn amlodipin lisinopril hy...,amlodipine,ami,0,0.462487,0.48114,0.9,0.50905,0.686867,0.769346,0.06808
4,2010,rs ago: facial redness rash anaphylaxis angioe...,rs ago facial red rash anaphylaxi angioedema g...,piptazo,anaphylaxis,1,0.880463,0.490452,0.5,0.438987,0.803897,0.756382,0.99769


## Logistic regression

In [7]:
# CREATE ARRAYS FOR TRAINING AND TESTING

x_train = train[['prob_tfidf_nb','prob_tfidf_sgd','prob_tfidf_rf','prob_tfidf_xgb','prob_word2vec_sgd','prob_word2vec_xgb','pred_lstm']]
y_train = train['label']

x_test = test[['prob_tfidf_nb','prob_tfidf_sgd','prob_tfidf_rf','prob_tfidf_xgb','prob_word2vec_sgd','prob_word2vec_xgb','pred_lstm']]
y_test = test['label']

In [8]:
model = LogisticRegression()

In [9]:
# fit log regression model
model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
y_pred = model.predict(x_test)

In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82      4034
           1       0.63      0.68      0.66      2031

   micro avg       0.76      0.76      0.76      6065
   macro avg       0.73      0.74      0.74      6065
weighted avg       0.77      0.76      0.76      6065



## Get probability scoring and save to excel sheet

In [12]:
y_prob = model.predict_proba(x_test) # probability output by the regression model for the testing set
y_prob_train = model.predict_proba(x_train) # probability output by the regression model for the training set

In [13]:
y_prob

array([[0.99272911, 0.00727089],
       [0.85235641, 0.14764359],
       [0.55369036, 0.44630964],
       ...,
       [0.99425222, 0.00574778],
       [0.69822739, 0.30177261],
       [0.8694194 , 0.1305806 ]])

In [14]:
# Combine results to dataframe
test['logreg_prob']=y_prob[:,1].tolist()
train['logreg_prob']=y_prob_train[:,1].tolist()

In [15]:
book=load_workbook(r'./logistic_external.xlsx')
writer = pd.ExcelWriter(r'./logistic_external.xlsx', engine = 'openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)

train.to_excel(writer, sheet_name = train_worksheet, index=False)
test.to_excel(writer, sheet_name = test_worksheet, index=False)

writer.save()