# Logistic regression

In this notebook, we take the predictions from the previous models and train a logistic regression model on them to give the final probability that a segment contains a true drug-AE pair.

Outputs used: 
- tf-idf: naive Bayes, stochastic gradient descent, random forest, XGBoost
- word2vec: stochastic gradient descent, xgboost
- lstm

In [47]:
from pathlib import Path

import numpy as np
import pandas as pd
from openpyxl import load_workbook
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [48]:
# Cross-validation fold to process (1 to 5).
# Both worksheet names are derived from this single value so the train and
# test sheets can never point at different folds by mistake.
FOLD = 5

train_worksheet = 'train_{}'.format(FOLD)
test_worksheet = 'test_{}'.format(FOLD)


In [49]:
# ***************** # IMPORT TRAINING AND TESTING DATASETS
# The worksheet read from each workbook is selected by `train_worksheet` /
# `test_worksheet`.

# Relative paths use forward slashes so they resolve on Windows, Linux and
# macOS alike (a backslash is only a path separator on Windows).
tfidf_results_path = './results_k_fold_tfidf.xlsx'
w2v_results_path = './results_k_fold_w2v.xlsx'
lstm_results_path = './results_k_fold_lstm.xlsx'

train_tfidf = pd.read_excel(tfidf_results_path, sheet_name=train_worksheet)
train_w2v = pd.read_excel(w2v_results_path, sheet_name=train_worksheet)
train_lstm = pd.read_excel(lstm_results_path, sheet_name=train_worksheet)

test_tfidf = pd.read_excel(tfidf_results_path, sheet_name=test_worksheet)
test_w2v = pd.read_excel(w2v_results_path, sheet_name=test_worksheet)
test_lstm = pd.read_excel(lstm_results_path, sheet_name=test_worksheet)

In [50]:
# *************** # Merge columns into single dataset

# Columns taken from each per-model results sheet, listed in output order.
tfidf_columns = ['context', 'segment_2', 'drug', 'ae', 'label',
                 'prob_tfidf_nb', 'prob_tfidf_sgd', 'prob_tfidf_rf', 'prob_tfidf_xgb']
w2v_columns = ['prob_word2vec_sgd', 'prob_word2vec_xgb']
lstm_columns = ['pred_lstm']


def combine_model_outputs(tfidf_df, w2v_df, lstm_df):
    """Build one frame from the tf-idf, word2vec and LSTM result sheets.

    Each selected column is copied as a plain list, so rows are combined
    purely by position; nothing here checks that the three sheets list their
    rows in the same order.
    """
    sources = ((tfidf_df, tfidf_columns), (w2v_df, w2v_columns), (lstm_df, lstm_columns))
    return pd.DataFrame({
        column: source[column].tolist()
        for source, columns in sources
        for column in columns
    })


train = combine_model_outputs(train_tfidf, train_w2v, train_lstm)
test = combine_model_outputs(test_tfidf, test_w2v, test_lstm)

In [None]:
# Preview the first rows of the merged training frame.
train.head()

In [None]:
# Preview the first rows of the merged testing frame.
test.head()

## Logistic regression

In [53]:
# CREATE ARRAYS FOR TRAINING AND TESTING

x_train = train[['prob_tfidf_nb','prob_tfidf_sgd','prob_tfidf_rf','prob_tfidf_xgb','prob_word2vec_sgd','prob_word2vec_xgb','pred_lstm']]
y_train = train['label']

x_test = test[['prob_tfidf_nb','prob_tfidf_sgd','prob_tfidf_rf','prob_tfidf_xgb','prob_word2vec_sgd','prob_word2vec_xgb','pred_lstm']]
y_test = test['label']

In [54]:
# Pin the solver explicitly. The recorded run used the library default of the
# time (shown as solver='warn' in the fitted-model repr, i.e. liblinear);
# newer scikit-learn releases default to lbfgs, so leaving it implicit makes
# the fitted model depend on the installed version.
model = LogisticRegression(solver='liblinear')

In [55]:
# Fit the logistic regression model: the inputs are the base-model outputs in
# x_train, the target is the `label` column in y_train.
model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [56]:
# Predicted class labels for the testing set.
y_pred = model.predict(x_test)

In [57]:
# Per-class precision, recall, F1 and support of the predictions on the testing set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.79      0.87       888
           1       0.35      0.79      0.49       129

   micro avg       0.79      0.79      0.79      1017
   macro avg       0.66      0.79      0.68      1017
weighted avg       0.89      0.79      0.82      1017



## Get probability scores and save to Excel sheet

In [58]:
# Class probabilities from the fitted model; each result has one column per class.
y_prob = model.predict_proba(x_test) # probability output by the regression model for the testing set
y_prob_train = model.predict_proba(x_train) # probability output by the regression model for the training set

In [59]:
# Display the testing-set probabilities (one row per sample, one column per class).
y_prob

array([[0.83635734, 0.16364266],
       [0.64713839, 0.35286161],
       [0.03777321, 0.96222679],
       ...,
       [0.00933025, 0.99066975],
       [0.9667022 , 0.0332978 ],
       [0.9121983 , 0.0878017 ]])

In [60]:
# Combine results to dataframe
test['logreg_prob']=y_prob[:,1].tolist()
train['logreg_prob']=y_prob_train[:,1].tolist()

In [61]:
# Save the train and test frames (including `logreg_prob`) to the results
# workbook, one worksheet per frame, named after the selected fold.
output_path = Path('./logistic_k_fold.xlsx')

# Append to the workbook when it already exists so the sheets written for
# other folds are kept, replacing only this fold's sheets. Create the workbook
# on the first run. `if_sheet_exists` is only accepted in append mode.
if output_path.exists():
    writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
else:
    writer_options = {'mode': 'w'}

# The context manager saves and closes the file even if a write fails. It
# replaces the `writer.book = ...` / `writer.save()` pattern, which current
# pandas releases no longer support.
with pd.ExcelWriter(output_path, engine='openpyxl', **writer_options) as writer:
    train.to_excel(writer, sheet_name=train_worksheet, index=False)
    test.to_excel(writer, sheet_name=test_worksheet, index=False)