In [1]:
import pandas as pd
import pickle
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

# Load in data and model. 

## Text from our medical documents

In [2]:
#tf-idf vectors
fulldocs = pd.read_csv("fulldocs.csv")

In [3]:
fulldocs

Unnamed: 0,docid,fulltext,docid.1,caseid,label
0,1.0,Document Text: Example 1\nReferring Doctor: Un...,1,-1,1.0
1,2.0,Document Text: Example 2\nProgress Notes\nDate...,2,-1,1.0
2,3.0,Document Text: Example 3\nChief Complaint:\n1....,3,-1,4.0
3,4.0,Document Text: 14\nse\nLAIDA\n*\nPatient Infor...,4,11594,1.0
4,5.0,Document Text: The first set of documents behi...,5,11594,1.0
...,...,...,...,...,...
142,145.0,Document Text: FAX COVER SHEET\nDATE:\nTO:\nPH...,145,207766,2.0
143,146.0,Document Text: Page 1/20\nPatient Profile\nRes...,146,207813,2.0
144,147.0,Document Text: A SPECIALTY INFUSION COMPANY\n....,147,207873,2.0
145,129.0,Document Text: PM\nFROM: Fax\nPAGE: 001 OF 021...,129,206600,1.0


## Important words

In [4]:
with open('important_words.txt', 'r', encoding="utf8") as file:
  data = file.read()
  words = data.split('\n')
  words = list(set(words))

# Prepping tf-idf

In [5]:
corpus_dict = {}
corpus = []
tags = []
for i, row in fulldocs.iterrows():
  corpus.append(row['fulltext'])
  tags.append(row['label'])
  corpus_dict[i] = (row['fulltext'], row['label'])

corpus_np = np.array(corpus)
tags_np = np.array(tags)

vectorizer = TfidfVectorizer(ngram_range=(1,2))
test_docs = vectorizer.fit_transform(corpus)
vocabulary = vectorizer.get_feature_names()
final_vocab = []

# get overlapping words with relevant words and word vectors made from the document
for vocab in vocabulary:
  splits = vocab.split()
  for w in words:
    if w in splits:
      final_vocab.append(vocab)
      break

for i in range(len(tags)):
  if tags[i] == 4:
    tags[i] = 3.0
  elif tags[i] == 0:
    tags[i] = 2.0

# Training our logistic regression using k-folds

In [6]:
skf = StratifiedKFold(n_splits=6, shuffle=True, random_state=1) # using 6 kfolds
curr_train = []
curr_test = []
curr_f1_train = []
curr_f1_test = []
curr_f1_macro_train = []
curr_f1_macro_test = []
    
for train_index, test_index in skf.split(corpus, tags):
    # split corpus into train and test
    features_train, features_test = corpus_np[train_index], corpus_np[test_index]
    # split tags into train and test
    labels_train, labels_test = tags_np[train_index], tags_np[test_index]
    # train tfidf vectorizer
    features_train = vectorizer.fit_transform(features_train)
    features_test = vectorizer.transform(features_test)

    vocabulary = vectorizer.get_feature_names()
    final_vocab = []

    # get overlapping words with relevant words and word vectors made from the document
    for vocab in vocabulary:
        splits = vocab.split()
        for w in words:
            if w in splits:
                final_vocab.append(vocab)
                break

    # get the train vector matrix
    train = pd.DataFrame(data=features_train.toarray(), columns=vocabulary)

    # get the test vector matrix
    test = pd.DataFrame(data=features_test.toarray(), columns=vocabulary)

    curr_model = LogisticRegression(penalty = 'l1', solver = 'saga', C=10).fit(train[final_vocab], labels_train)

    labels_pred = curr_model.predict(test[final_vocab])
    labels_train_pred = curr_model.predict(train[final_vocab])

    curr_train.append(accuracy_score(labels_train, labels_train_pred))
    curr_test.append(accuracy_score(labels_test, labels_pred))

    curr_f1_train.append(f1_score(labels_train, labels_train_pred, average='weighted'))
    curr_f1_test.append(f1_score(labels_test, labels_pred, average='weighted'))

    curr_f1_macro_train.append(f1_score(labels_train, labels_train_pred, average='macro'))
    curr_f1_macro_test.append(f1_score(labels_test, labels_pred, average='macro'))



## Print out accuracy scores and f1-scores

In [7]:
print("Train error: ", np.mean(curr_train))
print("Test error: ", np.mean(curr_test))
print("Train weighted f1: ", np.mean(curr_f1_train))
print("Test weighted f1: ", np.mean(curr_f1_test))
print("Train macro f1: ", np.mean(curr_f1_macro_train))
print("Test macro f1: ", np.mean(curr_f1_macro_test))

Train error:  0.5468701408325559
Test error:  0.4694444444444445
Train weighted f1:  0.43253392842807736
Test weighted f1:  0.31457743457743453
Train macro f1:  0.2858784510300954
Test macro f1:  0.1336136136136136
