In [1]:
# install hmmlearn library for training Hidden Markhov Model
!pip install hmmlearn==0.2.6

Collecting hmmlearn==0.2.6
  Downloading hmmlearn-0.2.6.tar.gz (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.2/155.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: hmmlearn
  Building wheel for hmmlearn (setup.py) ... [?25l[?25hdone
  Created wheel for hmmlearn: filename=hmmlearn-0.2.6-cp310-cp310-linux_x86_64.whl size=464581 sha256=ff4cbe1eed7c30eeca9176617fa7b20eb015cde748eb4f6f0e7b85c8938bfd00
  Stored in directory: /root/.cache/pip/wheels/9a/9c/0d/ad94b4e1c2388b051cf78a0207f033b08b2c7d15ede782b431
Successfully built hmmlearn
Installing collected packages: hmmlearn
Successfully installed hmmlearn-0.2.6


In [2]:
# install crfsuite library for training Conditional Random Fields Classifier
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn-crfsuite-0.3.6


In [3]:
# import dependencies
import pandas as pd
import string
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from itertools import chain

import nltk
import sklearn
import scipy.stats

import sklearn_crfsuite
from sklearn_crfsuite import scorers, CRF
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords
nltk.download('stopwords')

from typing import List

from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# this notebook was run on Google Colab, so data must be pulled from Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# file paths to data on personal Google Drive
suffix = 'drive/MyDrive/medical_dataset_analysis/'
beth_file_directory = 'concept_assertion_relation_training_data/beth/txt/'
beth_concept_directory = 'concept_assertion_relation_training_data/beth/concept/'
partner_file_directory = 'concept_assertion_relation_training_data/partners/txt/'
partner_concept_directory = 'concept_assertion_relation_training_data/partners/concept/'

In [7]:
# function that loads all of the the text and annotations from the folders
def getNotes(file_directory):
    text_dict = {}
    file_list = os.listdir(suffix+file_directory)
    for f in file_list:
        if f[-3:] != 'xml' and f[-3:] != 'txt' and f[-3:] != 'con':
            file_list.remove(f)
    if '.DS_Store' in file_list:
        file_list.remove('.DS_Store')
    for file_name in file_list:
        with open(suffix+file_directory + file_name,'r') as file:
            data = file.read()
            text_dict[file_name[:-4]] = data
        file.close()
    return text_dict

In [8]:
beth_notes = getNotes(beth_file_directory)
beth_concepts = getNotes(beth_concept_directory)
partners_notes = getNotes(partner_file_directory)
partners_concepts = getNotes(partner_concept_directory)

all_notes = beth_notes.copy()
all_notes.update(partners_notes)
all_concepts = beth_concepts.copy()
all_concepts.update(partners_concepts)

In [9]:
# need to manually align the text with the annotations and then store in a dictionary
start_tag = 'B'
inner_tag = 'I'
null_tag = 'O'

data = {}

for record in all_concepts.keys():
    data[record] = {'words':[],'tags':[]}
    ns = all_notes[record]
    cs = all_concepts[record]
    lines = ns.split('\n')
    concept_lines = cs.split('\n')[:-1]

    lines_with_concepts = {}

    for concept_index, concept_line in enumerate(concept_lines):
        note_line = re.findall('[0-9]{1,3}:',concept_line)[0][:-1]
        word_nums = list(map(lambda x: int(x[1:]),re.findall(':[0-9]{1,3}',concept_line)))
        lines_with_concepts[(note_line,str(concept_index))] = word_nums

    for tup, word_nums in lines_with_concepts.items():

        insert_dict = {}

        notes_index, concepts_index = tup

        the_words = lines[int(notes_index)-1].split(' ')
        tags = [null_tag]*len(the_words)
        for word_num in range(word_nums[0],word_nums[-1]+1):
            tag_type = re.findall(r't=\"[a-z]+\"',concept_lines[int(concepts_index)])[0][3:-1]
            if word_num == min(word_nums) and len(word_nums)<3:
                tags[word_num] = start_tag + '-' + tag_type
            else:
                if len(word_nums) < 3:
                    tags[word_num] = inner_tag + '-' + tag_type
        data[record]['words'] += the_words
        data[record]['tags'] +=  tags

In [10]:
# store dictionary in dataframe
# have to transpose the dataframe
df = pd.DataFrame(data)
df_trans = df.transpose()

In [11]:
# reorganizing the data so that it matches the format needed for HMM code
all_records = []
all_words = []
all_tags = []
for record in df_trans.index:
  data = df_trans.loc[record]
  words = data['words']
  tags = data['tags']
  pairs = zip(words,tags)
  for word, tag in pairs:
    all_records.append(record)
    all_words.append(word)
    all_tags.append(tag)

In [None]:
# new_df is the dataframe containing all the data in the required format
new_df = pd.DataFrame({'records':all_records,'words':all_words,'tags':all_tags})
new_df.head()

# **Hidden Markhov Model**

In [13]:
# the next 10 cells are directly sourced from a kaggle tutorial: https://www.kaggle.com/code/annsanababy/hidden-markov-model-hmm-on-ner-dataset
X = new_df.drop(columns= ['tags'], axis=1)
y = new_df.drop(columns= ['words'], axis=1)

In [14]:
# GroupShuffleSplit is used so that the records are randomized rather than each row in the dataframe being randomized
gs = GroupShuffleSplit(n_splits=2, test_size=.1, random_state=42)
train_ix, test_ix = next(gs.split(X, y, groups=new_df['records']))

data_train = new_df.loc[train_ix]
data_test = new_df.loc[test_ix]

In [15]:
# basic preprocessing. Note that more preprocessing could be performed, e.g. removing redundant formatting in the discharge notes
def pre_processing(text_column):
    # lowercase all text in the column
    text_column = text_column.str.lower()

    # replacing numbers with NUM token
    text_column = text_column.str.replace(r'\d+', 'NUM')

    # removing stopwords
    stop_words = set(stopwords.words('english'))
    text_column = text_column.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    return text_column

In [16]:
# Markhov models calculate conditional probabilities according to Bayes' Theorem, which can lead to undefined probabilities since never-before-seen words will have a probability of zero. Using an "UNKNOWN" tag prevents this
dfupdate = data_train.sample(frac=.15, replace=False, random_state=42)
dfupdate.words = 'UNKNOWN'
data_train.update(dfupdate)
tags = list(set(data_train.tags.values))
words = list(set(data_train.words.values))
# Convert words and tags into numbers
word2id = {w: i for i, w in enumerate(words)}
tag2id = {t: i for i, t in enumerate(tags)}
id2tag = {i: t for i, t in enumerate(tags)}
len(tags), len(words)

(7, 10615)

In [17]:
# now we construct our transition matrix by first counting the frequency of words
from tqdm import tqdm
import numpy as np
count_tags = dict(data_train.tags.value_counts())  # Total number of POS tags in the dataset
# Now let's create the tags to words count
count_tags_to_words = data_train.groupby(['tags']).apply(
    lambda grp: grp.groupby('words')['tags'].count().to_dict()).to_dict()
# We shall also collect the counts for the first tags in the sentence
count_init_tags = dict(data_train.groupby('records').first().tags.value_counts())

# Create a mapping that stores the frequency of transitions in tags to it's next tags
count_tags_to_next_tags = np.zeros((len(tags), len(tags)), dtype=int)
sentences = list(data_train.records)
pos = list(data_train.tags)
for i in tqdm(range(len(sentences)), position=0, leave=True):
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        prevtagid = tag2id[pos[i - 1]]
        nexttagid = tag2id[pos[i]]
        count_tags_to_next_tags[prevtagid][nexttagid] += 1

100%|██████████| 304358/304358 [00:00<00:00, 685373.58it/s]


In [18]:
# now we calculate the transition probabilities
startprob = np.zeros((len(tags),))
transmat = np.zeros((len(tags), len(tags)))
emissionprob = np.zeros((len(tags), len(words)))
num_sentences = sum(count_init_tags.values())
sum_tags_to_next_tags = np.sum(count_tags_to_next_tags, axis=1)
for tag, tagid in tqdm(tag2id.items(), position=0, leave=True):
    floatCountTag = float(count_tags.get(tag, 0))
    startprob[tagid] = count_init_tags.get(tag, 0) / num_sentences
    for word, wordid in word2id.items():
        emissionprob[tagid][wordid] = count_tags_to_words.get(tag, {}).get(word, 0) / floatCountTag
    for tag2, tagid2 in tag2id.items():
        transmat[tagid][tagid2] = count_tags_to_next_tags[tagid][tagid2] / sum_tags_to_next_tags[tagid]

100%|██████████| 7/7 [00:00<00:00, 24.13it/s]


In [19]:
#to create word transition matrix

#first step is to count the number of times each word appears in the dataset
count_words = {}
for word in data_train.words.values:
    count_words[word] = count_words.get(word, 0) + 1

# then count the number of times a word appears after another word
count_word_transitions = {}
for sentence in data_train.groupby('records'):
    words = sentence[1]['words'].values
    for i in range(len(words) - 1):
        w1, w2 = words[i], words[i+1]
        if w1 not in count_word_transitions:
            count_word_transitions[w1] = {}
        count_word_transitions[w1][w2] = count_word_transitions[w1].get(w2, 0) + 1

# convert the counts to probabilities
word_transition_matrix = np.zeros((len(word2id)+1, len(word2id)+1))
sum_words_to_next_words = np.sum([count_word_transitions[w1][w2] for w1 in count_word_transitions for w2 in count_word_transitions[w1]])
for w1, w1id in word2id.items():
    for w2, w2id in word2id.items():
        word_transition_matrix[w1id][w2id] = count_word_transitions.get(w1, {}).get(w2, 0) / sum_words_to_next_words
print(word_transition_matrix.shape)

(10616, 10616)


In [20]:
# label new words as "UNKNOWN" to avoid an undefined probability
data_test.loc[~data_test['words'].isin(words), 'words'] = 'UNKNOWN'
word_test = list(data_test.words)
samples = []
for i, val in enumerate(word_test):
    samples.append([word2id[val]])

lengths = []
count = 0
sentences = list(data_test.records)
for i in tqdm(range(len(sentences)), position=0, leave=True):
    if (i > 0) and (sentences[i] == sentences[i - 1]):
        count += 1
    elif i > 0:
        lengths.append(count)
        count = 1
    else:
        count = 1

100%|██████████| 33872/33872 [00:00<00:00, 778505.48it/s]


In [22]:
# we use the Viterbi algorithm to efficiently find the most probable sequence of concepts from the transition matrix

import hmmlearn
from hmmlearn import hmm

model = hmm.MultinomialHMM(n_components=len(tags), algorithm='viterbi')
model.startprob_ = startprob
model.transmat_ = transmat
model.emissionprob_ = emissionprob

In [31]:
concept_predict = model.predict(samples, lengths)
concept_predict

array([4, 4, 4, ..., 4, 4, 4], dtype=int32)

In [32]:
pos_actual = data_test['tags'].apply(lambda x: tag2id[x])

In [33]:
classification_report_hmm = classification_report(pos_actual[:len(concept_predict)],concept_predict)
print("--Results for HMM--")
print(classification_report_hmm)

--Results for HMM--
              precision    recall  f1-score   support

           0       0.41      0.03      0.05       469
           1       0.28      0.05      0.08      1044
           2       0.23      0.02      0.04       729
           3       0.19      0.01      0.02       343
           4       0.90      0.99      0.94     29496
           5       0.30      0.01      0.01       487
           6       0.44      0.02      0.03       437

    accuracy                           0.89     33005
   macro avg       0.39      0.16      0.17     33005
weighted avg       0.83      0.89      0.85     33005



# **Conditional Random Fields**

In [34]:
# the next 5 cells are directly sourced from a kaggle tutorial: https://www.kaggle.com/code/bavalpreet26/ner-using-crf
def word2features(sent, i):
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

In [35]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

In [36]:
sentences = list()
for record in df_trans.index:
  data = df_trans.loc[record]
  words = data['words']
  tags = data['tags']
  pairs = zip(words,tags)
  for word, tag in pairs:
    sentences.append([(word,tag)])

In [37]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [38]:
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [39]:
# I placed a try and except block because the original code would throw an AttributeError due to a field in the crf object that has no consequence to my model.
# I used the first 12000 tokens has my train set and the remaining 5000 as my test set
try:
  crf.fit(X[:12000],y[:12000])
except AttributeError:
  pass

pred = crf.predict(X[12000:])

classification_report_crf = classification_report(y[12000:],pred)
print("--Results for CRF--")
print(classification_report_crf)

--Results for CRF--
              precision    recall  f1-score   support

   B-problem       0.26      0.05      0.09      6802
      B-test       0.18      0.03      0.05      4444
 B-treatment       0.24      0.12      0.16      4613
   I-problem       0.22      0.05      0.09      9900
      I-test       0.18      0.02      0.03      3757
 I-treatment       0.20      0.05      0.08      3907
           O       0.90      0.98      0.94    292807

    accuracy                           0.89    326230
   macro avg       0.31      0.19      0.21    326230
weighted avg       0.83      0.89      0.85    326230



# **Conclusion**

Both models achieve F1 scores near 0.2, which is far below the benchmark of 0.7-0.9. For next steps, I will fine-tune a BERT model on the same task. This model should perform closer to the benchmark given it is state-of-the-art.