# Identifying Entities in Healthcare Data

In [35]:
# Importing Neccessary Libraries library
import re
import nltk
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd


In [36]:
# Reading in the Data 
with open('train_sent', 'r') as train_sent_file:
  train_sentences = train_sent_file.readlines() #====> train sentences 

with open('train_label', 'r') as train_labels_file:
  train_labels = train_labels_file.readlines() #====> train label

with open('test_sent', 'r') as test_sent_file:
  test_sentences = test_sent_file.readlines() #====> test sentences 

with open('test_label', 'r') as test_labels_file:
  test_labels = test_labels_file.readlines() #====> test Labels 

### Sentence and labels construction

In [37]:
# Defining patterns so that we can convert the given format into sentences using the rules giving for sentence extraction

patterns="^\n" 
pattern2="[^\n]*"

# Function to convert our raw input to sentences

def makesentence(text,kind): #====>Kind to specify 'ifis label' so no space is added between the character
    line=""
    sentence=[]
    for word in text:
        if re.match(patterns,word):
            sentence.append(line)
            line=""     
        else:
            newword=re.search(pattern2,word)
            if (line=="" or kind=="label"):
                line=line+newword.group(0)
            else:
                line=line+" "+newword.group(0)
    return sentence
        

In [38]:
# Calling the makesentence function to construct the sentences and label in the format we need
train_sent=makesentence(train_sentences,"sent") #---> Train Sentences from words
train_label=makesentence(train_labels,"label")  #---> Train Labels
test_sent=makesentence(test_sentences,"sent")   #---> Test Sentences from Words
test_label=makesentence(test_labels,"label")    #---> Test Labels

### Count the number of sentences in the processed train and test dataset and Count the number of lines of labels in the processed train and test dataset.

In [39]:
print("Training Set===>")
print("Number of Sentence in train set :",len(train_sent))
print("Number of Label rows in train set :",len(train_label))
print("Test Set===>")
print("Number of Sentence in test set :",len(test_sent))
print("Number of Label rows in train set :",len(test_label))

Training Set===>
Number of Sentence in train set : 2599
Number of Label rows in train set : 2599
Test Set===>
Number of Sentence in test set : 1056
Number of Label rows in train set : 1056


In [40]:
# We have 2599 sentences with 2599 rows of labels for train dataset and 1056 sentences and 1056 labels for test data set

### Printing 5 Sentences from Training Set along with their labels

In [41]:
# We will print out the sentence along with it's corresponding labels and the length of both. 
# A mismatch in len should indicate data extraction error

for i in range(0,2599,500): #----> Gap of 500 so we can sample the data across the set
    print("Sentence :",train_sent[i])
    print("Label :",train_label[i])
    print("Number of Words in the sentences are : ",len(train_sent[i].split()))
    print("Number of labels for this sentence is : ",len(train_label[i]))
    print("-------------------------------------------")

Sentence : All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Label : OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
Number of Words in the sentences are :  45
Number of labels for this sentence is :  45
-------------------------------------------
Sentence : RESULTS : Forty-four patients in the partner intrauterine insemination group and 37 in the donor insemination group were identified as having primary infertility
Label : OOOOOOOTTOOOOOTTOOOOOOD
Number of Words in the sentences are :  23
Number of labels for this sentence is :  23
-------------------------------------------
Sentence : The effect of epinephrine on immunoreactive insulin levels in man
Label : OOOOOOOOOO
Number of Words in the sentences are :  10
Number of labels for this sentence is :  10
-------------------------------------------
Sentence 

### Printing 5 Sentences from Test Set along with their labels

In [42]:
# We will print out the sentence along with it's corresponding labels and the length of both. 
# A mismatch in len should indicate data extraction error

for i in range(0,1056,200): #----> Gap of 200 so we can sample the data across the set
    print("Sentence :",test_sent[i])
    print("Label :",test_label[i])
    print("Number of Words in the sentences are : ",len(test_sent[i].split()))
    print("Number of labels for this sentence is : ",len(test_label[i]))
    print("-------------------------------------------")

Sentence : Furthermore , when all deliveries were analyzed , regardless of risk status but limited to gestational age > or = 36 weeks , the rates did not change ( 12.6 % , 280 of 2214 ; primary 9.2 % , 183 of 1994 )
Label : OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO
Number of Words in the sentences are :  44
Number of labels for this sentence is :  44
-------------------------------------------
Sentence : The study was terminated at 30 min if satisfactory analgesia was not achieved
Label : OOOOOOOOOOOOO
Number of Words in the sentences are :  13
Number of labels for this sentence is :  13
-------------------------------------------
Sentence : The measuring function of the first legs of Araneus diadematus Cl
Label : OOOOOOOOOOO
Number of Words in the sentences are :  11
Number of labels for this sentence is :  11
-------------------------------------------
Sentence : Drugs for the Third World
Label : OOOOO
Number of Words in the sentences are :  5
Number of labels for this sentence is

### Extract those tokens which have NOUN or PROPN as their PoS tag and find their frequency

### Combining train and test sets into a single dataset

In [43]:
# Combining train and test into one
full_sent=train_sent+test_sent
full_label=train_label+test_label
print("Total Number of Sentence in the full dataset :",len(full_sent))
print("Total Number of labels rows in the full dataset :",len(full_label))

Total Number of Sentence in the full dataset : 3655
Total Number of labels rows in the full dataset : 3655


### Error Check

In [44]:
# Checking to see if all labels are the same length as their respective sentences
error=0
for i in range(0,3655):
    if(len(full_sent[i].split())!=len(full_label[i])):
        error=error+1
if error==0:
    print("No sentences with mismatched labels found")
else:
    print("Number of Sentences with mismatched labels are :", error)

No sentences with mismatched labels found


### Checking for words marked as nouns and proper nouns
##### We will check once with whole set as it was given once  and once again with stopword removed so we will print 4 set of frequency
###### PLEASE NOTE:  Stop word removal is just for checking full dataset. It will not be implemented on either the train or test data set for modelling

In [45]:
# Loading Spacy parameters
model = spacy.load("en_core_web_sm",disable=['parser','ner','lemmatizer','textcat'])

In [46]:
#--- Function to count and display information for nouns and propn from the passed list
def countanddisplay(full_sent):
    NOUN=[] #-- Empty list to store words marked as nouns
    PROPN=[] # -- Empty list to store words marked as propn
    totalwords=0 #-- Counter to count number of words in the dataset
    for sent in full_sent:
        doc=model(sent)
        for tok in doc:
            totalwords += 1
            if tok.pos_=="NOUN":
                NOUN.append(tok.text.lower())
            if tok.pos_=="PROPN":
                PROPN.append(tok.text.lower())
    #Frequency of Nouns & PROPN
    print("--------------Noun--------------")
    print("The number of word marked as nouns is :",len(NOUN)) #-- Number of Nouns
    print("The number of unique nouns is :",len(set(NOUN)))    #-- Number of Unique Nouns
    print("The Percentage of words marked as nouns is  :",len(NOUN)/totalwords*100,"%") #-percentage of Nouns over the full dataset
    print("--------------PRONP--------------")
    print("The number of word marked as proper noun is :",len(PROPN)) #-- Number of P-Nouns
    print("The number of unique proper noun is :",len(set(PROPN))) #-- Number of unique P-Nouns
    print("The Percentage of words marked as proper noun is  :",len(PROPN)/totalwords*100,"%\n") #-Percentage og P-noun over full dataset
    # Top 25 words marked as Nouns
    print("Top 25 words marked as Nouns")
    NOUN=pd.Series(NOUN)
    print(NOUN.value_counts().head(25),"\n")
    # Top 25 words marked as Proper Nouns
    print("Top 25 words marked as Proper Nouns")
    PROPN=pd.Series(PROPN)
    print(PROPN.value_counts().head(25))
    

In [47]:
print("---------Full Dataset with no stopword removal---------")
countanddisplay(full_sent)

---------Full Dataset with no stopword removal---------
--------------Noun--------------
The number of word marked as nouns is : 20637
The number of unique nouns is : 4465
The Percentage of words marked as nouns is  : 30.39009233216016 %
--------------PRONP--------------
The number of word marked as proper noun is : 3739
The number of unique proper noun is : 1756
The Percentage of words marked as proper noun is  : 5.506059758198713 %

Top 25 words marked as Nouns
patients        507
treatment       303
%               247
cancer          204
therapy         177
study           161
disease         142
cell            141
lung            117
results         117
effects          99
group            94
gene             91
chemotherapy     89
effect           82
women            81
analysis         76
use              75
risk             74
surgery          73
cases            72
rate             68
response         66
children         65
survival         65
dtype: int64 

Top 25 words mark

In [48]:
## Checking with stopword removed
### Again note: Only for exploration, stopword will not be removed for training or test during modeling

In [49]:
from nltk.corpus import stopwords

def stopword(list_sent):
    for sent in list_sent:
        sent_word = sent.split()
        sent_word = [word for word in sent_word if word not in stopwords.words('english')]
        sent = " ".join(sent_word)
        stop_sent.append(sent)
    return(stop_sent)

stop_sent=[]
stopped=stopword(full_sent)


In [50]:
print("---------Full Dataset with  stopword removal---------")
countanddisplay(stopped)

---------Full Dataset with  stopword removal---------
--------------Noun--------------
The number of word marked as nouns is : 19755
The number of unique nouns is : 4382
The Percentage of words marked as nouns is  : 40.0004049648693 %
--------------PRONP--------------
The number of word marked as proper noun is : 4085
The number of unique proper noun is : 1977
The Percentage of words marked as proper noun is  : 8.271407455403244 %

Top 25 words marked as Nouns
patients        502
treatment       299
%               247
cancer          203
therapy         177
study           156
disease         142
cell            140
lung            115
results         107
effects          98
gene             91
group            90
chemotherapy     89
women            81
analysis         76
effect           74
risk             73
surgery          73
rate             68
cases            66
children         65
response         65
survival         65
p                62
dtype: int64 

Top 25 words marked 

In [None]:
# Conclusion : As expected stopword removal did not really affect the nouns and propernouns as they most likely would not 
# be removed using stopwords

## PRE - Model Building Steps
#### Get POS Tags
#### Create CRF features

In [51]:
# Function to take full list of sentence and than break them into sentences and than break those sentences into words
# and add pos tags to the words using nltk
def pos_tags(sentences):
        sentences = [nltk.word_tokenize(sent) for sent in sentences]
        sentences = [nltk.pos_tag(sent) for sent in sentences]
        return sentences


In [52]:
#Tag  both train and test dataset
train_sent_tagged=pos_tags(train_sent) #-- Get tagged train data
test_sent_tagged=pos_tags(test_sent) #-- Get tagged data

In [53]:
# Check output of pos_tags function
#For train dataset
train_sent_tagged[0]

[('All', 'DT'),
 ('live', 'JJ'),
 ('births', 'NNS'),
 ('>', 'VBP'),
 ('or', 'CC'),
 ('=', 'VBP'),
 ('23', 'CD'),
 ('weeks', 'NNS'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('University', 'NNP'),
 ('of', 'IN'),
 ('Vermont', 'NNP'),
 ('in', 'IN'),
 ('1995', 'CD'),
 ('(', '('),
 ('n', 'IN'),
 ('=', 'NNP'),
 ('2395', 'CD'),
 (')', ')'),
 ('were', 'VBD'),
 ('retrospectively', 'RB'),
 ('analyzed', 'VBN'),
 ('for', 'IN'),
 ('delivery', 'NN'),
 ('route', 'NN'),
 (',', ','),
 ('indication', 'NN'),
 ('for', 'IN'),
 ('cesarean', 'NN'),
 (',', ','),
 ('gestational', 'JJ'),
 ('age', 'NN'),
 (',', ','),
 ('parity', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('practice', 'NN'),
 ('group', 'NN'),
 ('(', '('),
 ('to', 'TO'),
 ('reflect', 'VB'),
 ('risk', 'NN'),
 ('status', 'NN'),
 (')', ')')]

In [54]:
# As we can see the sentences are being returned with words marked with their POS tags

In [55]:
#For test dataset
test_sent_tagged[0]

[('Furthermore', 'RB'),
 (',', ','),
 ('when', 'WRB'),
 ('all', 'DT'),
 ('deliveries', 'NNS'),
 ('were', 'VBD'),
 ('analyzed', 'VBN'),
 (',', ','),
 ('regardless', 'RB'),
 ('of', 'IN'),
 ('risk', 'NN'),
 ('status', 'NN'),
 ('but', 'CC'),
 ('limited', 'VBD'),
 ('to', 'TO'),
 ('gestational', 'JJ'),
 ('age', 'NN'),
 ('>', 'NN'),
 ('or', 'CC'),
 ('=', '$'),
 ('36', 'CD'),
 ('weeks', 'NNS'),
 (',', ','),
 ('the', 'DT'),
 ('rates', 'NNS'),
 ('did', 'VBD'),
 ('not', 'RB'),
 ('change', 'NN'),
 ('(', '('),
 ('12.6', 'CD'),
 ('%', 'NN'),
 (',', ','),
 ('280', 'CD'),
 ('of', 'IN'),
 ('2214', 'CD'),
 (';', ':'),
 ('primary', 'JJ'),
 ('9.2', 'CD'),
 ('%', 'NN'),
 (',', ','),
 ('183', 'CD'),
 ('of', 'IN'),
 ('1994', 'CD'),
 (')', ')')]

In [56]:
# As we can see the sentences are being returned with words marked with their POS tags

### Define function to create features for CRF

In [57]:
# from ==> sklearn-crfsuite Documentation release 0.3 by Mikhail Korobov

def word2features(sent, i):
    word = sent[i][0] # --- The word
    postag = sent[i][1] # --- The postag of the world
    
    features = {
        'bias': 0.75, #-- Expermimented with 1, 0.9,0,8,0.75 and 0.7 and 0.75 gave the highest f1 score
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
        
    }
    if i > 0: #--------> Previous Word 
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True #-----> If begining of sentence
        
    if i < len(sent)-1: #--------> Next Word
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True #-----> If end of sentence
                
    return features

### - Sends the sentence with the index of the word which has to be processed to the word2features function
def sent2features(sent): 
    return [word2features(sent, i) for i in range(len(sent))]

In [58]:
# Function to split labels as list===>
def getLabel(labels):
    return [label for label in labels]

In [59]:
# Calling sent2feature function on the pos tagged data to create features for CRF
# Calling getlabels to create label in the required format

X_train=[sent2features(s) for s in train_sent_tagged] #---- Train Data -X
Y_train=[getLabel(label) for label in train_label]   #---- Train Data -Y
X_test=[sent2features(s) for s in test_sent_tagged] #---- Test Data -X
Y_test=[getLabel(label) for label in test_label]   #---- Test Data -Y

In [60]:
# Lets check Features for the 1st sentence in the train dataset
X_train[0]

[{'bias': 0.75,
  'word.lower()': 'all',
  'word[-3:]': 'All',
  'word[-2:]': 'll',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'DT',
  'postag[:2]': 'DT',
  'BOS': True,
  '+1:word.lower()': 'live',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.isdigit()': False,
  '+1:postag': 'JJ',
  '+1:postag[:2]': 'JJ'},
 {'bias': 0.75,
  'word.lower()': 'live',
  'word[-3:]': 'ive',
  'word[-2:]': 've',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'JJ',
  'postag[:2]': 'JJ',
  '-1:word.lower()': 'all',
  '-1:word.istitle()': True,
  '-1:word.isupper()': False,
  '-1:word.isdigit()': False,
  '-1:postag': 'DT',
  '-1:postag[:2]': 'DT',
  '+1:word.lower()': 'births',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:word.isdigit()': False,
  '+1:postag': 'NNS',
  '+1:postag[:2]': 'NN'},
 {'bias': 0.75,
  'word.lower()': 'births',
  'word[-3:]': 'ths',
  'word[-2:

In [61]:
# Lets check if EOS and BOS are correctly marked
#BOS - 1st word of the 1st sentence
print("-----First Word - to check if BOS tag is present-----\n")
print(X_train[0][0],"\n")
#EOS - last word of the 1st sentence
print("-----Last Word - to check if EOS tag is present-----\n")
print(X_train[0][-1],"\n")
print("-----Any other word - to check that both EOS and BOS are not present-----\n")
print(X_train[0][2])


-----First Word - to check if BOS tag is present-----

{'bias': 0.75, 'word.lower()': 'all', 'word[-3:]': 'All', 'word[-2:]': 'll', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'DT', 'postag[:2]': 'DT', 'BOS': True, '+1:word.lower()': 'live', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:word.isdigit()': False, '+1:postag': 'JJ', '+1:postag[:2]': 'JJ'} 

-----Last Word - to check if EOS tag is present-----

{'bias': 0.75, 'word.lower()': ')', 'word[-3:]': ')', 'word[-2:]': ')', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': ')', 'postag[:2]': ')', '-1:word.lower()': 'status', '-1:word.istitle()': False, '-1:word.isupper()': False, '-1:word.isdigit()': False, '-1:postag': 'NN', '-1:postag[:2]': 'NN', 'EOS': True} 

-----Any other word - to check that both EOS and BOS are not present-----

{'bias': 0.75, 'word.lower()': 'births', 'word[-3:]': 'ths', 'word[-2:]': 'hs', 'word.isupper()': False, 'word

In [62]:
# We are getting features and we can see the tags are also correctly added

#### Lets check the label for the same

In [63]:
print(Y_train[0])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [64]:
### We are getting the feature value and the proper label as expected!!

### DATA ERROR 
 During the first instance of model fit, the fit failed. Upon further investigation it was found that there was a length mismatch between label and sentences. So before we go back to that we will try and find the issue

In [65]:
## We had checked the data before POS tagging and it was alright. The assumption is that POS tagging could have caused 
## some error
## We will manually check the mismatched sentences to see if we can find the issue

In [66]:
## We will check the length of sentences against their labels and find the sentences that have error
def errorcheck(x,y): #--- x is the sentences and y is the labels
    error=0 #-- Counter to see how many errors are there
    print("-------Sentences with Mismatched Labels-------")
    for i in range(0,len(x)):
        if(len(x[i])!=len(y[i])):
            error=error+1
            print("Index of sentence :",i,"---Length of Sentence :",len(x[i]),"---Length of Labels :",len(y[i]))
    if error==0:
        print("No sentences with mismatched labels found")
    else:
        print("Number of Sentences with mismatched labels are :", error)

In [67]:
errorcheck(X_train,Y_train)

-------Sentences with Mismatched Labels-------
Index of sentence : 85 ---Length of Sentence : 38 ---Length of Labels : 32
Index of sentence : 752 ---Length of Sentence : 56 ---Length of Labels : 55
Index of sentence : 938 ---Length of Sentence : 13 ---Length of Labels : 12
Index of sentence : 997 ---Length of Sentence : 15 ---Length of Labels : 14
Number of Sentences with mismatched labels are : 4


In [68]:
# Checking what went wrong
baddata=[85,752,938,997] #---- > Index of sentences with errors obtained from error check
for num in baddata:
    print("Original Sentence===>\n")
    print(train_sent[num],"\n")
    print("Sentence after Tagging===>\n")
    print(train_sent_tagged[num],"\n")
    print("Label===>\n")
    print(train_label[num],"\n")
    print("Length of Label===>\n")
    print(len(train_label[num]),"\n")
    print("Length of Original Sentence===>\n")
    print(len(train_sent[num].split()),"\n")
    print("Length of Tagged Sentence===>\n")
    print(len(train_sent_tagged[num]))
    print("----------------------------")



Original Sentence===>

RESULTS : The HLA effect is due to the class II DR2 haplotype , DRB1*1501-DQA1*0102-DRB1*0602 ; contributions to MS susceptibility from additional DRB1-DQB1 alleles or other HLA region loci were not observed 

Sentence after Tagging===>

[('RESULTS', 'NN'), (':', ':'), ('The', 'DT'), ('HLA', 'NNP'), ('effect', 'NN'), ('is', 'VBZ'), ('due', 'JJ'), ('to', 'TO'), ('the', 'DT'), ('class', 'NN'), ('II', 'NNP'), ('DR2', 'NNP'), ('haplotype', 'NN'), (',', ','), ('DRB1', 'NNP'), ('*', 'NNP'), ('1501-DQA1', 'JJ'), ('*', 'NNP'), ('0102-DRB1', 'CD'), ('*', 'NN'), ('0602', 'CD'), (';', ':'), ('contributions', 'NNS'), ('to', 'TO'), ('MS', 'NNP'), ('susceptibility', 'NN'), ('from', 'IN'), ('additional', 'JJ'), ('DRB1-DQB1', 'NNP'), ('alleles', 'NNS'), ('or', 'CC'), ('other', 'JJ'), ('HLA', 'NNP'), ('region', 'NN'), ('loci', 'NN'), ('were', 'VBD'), ('not', 'RB'), ('observed', 'VBN')] 

Label===>

OOOOOOOOOOOOOOOOOOOOOOOOOOOOOOOO 

Length of Label===>

32 

Length of Original Se

In [69]:
# As we can see from above, these sentences have been tagged incorrectly example 'DRB1*1501-DQA1*0102-DRB1*0602' has
# a single label in our data but pos tagging has broken it into multiple parts. Also L. which is considered a single 
# word as per label as been broken into 2

## Given that the mismatched error is only on 4 sentences, we will remove them

In [70]:
# Size of train before removal
print(len(X_train))
print(len(Y_train))

2599
2599


In [71]:
#Removing mismatched sentences
for i in range(85,997): #-- Because the indexes lie between these values
        if(len(X_train[i])!=len(Y_train[i])): 
            del X_train[i]
            del Y_train[i]

In [72]:
# Size of train after removal
print(len(X_train))
print(len(Y_train))

2595
2595


In [73]:
#Lets check to see if the error is eliminated in Train Data
errorcheck(X_train,Y_train)

-------Sentences with Mismatched Labels-------
No sentences with mismatched labels found


In [74]:
# Lets check on test data too
errorcheck(X_test,Y_test)

-------Sentences with Mismatched Labels-------
Index of sentence : 24 ---Length of Sentence : 27 ---Length of Labels : 25
Number of Sentences with mismatched labels are : 1


In [75]:
# Because we have only 1 bad data, we will remove it
# because we will use test data for building the dictionary later too, we are removing it from all list including 
# original test_sent and test_sent_tagged
del X_test[24]
del Y_test[24]
del test_sent[24]
del test_sent_tagged[24]

In [76]:
#Lets check to see if the error is eliminated in Test Data
errorcheck(X_test,Y_test)

-------Sentences with Mismatched Labels-------
No sentences with mismatched labels found


## Creating the CRF Model

In [77]:
# Defining Model Parameters
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1, # coefficient for L1 regularization
    c2=0.1, # coefficient for L2 regularization
    max_iterations=500,
    all_possible_transitions=True
)

In [78]:
crf.fit(X_train, Y_train)

In [79]:
# Model Prediction and Evaluation
Y_pred = crf.predict(X_test) #-- Predict using the model above
print("F1 Score for this model is:")
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

F1 Score for this model is:


0.9255884062060182

### Hyper-Parameter Tuning

In [80]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

### Grid search

In [81]:
# GridSearch 

crf = sklearn_crfsuite.CRF(max_iterations=500, all_possible_transitions=True)
params_space = {
    "c1": [0,0.01,0.05,0.1, 0.25,0.5,0.6,0.7,0.8,0.9,1],
    "c2": [0,0.01,0.05,0.1, 0.25,0.5,0.6,0.7,0.8,0.9,1]
}

# Using F1 score as the scorer
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted')

# search

grid_search = GridSearchCV(estimator=crf,
                           param_grid=params_space,
                           cv=3,
                           n_jobs=-1, verbose=1,scoring=f1_scorer)

grid_search.fit(X_train, Y_train)
print('best params:', grid_search.best_params_)
print('best CV score:', grid_search.best_score_)

Fitting 3 folds for each of 121 candidates, totalling 363 fits
best params: {'c1': 0.25, 'c2': 0}
best CV score: 0.875601681374312


In [82]:
Y_pred_GCV = grid_search.best_estimator_.predict(X_test)
print("F1 Score for the best model from Gridsearch is(On Test Data):")
metrics.flat_f1_score(Y_test, Y_pred_GCV, average='weighted')

F1 Score for the best model from Gridsearch is(On Test Data):


0.9270757731516192

### Random Search

In [86]:
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# Using F1 score as the scorer
f1_scorer = make_scorer(metrics.flat_f1_score, 
                        average='weighted')

# search
rs = RandomizedSearchCV(crf, params_space, 
                        cv=3, 
                        verbose=1, 
                        n_jobs=-1, 
                        n_iter=100, 
                        scoring=f1_scorer)
rs.fit(X_train, Y_train)
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
best params: {'c1': 0.2382696683317979, 'c2': 0.005693003141979298}
best CV score: 0.8759733013135981


In [87]:
Y_pred_RCV = rs.best_estimator_.predict(X_test)
print("F1 Score for the best model from Randomsearch is(On Test Data):")
metrics.flat_f1_score(Y_test, Y_pred_RCV, average='weighted')

F1 Score for the best model from Randomsearch is(On Test Data):


0.9273203246324682

In [89]:
# We got the highest F1 score from the randomsearch CV where L1 and L2 were :
# 'c1': 0.2382696683317979, 'c2': 0.005693003141979298
### we will use this as the final model###

### Best F1 Score on test data was from the Random Search Model and the F1 for that was 0.9273203246324682

### Checking the predicted labels 

In [92]:
# Checking for the length of the the labels for 1st sentence in the test data
print("Length of Predicted Label",len(Y_pred_RCV[0]))
print("Length of Original Label",len(test_label[0]))

Length of Predicted Label 44
Length of Original Label 44


In [96]:
## Checking a random sentence from the test data to see the predicted and actual label
id = 935 ##-- Random index to check
print("Sentence:",test_sent[id])
print("Sentence:",len(test_sent[id].split()))
print("Orig Labels:", len(Y_test[id]))
print("Pred Labels:", len(Y_pred_RCV[id]))
print("Orig Labels:", (Y_test[id]))
print("Pred Labels:", (Y_pred_RCV[id]))

Sentence: He needed mechanical ventilation for severe hypoxemia , but recovered with glucocorticoid pulse therapy
Sentence: 14
Orig Labels: 14
Pred Labels: 14
Orig Labels: ['O', 'O', 'T', 'T', 'O', 'D', 'D', 'O', 'O', 'O', 'O', 'T', 'T', 'T']
Pred Labels: ['O', 'O', 'T', 'T', 'O', 'D', 'D', 'O', 'O', 'O', 'O', 'T', 'T', 'T']


In [94]:
## Looks like the model predicted labels are vey close to the actual labels

### Create the logic to get all the predicted treatments (T) labels corresponding to each disease (D) label in the test dataset.

### Explanation 
How -
Every Sentence was scanned to look for D and T tag and if both D & T tags were present that the D tags were concatenated to form the disease name and T tags were concatenated to find the treatment. These were than added as key-value pairs to a dictionary
Code ensured that if a disease was already present, than the just the treatment was added to existing values

### Qualifier

The disease names were also concatenated with a qualifier.
The choice was between prefix or an adjective. Prefix would have added a lot of unnecessary information so adjective was used.
This qualifier was added only if the adjective was found before the first D tag in a sentence


In [98]:
treat={} #--- Dictionary to store the Disease-treatment as key-value pair
for i in range(0,len(Y_pred_RCV)):
    if "D" and "T" in Y_pred_RCV[i]:
        words=test_sent[i].lower().split()
        disease=""
        treatment=""
        for j in range(0,len(Y_pred_RCV[i])):
            if((Y_pred_RCV[i][j])=="D"):
                if j==0:
                    disease=disease+" "+words[j]
                else:

                    if(test_sent_tagged[i][j-1][1]=="JJ" and (Y_pred_RCV[i][j-1])!="D" ):
                        disease=disease+" "+test_sent_tagged[i][j-1][0]+" "+words[j]
                    else:

                        disease=disease+" "+words[j]
            if((Y_pred_RCV[i][j])=="T"):
                treatment=treatment+" "+words[j]
        if(len(treatment)!=0 and len(disease)!=0):
            disease=disease.strip()
            treatment=treatment.strip()
            if disease in treat.keys():

                treat[disease].add(treatment)
            else:

                treat[disease]={treatment}


In [100]:
for key in treat:
    print(key,"====>>>>>>",treat[key])


macrosomic infants in gestational diabetes cases ====>>>>>> {'good glycemic control'}
hereditary retinoblastoma ====>>>>>> {'radiotherapy'}
epilepsy adhd ====>>>>>> {'methylphenidate'}
unstable angina or non-q-wave myocardial infarction ====>>>>>> {'roxithromycin'}
coronary-artery disease ====>>>>>> {'antichlamydial antibiotics'}
primary pulmonary hypertension ( pph ) ====>>>>>> {'fenfluramines'}
essential hypertension ====>>>>>> {'moxonidine'}
cellulitis ====>>>>>> {'g-csf therapy intravenous antibiotic treatment'}
foot infection in diabetic patients ====>>>>>> {'g-csf treatment'}
stroke hemorrhagic stroke ====>>>>>> {'double-bolus alteplase accelerated infusion of alteplase ( p=0.24'}
cardiac disease ====>>>>>> {'fenfluramine-phentermine'}
rheumatoid arthritis ====>>>>>> {'arthrodesis'}
early parkinson 's disease ====>>>>>> {'ropinirole monotherapy'}
sore throat ====>>>>>> {'antibiotics'}
crohn 's disease ====>>>>>> {'steroids'}
stress urinary incontinence ====>>>>>> {'surgery', 'sur

In [124]:
# Function to find treatment if disease is present in the dictionary
def treatmentfor(disease):
    try:
        disease=disease.lower()
        alltreat=treat[disease]
        print("The treatment(s) for",disease,"is/are\n")
        for treatments in alltreat:
            print(treatments)
    except:
        print("We currently do not have any treatment for",disease)

In [125]:
## Now lets see if we can predict the treatment by using the dictionary
# We will do this by looking up the value of the key (which here is the disease we want to find the treatment for)


#### Testing treatmentfor function

In [126]:
#Test Case 1- Single Treatment in dictionary
# Lets predict the treatment for hereditary retinoblastoma
treatmentfor("hereditary retinoblastoma")

The treatment(s) for hereditary retinoblastoma is/are

radiotherapy


In [127]:
#Test Case 2 - Multiple treatment in dictionary
# Lets predict the treatment for multiple sclerosis
treatmentfor("multiple sclerosis")

The treatment(s) for multiple sclerosis is/are

intravenous immunoglobulin treatment
interferon beta treatment


In [128]:
#Test Case 3 - Disease not in dictionary
# Lets predict the treatment for multiple sclerosis
treatmentfor("covid")

We currently do not have any treatment for covid


In [129]:
# Once again lets us print put the treatment for hereditary retinoblastoma
treatmentfor("hereditary retinoblastoma")

The treatment(s) for hereditary retinoblastoma is/are

radiotherapy
