In [1]:
import spacy
from sklearn_crfsuite import CRF, metrics

from collections import Counter
import numpy as np

model = spacy.load("en_core_web_sm")

## 1. Data Preprocessing

### Construct proper sentences from individual words and prints five sentences

In [2]:
def preprocess_file(file):
  f = open(file, 'r')
  file_lines = f.readlines() 
  f.close()

  processed_lines = [] 

  line_content = ""

  for word in file_lines:
    word = word.strip() 
    if word == "": 
      processed_lines.append(line_content)
      line_content = ""
    else:
      if line_content: 
        line_content += " "+word
      else:
        line_content = word 

  return processed_lines

In [3]:
train_sentences = preprocess_file('train_sent')
train_labels = preprocess_file('train_label')
test_sentences = preprocess_file('test_sent')
test_labels = preprocess_file('test_label')

In [4]:
# Print the 5 sentences from the processed dataset
for i in np.arange(0,5):
  print(i+1)
  print("Sentence:", train_sentences[i])
  print("Labels:", train_labels[i], "\n\n")

1
Sentence: All live births > or = 23 weeks at the University of Vermont in 1995 ( n = 2395 ) were retrospectively analyzed for delivery route , indication for cesarean , gestational age , parity , and practice group ( to reflect risk status )
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 


2
Sentence: The total cesarean rate was 14.4 % ( 344 of 2395 ) , and the primary rate was 11.4 % ( 244 of 2144 )
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O 


3
Sentence: Abnormal presentation was the most common indication ( 25.6 % , 88 of 344 )
Labels: O O O O O O O O O O O O O O O 


4
Sentence: The `` corrected '' cesarean rate ( maternal-fetal medicine and transported patients excluded ) was 12.4 % ( 273 of 2194 ) , and the `` corrected '' primary rate was 9.6 % ( 190 of 1975 )
Labels: O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O 


5
Sentence: Arrest of dilation was the most common indication in 

### Print the correct count of the number of sentences in the processed train and test dataset.

In [5]:
print("No. of sentences in processed train dataset:", len(train_sentences))
print("No. of sentences in processed test dataset:", len(test_sentences))

No. of sentences in processed train dataset: 2599
No. of sentences in processed test dataset: 1056


### Correctly count the number of lines of labels in the processed train and test dataset.

In [6]:
# The lengths of the four variables should match the below output
print("No. of labels in train dataset:", len(train_labels))
print("No. of labels in test dataset:", len(test_labels))

No. of labels in train dataset: 2599
No. of labels in test dataset: 1056


## 2. Concept Identification

### Use a toolkit like spaCy to extract those tokens that have NOUN or PROPN as their PoS tag and finds their frequency from the entire dataset that comprises both the train and the test datasets

In [7]:
concepts = {}

for sentences in (train_sentences, test_sentences):
  for sentence in sentences:
    processed_sentence = model(sentence) 
    #print(processed_sentence)
    for tag in processed_sentence:
      #print(tag.pos_)
      if(tag.pos_ == 'NOUN' or tag.pos_ == 'PROPN'): 
        #print(tag.pos_)
        concepts[tag.text] = concepts.get(tag.text, 0) + 1; 
        

In [8]:
concepts

{'births': 2,
 'weeks': 37,
 'University': 6,
 'Vermont': 1,
 'delivery': 20,
 'route': 2,
 'indication': 5,
 'age': 56,
 'parity': 4,
 'practice': 31,
 'group': 94,
 'risk': 71,
 'status': 17,
 'rate': 67,
 '%': 247,
 'presentation': 7,
 'medicine': 36,
 'patients': 492,
 'Arrest': 1,
 'dilation': 4,
 'subgroups': 4,
 'rates': 34,
 'care': 56,
 'hospitals': 10,
 'community': 15,
 'groups': 34,
 'trimester': 4,
 'fluid': 8,
 'index': 13,
 'AFI': 8,
 'temperature': 10,
 'increases': 8,
 'decrease': 5,
 'June': 2,
 'August': 6,
 'period': 28,
 'heat': 5,
 'women': 77,
 'singleton': 1,
 'pregnancies': 7,
 'gestation': 4,
 'testing': 25,
 'determinations': 2,
 'ambient': 1,
 'area': 11,
 'day': 36,
 'mean': 3,
 'test': 20,
 'date': 3,
 'Spearman': 1,
 'rank': 2,
 'Correlation': 3,
 'relationship': 12,
 'account': 4,
 'measure': 4,
 'well': 2,
 'being': 2,
 'study': 154,
 'population': 31,
 'diabetes': 31,
 'screening': 12,
 'clinic': 5,
 'Hadassah': 1,
 'Medical': 6,
 'Center': 3,
 'year':

### Print the top 25 most common tokens with NOUN or PROPN PoS tags for the entire dataset that comprises both the train and the test datasets

In [9]:
concept_counter = Counter(concepts)
concept_counter.most_common(25)

[('patients', 492),
 ('treatment', 281),
 ('%', 247),
 ('cancer', 200),
 ('therapy', 175),
 ('study', 154),
 ('disease', 142),
 ('cell', 140),
 ('lung', 116),
 ('group', 94),
 ('chemotherapy', 88),
 ('gene', 87),
 ('effects', 85),
 ('results', 79),
 ('women', 77),
 ('use', 74),
 ('TO_SEE', 74),
 ('risk', 71),
 ('cases', 71),
 ('surgery', 71),
 ('analysis', 70),
 ('rate', 67),
 ('response', 66),
 ('survival', 65),
 ('children', 64)]

## 3. Defining features for CRF

In [10]:
def getWordFeatures(sentence, pos, pos_tags):
  word = sentence[pos]

  #Define 12 features with PoS tag as one of the features
  features = [
    'word.lower=' + word.lower(), # serves as word id
    'word[-3:]=' + word[-3:],     # last three characters
    'word[-2:]=' + word[-2:],     # last two characters
    'word.isupper=%s' % word.isupper(),  # is the word in all uppercase
    'word.isdigit=%s' % word.isdigit(),  # is the word a number
    'word.startsWithCapital=%s' % word[0].isupper(), # is the word starting with a capital letter
    'word.pos=' + pos_tags[pos]
  ]

  #Use the previous word also while defining features
  if(pos > 0):
    prev_word = sentence[pos-1]
    features.extend([
    'prev_word.lower=' + prev_word.lower(), 
    'prev_word.isupper=%s' % prev_word.isupper(),
    'prev_word.isdigit=%s' % prev_word.isdigit(),
    'prev_word.startsWithCapital=%s' % prev_word[0].isupper(),
    'prev_word.pos=' + pos_tags[pos-1]
  ])
  # Mark the begining and the end words of a sentence correctly in the form of features.
  else:
    features.append('BEG') # feature to track begin of sentence 

  if(pos == len(sentence)-1):
    features.append('END') # feature to track end of sentence

  return features

## 4. Getting the features words and sentences

### Write the code to get the features' value of a sentence after defining the features in the previous step.

In [11]:

def getSentenceFeatures(sentence):
  
  processed_sentence = model(sentence) 
  
  pos_tags = [] 
  for token in processed_sentence:
    pos_tags.append(token.pos_)

  sentence_list = sentence.split() 
  return [getWordFeatures(sentence_list, pos, pos_tags) for pos in range(len(sentence_list))]

### Write the code to get a list of labels of a given preprocessed label line that you have created earlier.

In [12]:
def getLabelsInListForOneSentence(labels):
  return labels.split()

## 5. Define input and target variables


### Extract the features' values for each sentence as an input variable for the CRF model in the test and the train dataset.

In [13]:
X_train = [getSentenceFeatures(sentence) for sentence in train_sentences]
X_test = [getSentenceFeatures(sentence) for sentence in test_sentences]

### Extract the labels as the target variable for the test and the train dataset.

In [14]:
Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

## 6. Build the CRF Model

In [15]:
crf = CRF(max_iterations=100)

try:
    crf.fit(X_train, Y_train)
except AttributeError:
    pass

## 7. Evaluating the model

### Predict the labels of each of the tokens in each sentence of the test dataset that has been pre processed earlier.

In [16]:
Y_pred = crf.predict(X_test)

### Calculate the f1 score using the actual labels and the predicted labels of the test dataset.

In [17]:
metrics.flat_f1_score(Y_test, Y_pred, average='weighted')

0.9078368188374558

## 8. Identifying Diseases and Treatments using Custom NER

### Create the code or logic to get all the predicted treatments (T) labels corresponding to each disease (D) label in the test dataset

In [18]:
diseases_and_treatments =  {} # dictionary with disease as key an list of treatments as value

pred_len = len(Y_pred)

for i in range(pred_len): 
  labels = Y_pred[i]

  disease = "";
  treatment = "";
  
  for j in range(len(labels)): 
    if labels[j] == 'O': 
      continue

    if(labels[j] == 'D'): 
      disease += test_sentences[i].split()[j] + " "
      continue

    if(labels[j] == 'T'): 
      #print(test_sentences[i].split()[j])
      treatment += test_sentences[i].split()[j] + " "

  disease = disease.strip() 
  treatment = treatment.strip()

  if disease != "" and treatment != "":
    if disease not in diseases_and_treatments.keys():
      diseases_and_treatments[disease] = [treatment]
    else:
      treatment_list = diseases_and_treatments.get(disease)
      treatment_list.append(treatment)
      diseases_and_treatments[disease] = treatment_list 

### Predict the treatment for the disease name: 'hereditary retinoblastoma'

In [19]:
diseases_identified = list(diseases_and_treatments.keys())
diseases_identified[0]

'hereditary retinoblastoma'

In [20]:
print("Disease: ",diseases_identified[0])
print("Treatment:", diseases_and_treatments.get(diseases_identified[0]))

Disease:  hereditary retinoblastoma
Treatment: ['radiotherapy']
