# Machine-Learning Drug-Name Classifier
This notebook contains the machine-learning classifier for the AHLT course, UPC.

Author: Jake Watson, 22/03/2020

In [0]:
import xml.dom
from xml.dom.minidom import parse
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import string_span_tokenize
from nltk.tokenize.util import align_tokens
from google.colab import drive

import copy
import os
import sys

In [3]:
def install_pycrf():
  !pip install -q python-crfsuite
install_pycrf()
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.6" 2020-01-14
OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)


In [0]:
import pycrfsuite

In [5]:
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Both cells below do the same job, but have different data sources. Choose the top one to read only internal knowledge, and train the classifier on that. Choose the lower one to read both internal and external knowledge, and train the classifier on that.

In [0]:
entities, prefixes, suffixes, non_entities = read_saved()
tags, features = readFeatures('drive/My Drive/UPC/Semester2/features/features.txt')
train('drive/My Drive/UPC/Semester2/features/trained.crfsuite', features, tags, 1, 0.1, 50, True, True)

In [0]:
entities, prefixes, suffixes, non_entities = read_saved_large()
tags, features = readFeatures('drive/My Drive/UPC/Semester2/features/features_large.txt')
train('drive/My Drive/UPC/Semester2/features/trained_large.crfsuite', features, tags, 0.1, 0.001, 50, True, True)

The following two fields show instances of the evaluator output for the NERC function, for the 'Devel' and 'Test-NER' datasets.

In [295]:
nerc('drive/My Drive/UPC/Semester2/data/Test-NER/','task9.1_MLinternalTest_1.txt', entities, 'drive/My Drive/UPC/Semester2/features/trained.crfsuite')

Considering file 112/112: Terconazole.xmlGold drive/My Drive/UPC/Semester2/data/Test-NER/
Submission  task9.1_MLinternalTest_1.txt
Directory gold drive/My Drive/UPC/Semester2/data/Test-NER/
[transDirXMLToMapEntities] dir:drive/My Drive/UPC/Semester2/data/Test-NER/
log4j:WARN No appenders could be found for logger (org.castor.core.util.Configuration).
log4j:WARN Please initialize the log4j system properly.
 Gold standard saved in goldNER.txt

Gold loaded. Sentences=324, entities: 686
task9.1_MLinternalTest_1_scores.log created...
SCORES FOR THE GROUP: MLinternalTest RUN=1


Strict matching (boundaries + type)
cor	inc	par	mis	spu	total	prec	recall	F1
306	26	0	354	22	686	0.86	0.45	0.59




Exact matching
cor	inc	par	mis	spu	total	prec	recall	F1
313	19	0	354	22	686	0.88	0.46	0.6




Partial matching
cor	inc	par	mis	spu	total	prec	recall	F1
313	0	19	354	22	686	0.88	0.47	0.61




type matching
cor	inc	par	mis	spu	total	prec	recall	F1
320	12	0	354	22	686	0.9	0.47	0.62




SCORES FOR ENTITY TY

In [290]:
nerc('drive/My Drive/UPC/Semester2/data/Test-NER/','task9.1_MLexternalTest_1.txt', entities, 'drive/My Drive/UPC/Semester2/features/trained_large.crfsuite')

Considering file 112/112: Terconazole.xmlGold drive/My Drive/UPC/Semester2/data/Test-NER/
Submission  task9.1_MLexternalTest_1.txt
Directory gold drive/My Drive/UPC/Semester2/data/Test-NER/
[transDirXMLToMapEntities] dir:drive/My Drive/UPC/Semester2/data/Test-NER/
log4j:WARN No appenders could be found for logger (org.castor.core.util.Configuration).
log4j:WARN Please initialize the log4j system properly.
 Gold standard saved in goldNER.txt

Gold loaded. Sentences=324, entities: 686
task9.1_MLexternalTest_1_scores.log created...
SCORES FOR THE GROUP: MLexternalTest RUN=1


Strict matching (boundaries + type)
cor	inc	par	mis	spu	total	prec	recall	F1
362	80	0	244	44	686	0.74	0.53	0.62




Exact matching
cor	inc	par	mis	spu	total	prec	recall	F1
395	47	0	244	44	686	0.81	0.58	0.67




Partial matching
cor	inc	par	mis	spu	total	prec	recall	F1
395	0	47	244	44	686	0.81	0.61	0.7




type matching
cor	inc	par	mis	spu	total	prec	recall	F1
393	49	0	244	44	686	0.81	0.57	0.67




SCORES FOR ENTITY T

In [0]:
# Main function: parses the XML files, extracts the sentences, tokenizes them, labels each token, outputs the results, and evaluate the results.

def nerc(inputdir, outputfile, known_entities, trainfile):
  output = open(outputfile, "w+")
  count = 1
  n_files = len(os.listdir(inputdir))
  tagger = getTagger(trainfile)
  for fil in os.listdir(inputdir):
    sys.stdout.write("\rConsidering file " + str(count) + "/" + str(n_files) + ": " + str(fil))
    sys.stdout.flush()
    count += 1
    fil = open(str(inputdir) + str(fil))
    tree = parse(fil)
    fil.close()
    sentences = tree.getElementsByTagName("sentence")
    for sentence in sentences:
      sid = sentence.attributes["id"].value
      stext = sentence.attributes["text"].value
      tokens = tokenize(stext)
      classes = classify(tokens, known_entities, tagger)
      output_entities(sid, tokens, classes, output)
  output.close
  evaluate(inputdir, outputfile)

In [0]:
# Tokenizes the input text. 
# Returns the tokens, with their offsets from the beginning of the sentence.

def tokenize(input):
  s = input.replace('"', "'")
  tokens = TreebankWordTokenizer().tokenize(s)
  offsets = list(align_tokens(tokens, s))
  offsets = [tuple((i, j-1)) for i, j in offsets]
  output = [tuple((i, j)) for i, j in zip(tokens, offsets)]
  return output

In [0]:
# Given an input set of tokens, extracts a set of binary features.

def extract_features(sentence, entities):
  tokens = [sentence[i][0] for i in range(len(sentence))]
  features = []
  labels = sliding_window(sentence, entities)

  for i in range(len(tokens)):
    token = tokens[i]

    pre1 = token[:1]
    pre2 = token[:2]
    pre3 = token[:3]
    pre4 = token[:4]
    pre5 = token[:5]

    suf1 = token[-1:]
    suf2 = token[-2:]
    suf3 = token[-3:]
    suf4 = token[-4:]
    suf5 = token[-5:]

    caps = is_capitalised(token)
    contains_caps = has_capitals(token)
    numbers = has_numbers(token)
    dashes = has_dashes(token)

    known_type = labels[i]    

    prev = "BoS"
    nxt = "EoS"
    if (i > 0):
      prev = tokens[i-1]
    if (i < len(tokens)-1):
      nxt = tokens[i+1]
    
    vector = []
    vector.append("form="+token)
    vector.append("pre="+pre1) 
    vector.append("pre2="+pre2) 
    vector.append("pre3="+pre3)
    vector.append("pre4="+pre4)
    vector.append("pre5="+pre5)
    vector.append("suf1="+suf1)
    vector.append("suf2="+suf2)
    vector.append("suf3="+suf3)
    vector.append("suf4="+suf4)
    vector.append("suf5="+suf5)
    vector.append("caps="+str(caps))
    vector.append("has_caps="+str(contains_caps))
    vector.append("has_nums="+str(numbers))
    vector.append("has_dash="+str(dashes))
    vector.append("known_type="+str(known_type))
    vector.append("prev="+prev)
    vector.append("next="+nxt)

    features.append(vector)
  return features

In [0]:
# Set of functions used to extract feature vectors.

def is_capitalised(token):
  if token.isupper():
    return 1
  return 0

def has_capitals(token):
  if (token.isupper() == False) and any(x.isupper() for x in token):
    return 1
  else: 
    return 0

def has_dashes(token):
  if ("-" in token):
    return 1
  return 0

def has_numbers(token):
  if any(i.isdigit() for i in token):
    return 1
  return 0

def in_dictionary(token, dictionary):
  punctuations = '''![]{};'"\,<>./?@#$%^&*~'''

  no_punct = token
  for char in punctuations:
    no_punct = no_punct.replace(char, " ")
  token = no_punct.casefold().strip()

  if token in dictionary:
    return True
  return False


def in_entities(token, entities):
  punctuations = '''![]{};,'"\<>./?@#$%^&*~'''
  to_find = token.casefold().strip()

  no_punct = to_find
  for char in punctuations:
    no_punct = no_punct.replace(char, " ")
  to_find = no_punct.casefold().strip()

  for label in entities.keys():
    if in_dictionary(to_find, entities[label]):
      return True, label

  return False, "unknown"

def sliding_window(tokens, entities):
  punctuations = '''![]{};,'"\<>./?@#$%^&*~'''
  classes = []
  for token in tokens:
    classes.append("O")

  for i in range(0, len(tokens)-1):
    if (classes[i] is not "O"):
      next
    else:
      tmp = tokens[i][0]
      if any(x in tmp for x in punctuations):
        next

      known, label = in_entities(tmp.casefold(), entities)
      if known:
        classes[i] = "B-"+label
        tmp = ""
        next

      for j in range(1, 5):
        if i+j == len(tokens):
          break
        if classes[i+j] != "O":
          break
        tmp = tmp + " " + tokens[i+j][0]
        if any(x in tmp for x in punctuations):
          break
        known, label = in_entities(tmp.casefold(), entities)
        if known:
          for x in range(i, i + j + 1):
            if x == i:
              classes[x] = "B-"+label
            else:
              classes[x] = "I-"+label
          break

  return classes

In [0]:
# Returns a trained CRF model, and saves to a file

def train(file, features, tags, c1, c2, iters, poss_trans, poss_states):
  trainer = pycrfsuite.Trainer(verbose=False)

  for feature, tag in zip(features, tags):
    trainer.append(feature, tag)
  
  trainer.set_params({
    'c1': c1,   # coefficient for L1 penalty
    'c2': c2,  # coefficient for L2 penalty
    'max_iterations': iters,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': poss_trans,
    'feature.possible_states': poss_states
  })

In [0]:
# Returns a CRF sentence tagger, using a given trained model.

def getTagger(trainfile):
  tagger = pycrfsuite.Tagger()
  tagger.open(trainfile)
  return tagger

In [0]:
# Classifies the set of tokens as a set of BIO tags

def classify(tokens, known_entities, tagger):
  features = extract_features(tokens, known_entities)
  return tagger.tag(features)

In [0]:
# Prints the entities to a file in the required format.
# Joins the BIO-tagged tokens into entities.

def output_entities(id, tokens, classes, outf):
  def join_tags(tokens, classes):
    joined_tags = []
    joined_tag = ""
    label = ""
    span_start = 0
    span_end = 0

    for i in range(len(tokens)):

      if classes[i][:1] == "B":
        label = classes[i].split("-")[1]
        joined_tag= tokens[i][0]
        span_start = tokens[i][1][0]
        span_end = tokens[i][1][1]
      
      elif classes[i][:1] == "I":
        joined_tag = joined_tag + " " + tokens[i][0]
        span_end = tokens[i][1][1]
      
      elif (classes[i][:1] == "O") and joined_tag.strip():
        joined_tags.append([joined_tag, [span_start, span_end], label])
        joined_tag = ""
        span_start = 0
        span_end = 0
    return joined_tags

  joined = join_tags(tokens, classes)
  for i in range(len(joined)):
    name = joined[i][0]
    offset = str(str(joined[i][1][0]) + "-" + str(joined[i][1][1]))
    label = joined[i][2]
    if(label is not "O"):
      outstring = str(id) + "|" + str(offset) + "|" + str(name) + "|" + str(label) + "\n"
      outf.write(outstring)

In [0]:
# Runs the official evaluator on the results

def evaluate(inputdir, outputfile):
  !java -jar 'drive/My Drive/UPC/Semester2/eval/evaluateNER.jar' "$inputdir" "$outputfile"

# Reading and Writing
Functions to read the features file, produced by the AHLT_LAB_1_FEATURE_EXTRACTOR notebook.



1.   **features.txt: ** Contains features extracted from the training set.
2.   **features_large.txt: ** Contains features extracted from the training set and external sources.
3.   **trained.crfsuite: ** Contains trained model extracted from the training set.
4.   **trained_large.crfsuite: ** Contains trained model extracted from the training set and external sources.

**External data**

The external data is in a different form to the training set: it consists of annotated entities with no extraneous words. This leads to biasing of the Position of Speech feature, as entities in the external data are always the only words in the sentence. To combat this, I extracted a random set of non-entity words of random length, and padded each side of the entities. 

**Sources**

The sources of the external data were the DrugBank annotated file, HSDB annotated file, and the 'EN' set of random English sentences, found in the lab directory.

In [0]:
def readFeatures(inputfile):
  data = open(inputfile)
  features = []
  tags = []
  sentence_features = []
  sentence_tags = []

  for line in data.read().splitlines():
    if line == "\n" or not line:
      features.append(sentence_features)
      tags.append(sentence_tags)
      sentence_features = []
      sentence_tags = []
      next
    else:
      fields = line.split("\t")
      
      sentence_id = fields[0]
      token = fields[1]
      start = fields[2]
      end = fields[3]
      tag = fields[4]

      feature = fields[5:len(fields)]
      sentence_features.append(feature)
      sentence_tags.append(tag)
  return tags, features

In [0]:
def read_saved():
  prefixes = dict()
  prefixes["drug"] = set()
  prefixes["drug_n"] = set()
  prefixes["group"] = set()
  prefixes["brand"] = set()  

  suffixes = dict()
  suffixes["drug"] = set()
  suffixes["drug_n"] = set()
  suffixes["group"] = set()
  suffixes["brand"] = set()  

  entities = dict()
  entities["drug"] = set()
  entities["drug_n"] = set()
  entities["group"] = set()
  entities["brand"] = set()

  non_entities = set()

  prefix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/prefixes.txt")
  for line in prefix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    prefixes[e_type].add(name.casefold().strip())
  prefix_file.close()

  suffix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/suffixes.txt")
  for line in suffix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    suffixes[e_type].add(name.casefold().strip())
  suffix_file.close()

  entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/entities.txt")
  for line in entities_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    entities[e_type].add(name.casefold().strip())
  entities_file.close()

  non_entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/non_entities.txt")
  for line in non_entities_file.read().splitlines():
    non_entities.add(line.casefold().strip())
  non_entities_file.close()

  return entities, prefixes, suffixes, non_entities;

In [0]:
def read_saved_large():
  prefixes = dict()
  prefixes["drug"] = set()
  prefixes["drug_n"] = set()
  prefixes["group"] = set()
  prefixes["brand"] = set()  

  suffixes = dict()
  suffixes["drug"] = set()
  suffixes["drug_n"] = set()
  suffixes["group"] = set()
  suffixes["brand"] = set()  

  entities = dict()
  entities["drug"] = set()
  entities["drug_n"] = set()
  entities["group"] = set()
  entities["brand"] = set()

  non_entities = set()

  prefix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/prefixes_large.txt")
  for line in prefix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    prefixes[e_type].add(name.casefold().strip())
  prefix_file.close()

  suffix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/suffixes_large.txt")
  for line in suffix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    suffixes[e_type].add(name.casefold().strip())
  suffix_file.close()

  entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/entities_large.txt")
  for line in entities_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    entities[e_type].add(name.casefold().strip())
  entities_file.close()

  non_entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/non_entities_large.txt")
  for line in non_entities_file.read().splitlines():
    non_entities.add(line.casefold().strip())
  non_entities_file.close()

  return entities, prefixes, suffixes, non_entities;