# Machine-Learning Drug-Name Classifier
This notebook contains the feature extractor for the machine-learning classifier for the AHLT course, UPC.

Author: Jake Watson, 22/03/2020


In [0]:
import xml.dom
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import string_span_tokenize
from xml.dom.minidom import parse
from nltk.metrics import *
from nltk.tokenize.util import align_tokens
from collections import OrderedDict
from google.colab import drive
import copy
import os
import sys
import random


def install_java():
  """Install a headless OpenJDK 8 via apt and point JAVA_HOME at it.

  Uses IPython `!` shell escapes, so it only works inside a notebook kernel
  on a Debian/Ubuntu-style machine (the paths match a Colab VM).
  """
  # NOTE(review): nothing visible in this notebook invokes Java directly -
  # presumably an external tool used elsewhere needs it; confirm before removing.
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
# Runs at import-cell time, so every fresh kernel performs the apt install.
install_java()

In [0]:
# Mount Google Drive at /content/drive; every data path in this notebook is
# rooted there. force_remount=True re-mounts even if a mount already exists.
drive.mount('/content/drive', force_remount=True)

Loads the data taken from the training set.
```
read_saved()
```
Loads the data taken from the training set and external knowledge sources
```
read_saved_large()
```


In [0]:
# Option A: dictionaries built from the training set only.
# NOTE: read_saved() is defined in a later cell of this notebook - run the
# definition cells first (this cell fails on a fresh kernel otherwise).
entities, prefixes, suffixes, non_entities = read_saved()

In [0]:
# Option B: dictionaries built from the training set plus external sources.
# Assigns the same four names as the read_saved() cell, so whichever of the
# two cells ran last wins. read_saved_large() is defined in a later cell.
entities, prefixes, suffixes, non_entities = read_saved_large()

Saves the features taken from the training set.
```
save_features()
```
Saves the features taken from the training set and external knowledge sources
```
save_features_large()
```


In [0]:
# Write features.txt from the training XML files, using the `entities`
# dictionary loaded above for the known-entity feature.
# NOTE: save_features() is defined in a later cell - run that cell first.
save_features('/content/drive/My Drive/UPC/Semester2/data/Train/', entities)

In [0]:
# Write features_large.txt (training set + external sources). `non_entities`
# supplies the random padding words placed around external entity names.
# NOTE: save_features_large() is defined in a later cell - run that cell first.
save_features_large(entities, non_entities)

In [0]:
def tokenize(input):
  """Split a sentence into Treebank tokens with inclusive character offsets.

  Args:
    input: the raw sentence text.

  Returns:
    A list of (token, (start, end)) tuples. `start` and `end` are character
    offsets into the sentence and `end` is inclusive (index of the token's
    last character).
  """
  # Double quotes are swapped for single quotes before tokenizing; presumably
  # this keeps the tokens findable in the text by align_tokens (the Treebank
  # tokenizer rewrites double quotes) - TODO confirm. Lengths are unchanged,
  # so the offsets still apply to the original text.
  text = input.replace('"', "'")
  words = TreebankWordTokenizer().tokenize(text)
  spans = list(align_tokens(words, text))
  # align_tokens yields end-exclusive spans; the rest of the notebook uses
  # inclusive ends, hence the -1.
  return [(word, (begin, end - 1)) for word, (begin, end) in zip(words, spans)]

# Features
To feed into the learner, we need to extract a set of features for each token in a sentence. These are similar to the rules in the Rule-Based classifier:
*   Capitalised: is the token entirely capitalised?
*   Has Capitals: does the token contain any capitals?
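*   Position in Sentence: is the token at the beginning or end of the sentence? (Encoded through the previous and next tokens, with `BoS`/`EoS` markers at the sentence boundaries.)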
*   Dashes: does the token contain dashes (-) ?
*   Numbers: does the token contain numbers?
*   Suffixes/Prefixes: extracts the suffixes and prefixes of lengths 1-5 for each token
*   Known: is the token in the list of known entities? If so, what is its BIO tag? (as evaluated using the lists of known entities, not from the training set XML). Uses the sliding-window technique to find entity names of up to 5 tokens.

As well as the features, the sentence is also labelled with a set of B-I-O tags, indicating that a token either Begins, is In, or is Outside of an entity name. These tags are taken from the annotated training set.



In [0]:
def extract_features(sentence, entities):
  """Build one feature vector per token of a tokenized sentence.

  Args:
    sentence: list of (token, offsets) pairs, as produced by tokenize().
    entities: dict mapping an entity label to a collection of known names;
      passed straight to sliding_window() for the dictionary lookup.

  Returns:
    A list of OrderedDicts (one per token, in sentence order). Every value is
    a string. Keys, in order: form, pre1..pre5, suf1..suf5, caps, has_caps,
    has_nums, has_dash, known_type, prev, next.
  """
  forms = [item[0] for item in sentence]
  # Dictionary-based BIO guess for every token (independent of the gold tags).
  dictionary_tags = sliding_window(sentence, entities)
  last_index = len(forms) - 1

  rows = []
  for position, form in enumerate(forms):
    row = OrderedDict()
    row["form"] = form

    # Character prefixes and suffixes of length 1 to 5.
    for width in range(1, 6):
      row["pre" + str(width)] = form[:width]
    for width in range(1, 6):
      row["suf" + str(width)] = form[-width:]

    # Orthographic flags, stored as the strings "0" / "1".
    row["caps"] = str(is_capitalised(form))
    row["has_caps"] = str(has_capitals(form))
    row["has_nums"] = str(has_numbers(form))
    row["has_dash"] = str(has_dashes(form))

    row["known_type"] = str(dictionary_tags[position])

    # Neighbouring tokens, with sentinels at the sentence boundaries.
    row["prev"] = forms[position - 1] if position > 0 else "BoS"
    row["next"] = forms[position + 1] if position < last_index else "EoS"

    rows.append(row)
  return rows

In [0]:
def extract_bio_tag(span_start, span_end, sentence_entities):
  """Return the gold BIO tag of a token from the annotated entity offsets.

  Args:
    span_start: offset of the token's first character.
    span_end: offset of the token's last character (inclusive).
    sentence_entities: list of [name, offsets, label] entries, where offsets
      is a list of [start, end] pairs (an entity may have several spans).

  Returns:
    "B-<label>" if the token starts exactly where a span starts,
    "I-<label>" if it starts after the span start and ends within the span,
    otherwise "O". The first matching span (in list order) wins.
  """
  for annotation in sentence_entities:
    for bounds in annotation[1]:
      if bounds[0] == span_start:
        return "B-" + annotation[2]
      starts_inside = bounds[0] < span_start
      if starts_inside and span_end <= bounds[1]:
        return "I-" + annotation[2]
  return "O"

In [0]:
def output_features(id, tokens, entities, features, outfile):
  """Write one sentence to `outfile` in the format required by the evaluator.

  One tab-separated line is written per token:
    <id> <token> <start> <end> <gold BIO tag> <key=value> <key=value> ...
  followed by a single blank line that terminates the sentence.

  Args:
    id: sentence identifier (converted with str()).
    tokens: list of (token, (start, end)) pairs.
    entities: gold annotations for this sentence, as [name, offsets, label]
      entries; used by extract_bio_tag() to derive the gold tag.
    features: one mapping of feature name -> string value per token, aligned
      with `tokens`.
    outfile: writable text file object.
  """
  for position in range(len(tokens)):
    token = tokens[position]
    form = token[0]
    span = token[1]
    vector = features[position]

    first_char = span[0]
    last_char = span[1]
    gold_tag = extract_bio_tag(first_char, last_char, entities)

    # Feature values must already be strings; they are concatenated as-is.
    feature_columns = "".join("\t" + key + "=" + vector[key] for key in vector.keys())

    leading_columns = "\t".join((str(id), str(form), str(first_char), str(last_char), gold_tag))
    outfile.write(leading_columns + feature_columns + "\n")
  outfile.write("\n")


In [0]:
def is_capitalised(token):
  """Return 1 if every cased character of `token` is upper case, else 0.

  Follows str.isupper(): tokens without any cased character (e.g. "" or
  "123") yield 0.
  """
  return 1 if token.isupper() else 0

def has_capitals(token):
  """Return 1 if `token` mixes upper case with other case, else 0.

  Fully upper-case tokens yield 0 here; they are reported by
  str.isupper()-based checks instead.
  """
  if token.isupper():
    return 0
  for character in token:
    if character.isupper():
      return 1
  return 0

def has_dashes(token):
  """Return 1 if `token` contains a hyphen-minus ("-"), else 0."""
  return 1 if "-" in token else 0

def has_numbers(token):
  """Return 1 if any character of `token` is a digit (str.isdigit), else 0."""
  for character in token:
    if character.isdigit():
      return 1
  return 0

def ends_with_s(token):
  """Return 1 if the last character of `token` is "s" or "S", else 0.

  Only the final character is inspected, so trailing whitespace makes the
  result 0.
  """
  final_character = token[-1:]
  return 1 if final_character.casefold().strip() == 's' else 0

def has_suffix(token, suffs):
  """Return 1 if the case-folded token ends with any of the given suffixes.

  Args:
    token: the token text (case-folded before comparison).
    suffs: iterable of suffix strings, expected to be case-folded already.

  Returns:
    1 on the first matching suffix, otherwise 0.

  Surrounding whitespace on a suffix is ignored. Previously the slice length
  was taken from the stripped suffix but the comparison used the unstripped
  one, so a padded suffix could never match. Empty suffixes are skipped.
  """
  folded = token.casefold()
  for suffix in suffs:
    wanted = suffix.strip()
    if wanted and folded.endswith(wanted):
      return 1
  return 0

def has_prefix(token, prefs):
  """Return 1 if the case-folded token starts with any of the given prefixes.

  Args:
    token: the token text (case-folded before comparison).
    prefs: iterable of prefix strings, expected to be case-folded already.

  Returns:
    1 on the first matching prefix, otherwise 0.

  Surrounding whitespace on a prefix is ignored. Previously the slice length
  was taken from the stripped prefix but the comparison used the unstripped
  one, so a padded prefix could never match. Empty prefixes are skipped.
  """
  folded = token.casefold()
  for prefix in prefs:
    wanted = prefix.strip()
    if wanted and folded.startswith(wanted):
      return 1
  return 0

def in_dictionary(token, dictionary):
  """Check whether a normalised token is a member of `dictionary`.

  Normalisation: every punctuation character listed below is replaced by a
  space, then the result is case-folded and stripped of surrounding
  whitespace.

  Args:
    token: the text to look up (may span several words).
    dictionary: any container supporting `in`; entries are expected to be
      case-folded and stripped.

  Returns:
    True if the normalised token is in the dictionary, else False.
  """
  # Raw string: the previous literal contained "\," which is an invalid
  # escape sequence (a SyntaxWarning on recent Python versions). The set of
  # characters is unchanged.
  punctuations = r'''![]{};'"\,<>./?@#$%^&*~'''
  to_spaces = str.maketrans(punctuations, " " * len(punctuations))

  normalised = token.translate(to_spaces).casefold().strip()
  return normalised in dictionary


def in_entities(token, entities):
  """Look a token up in every known-entity dictionary.

  Args:
    token: the text to look up (may span several words).
    entities: dict mapping an entity label to a collection of known names.

  Returns:
    (True, label) for the first label, in the dict's iteration order, whose
    collection contains the normalised token; (False, "unknown") otherwise.
  """
  # Raw string: the previous literal contained "\<" which is an invalid
  # escape sequence (a SyntaxWarning on recent Python versions). The set of
  # characters is unchanged.
  punctuations = r'''![]{};,'"\<>./?@#$%^&*~'''

  to_find = token.casefold().strip()
  for char in punctuations:
    to_find = to_find.replace(char, " ")
  to_find = to_find.strip()

  for label, names in entities.items():
    if in_dictionary(to_find, names):
      return True, label

  return False, "unknown"

def sliding_window(tokens, entities):
  """Tag tokens with BIO labels by looking them up in the known-entity lists.

  For each untagged token that contains no punctuation, the token alone is
  looked up first; if it is unknown, the window is grown one token at a time
  (up to 4 extra tokens, i.e. 5 tokens in total) until a known name is found,
  a punctuation character is met, or the sentence ends. The shortest match
  starting at a token wins.

  Args:
    tokens: list of (token, offsets) pairs, as produced by tokenize().
    entities: dict mapping an entity label to a collection of known names.

  Returns:
    A list of tags, one per token: "B-<label>", "I-<label>" or "O".

  Fixes over the previous version:
    * `next` was used where `continue` was meant. It is a no-op expression,
      so a single-token match fell through into the multi-token search (and
      could swallow the following token as "I-"), and punctuation tokens were
      not skipped.
    * `is not "O"` compared string identity instead of equality.
    * The last token was never considered as the start of a match.
  """
  punctuations = r'''![]{};,'"\<>./?@#$%^&*~'''
  max_extra_tokens = 4  # window of at most 5 tokens
  n_tokens = len(tokens)
  classes = ["O"] * n_tokens

  for i in range(n_tokens):
    # Already part of an entity found by an earlier window.
    if classes[i] != "O":
      continue

    tmp = tokens[i][0]
    # Entity names never start on a token containing punctuation.
    if any(x in tmp for x in punctuations):
      continue

    known, label = in_entities(tmp.casefold(), entities)
    if known:
      classes[i] = "B-" + label
      continue

    for j in range(1, max_extra_tokens + 1):
      if i + j == n_tokens:
        break
      if classes[i + j] != "O":
        break
      tmp = tmp + " " + tokens[i + j][0]
      if any(x in tmp for x in punctuations):
        break
      known, label = in_entities(tmp.casefold(), entities)
      if known:
        classes[i] = "B-" + label
        for x in range(i + 1, i + j + 1):
          classes[x] = "I-" + label
        break

  return classes

# Reading and Writing
Functions to write the features to file, for use by the learner.

1.   **features.txt:** Contains features extracted from the training set.
2.   **features_large.txt:** Contains features extracted from the training set and external sources.
3.   **trained.crfsuite:** Contains the model trained on the features from the training set.
4.   **trained_large.crfsuite:** Contains the model trained on the features from the training set and external sources.

**External data**

The external data is in a different form to the training set: it consists of annotated entities with no surrounding words. This biases the position-in-sentence features (the previous/next token), because an entity in the external data is always the only text in its "sentence". To combat this, I pad each side of every external entity with a random number (0-4) of randomly chosen non-entity words.

**Sources**

The sources of the external data were the DrugBank annotated file, HSDB annotated file, and the 'EN' set of random English sentences, found in the lab directory.


In [0]:
def save_features(inputdir, known_entities):
  """Extract features from every training XML file and write features.txt.

  Each file in `inputdir` is parsed as XML. For every <sentence> element the
  gold entities are read from its <entity> children, the sentence text is
  tokenized, features are extracted and the result is appended to the output
  file via output_features().

  Args:
    inputdir: directory containing the training XML files (with or without a
      trailing slash).
    known_entities: dict mapping an entity label to a collection of known
      names, used for the dictionary feature.
  """
  outpath = '/content/drive/My Drive/UPC/Semester2/features/features.txt'

  # Context managers guarantee that the output is flushed/closed and that no
  # XML file handle is leaked, even if parsing fails half-way.
  with open(outpath, 'w+') as outfile:
    for f in os.listdir(inputdir):
      with open(os.path.join(str(inputdir), str(f))) as xml_file:
        parsed = parse(xml_file)

      for sentence in parsed.getElementsByTagName("sentence"):
        sentence_id = sentence.getAttribute('id')

        # Gold annotations: [text, [[start, end], ...], type] per entity.
        entities = []
        for child in sentence.childNodes:
          if child.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
            continue
          if child.tagName != 'entity':
            continue
          name = child.getAttribute('text')
          # Discontinuous entities list several "start-end" spans joined by ';'.
          offsets = [list(map(int, offset.split('-')))
                     for offset in child.getAttribute('charOffset').split(';')]
          label = child.getAttribute('type')
          entities.append([name, offsets, label])

        tokens = tokenize(sentence.attributes["text"].value)
        features = extract_features(tokens, known_entities)
        output_features(sentence_id, tokens, entities, features, outfile)
      

In [0]:
def save_features_large(known_entities, non_entities):
  """Write features_large.txt from the training set plus external sources.

  Four sources are written, in this order:
    1. the training XML files (gold tags from the <entity> annotations);
    2. DrugBank.txt - one "name|label" entry per line;
    3. HSDB.txt - one name per line, always labelled "drug_n";
    4. en.txt - plain English sentences, every token tagged "O".
  External entity names are wrapped in random non-entity padding words (see
  insert_random_padding) so they do not always sit at the sentence boundary.

  Args:
    known_entities: dict mapping an entity label to a collection of known
      names, used for the dictionary feature.
    non_entities: collection of non-entity words used as padding.
  """
  inputdir = '/content/drive/My Drive/UPC/Semester2/data/Train/'
  # Absolute paths, like every other path in this notebook; the previous
  # relative 'drive/My Drive/...' form only worked when the working directory
  # was /content.
  external_dir = '/content/drive/My Drive/UPC/Semester2/external/'
  outpath = '/content/drive/My Drive/UPC/Semester2/features/features_large.txt'

  def read_lines(path):
    # Read a whole text file and close it immediately.
    with open(path) as handle:
      return handle.read().splitlines()

  def write_padded_entity(sentence_id, entity, label, outfile):
    # Pad one external entity, extract its features and write it as a sentence.
    padded_entity, n_start_words, n_end_words = insert_random_padding(entity, non_entities)
    tokens = tokenize(padded_entity)
    features = extract_features(tokens, known_entities)
    outstrings = extract_bio_tag_from_padded_entity(
        sentence_id, tokens, features, padded_entity, n_start_words, n_end_words, label)
    for string in outstrings:
      outfile.write(string)
    outfile.write("\n")

  with open(outpath, 'w+') as outfile:
    # 1. Training set.
    for f in os.listdir(inputdir):
      with open(os.path.join(inputdir, str(f))) as xml_file:
        parsed = parse(xml_file)

      for sentence in parsed.getElementsByTagName("sentence"):
        sentence_id = sentence.getAttribute('id')

        # Gold annotations: [text, [[start, end], ...], type] per entity.
        entities = []
        for child in sentence.childNodes:
          if child.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
            continue
          if child.tagName != 'entity':
            continue
          name = child.getAttribute('text')
          offsets = [list(map(int, offset.split('-')))
                     for offset in child.getAttribute('charOffset').split(';')]
          label = child.getAttribute('type')
          entities.append([name, offsets, label])

        tokens = tokenize(sentence.attributes["text"].value)
        features = extract_features(tokens, known_entities)
        output_features(sentence_id, tokens, entities, features, outfile)

    # Load all external files up front so a missing file is reported before
    # any external data is written.
    drugbank_lines = read_lines(external_dir + 'DrugBank.txt')
    hsdb_lines = read_lines(external_dir + 'HSDB.txt')
    en_lines = read_lines(external_dir + 'en.txt')

    # 2. DrugBank: "name|label". The counter is the 1-based line number.
    for counter, line in enumerate(drugbank_lines, start=1):
      if not line.strip():
        continue  # a blank line has no label field
      split = line.split("|")
      write_padded_entity("drugbank_sentence_" + str(counter), split[0], split[1], outfile)

    # 3. HSDB: bare names, all labelled drug_n.
    for counter, line in enumerate(hsdb_lines, start=1):
      entity = line.casefold().strip()
      if not entity:
        continue  # an empty name would tag a padding word as an entity
      write_padded_entity("hsdb_sentence_" + str(counter), entity, "drug_n", outfile)

    # 4. Plain English sentences: no gold entities, so every token gets "O".
    for counter, line in enumerate(en_lines, start=1):
      tokens = tokenize(line)
      features = extract_features(tokens, known_entities)
      output_features("en_sentence_" + str(counter), tokens, [], features, outfile)

In [0]:
def insert_random_padding(string, non_entities):
  """Surround an entity name with random non-entity words.

  Used exclusively to process the external data, where each entry is a bare
  entity name. Between 0 and 4 words are drawn (without replacement) for each
  side; the result always ends with a full stop.

  Args:
    string: the entity name.
    non_entities: collection of padding words (a set, list or tuple).

  Returns:
    (padded_entity, n_start_words, n_end_words): the padded text and the
    number of words actually placed before and after the entity.
  """
  # random.sample() requires a sequence: passing a set was deprecated in
  # Python 3.9 and raises TypeError from 3.11 on.
  if isinstance(non_entities, (list, tuple)):
    pool = non_entities
  else:
    pool = list(non_entities)

  # Cap at the pool size so a small pool cannot trigger
  # "sample larger than population".
  n_start_words = min(random.randint(0, 4), len(pool))
  n_end_words = min(random.randint(0, 4), len(pool))

  start_words = random.sample(pool, n_start_words)
  end_words = random.sample(pool, n_end_words)

  # NOTE(review): the returned counts are later used as *token* indices;
  # that assumes each padding word tokenizes to exactly one token - confirm
  # against the contents of the non-entity list.
  start_string = " ".join(str(elem) for elem in start_words) + " "
  end_string = " " + " ".join(str(elem) for elem in end_words) + "."

  padded_entity = start_string + str(string) + end_string
  return padded_entity, n_start_words, n_end_words

In [0]:
def extract_bio_tag_from_padded_entity(id, tokens, features, entity, n_start_words, n_end_words, label):
  """Format the output lines for one padded external entity.

  The gold tag is derived from token positions only: the token at index
  `n_start_words` is "B-<label>", the tokens after it up to (but excluding)
  the last `n_end_words + 1` tokens are "I-<label>", and everything else is
  "O". The extra 1 skips the final token (the padding ends with a full stop).

  Args:
    id: sentence identifier (converted with str()).
    tokens: list of (token, (start, end)) pairs for the padded text.
    features: one mapping of feature name -> string value per token.
    entity: the padded text; not used by this function.
    n_start_words: number of padding words before the entity.
    n_end_words: number of padding words after the entity.
    label: entity type appended to the "B-" / "I-" prefixes.

  Returns:
    A list with one tab-separated, newline-terminated line per token.
  """
  entity_stop = len(tokens) - n_end_words - 1

  lines = []
  for position in range(len(tokens)):
    token = tokens[position]
    form = token[0]
    span = token[1]
    vector = features[position]

    first_char = span[0]
    last_char = span[1]

    if position == n_start_words:
      tag = "B-" + label
    elif n_start_words < position < entity_stop:
      tag = "I-" + label
    else:
      tag = "O"

    feature_columns = "".join("\t" + key + "=" + vector[key] for key in vector.keys())
    leading_columns = "\t".join((str(id), str(form), str(first_char), str(last_char), tag))
    lines.append(leading_columns + feature_columns + "\n")

  return lines

In [0]:
def read_saved():
  """Load the dictionaries extracted from the training set.

  Reads four text files from the `data/extracted` directory on the mounted
  drive. prefixes.txt, suffixes.txt and entities.txt hold one "label:value"
  entry per line; non_entities.txt holds one word per line. All values are
  case-folded and stripped.

  Returns:
    (entities, prefixes, suffixes, non_entities): the first three are dicts
    mapping each label ("drug", "drug_n", "group", "brand") to a set of
    strings; the last is a set of strings.

  Raises:
    KeyError: if a line uses a label other than the four above.
    IndexError: if a non-blank labelled line contains no ':'.
  """
  base = "/content/drive/My Drive/UPC/Semester2/data/extracted/"

  def load_labelled(path, labels):
    # Parse a "label:value" file into {label: set(values)}.
    table = {label: set() for label in labels}
    with open(path) as handle:  # closed even if a line is malformed
      for line in handle.read().splitlines():
        if not line.strip():
          continue  # tolerate blank lines
        # Split on the first ':' only, so values containing ':' stay intact.
        split = line.split(":", 1)
        table[split[0]].add(split[1].casefold().strip())
    return table

  prefixes = load_labelled(base + "prefixes.txt", ("drug", "drug_n", "brand", "group"))
  suffixes = load_labelled(base + "suffixes.txt", ("drug", "drug_n", "group", "brand"))
  # Key order matters for entities: lookups return the first matching label.
  entities = load_labelled(base + "entities.txt", ("drug", "drug_n", "group", "brand"))

  non_entities = set()
  with open(base + "non_entities.txt") as handle:
    for line in handle.read().splitlines():
      non_entities.add(line.casefold().strip())

  return entities, prefixes, suffixes, non_entities

In [0]:
def read_saved_large():
  """Load the dictionaries extracted from the training set and external data.

  Reads four text files from the `data/extracted` directory on the mounted
  drive. prefixes_large.txt, suffixes_large.txt and entities_large.txt hold
  one "label:value" entry per line; non_entities_large.txt holds one word per
  line. All values are case-folded and stripped.

  Returns:
    (entities, prefixes, suffixes, non_entities): the first three are dicts
    mapping each label ("drug", "drug_n", "group", "brand") to a set of
    strings; the last is a set of strings.

  Raises:
    KeyError: if a line uses a label other than the four above.
    IndexError: if a non-blank labelled line contains no ':'.
  """
  base = "/content/drive/My Drive/UPC/Semester2/data/extracted/"
  labels = ("drug", "drug_n", "group", "brand")

  def load_labelled(path):
    # Parse a "label:value" file into {label: set(values)}.
    table = {label: set() for label in labels}
    with open(path) as handle:  # closed even if a line is malformed
      for line in handle.read().splitlines():
        if not line.strip():
          continue  # tolerate blank lines
        # Split on the first ':' only, so values containing ':' stay intact.
        split = line.split(":", 1)
        table[split[0]].add(split[1].casefold().strip())
    return table

  prefixes = load_labelled(base + "prefixes_large.txt")
  suffixes = load_labelled(base + "suffixes_large.txt")
  # Key order matters for entities: lookups return the first matching label.
  entities = load_labelled(base + "entities_large.txt")

  non_entities = set()
  with open(base + "non_entities_large.txt") as handle:
    for line in handle.read().splitlines():
      non_entities.add(line.casefold().strip())

  return entities, prefixes, suffixes, non_entities