# Rule Based Drug-Name Classifier
This notebook contains the rule-based classifier for the AHLT course, UPC.

Author: Jake Watson, 22/03/2020


In [13]:
import xml.dom
from xml.dom.minidom import parse
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import string_span_tokenize
from nltk.metrics import *
from nltk.tokenize.util import align_tokens
from google.colab import drive
import copy
import os
import sys

drive.mount('/content/drive', force_remount=True)

def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
openjdk version "11.0.6" 2020-01-14
OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1)
OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)


Loads the data taken from the training set.
```
read_saved()
```
Loads the data taken from the training set and external knowledge sources
```
read_saved_large()
```





In [0]:
entities, prefixes, suffixes, non_entities = read_saved()

In [0]:
entities, prefixes, suffixes, non_entities = read_saved_large()

The following two fields show instances of the evaluator output for the NERC function, for the 'Devel' and 'Test-NER' datasets.

In [15]:
nerc('drive/My Drive/UPC/Semester2/data/Devel/','task9.1_RBexternalDevel_1.txt', entities, prefixes, suffixes, non_entities, 0.9)

Considering file 85/85: Gemcitabine_ddi.xmlGold drive/My Drive/UPC/Semester2/data/Devel/
Submission  task9.1_RBexternalDevel_1.txt
Directory gold drive/My Drive/UPC/Semester2/data/Devel/
[transDirXMLToMapEntities] dir:drive/My Drive/UPC/Semester2/data/Devel/
log4j:WARN No appenders could be found for logger (org.castor.core.util.Configuration).
log4j:WARN Please initialize the log4j system properly.
 Gold standard saved in goldNER.txt

Gold loaded. Sentences=681, entities: 1771
task9.1_RBexternalDevel_1_scores.log created...
SCORES FOR THE GROUP: RBexternalDevel RUN=1


Strict matching (boundaries + type)
cor	inc	par	mis	spu	total	prec	recall	F1
1327	268	0	176	517	1771	0.63	0.75	0.68




Exact matching
cor	inc	par	mis	spu	total	prec	recall	F1
1474	121	0	176	517	1771	0.7	0.83	0.76




Partial matching
cor	inc	par	mis	spu	total	prec	recall	F1
1474	0	121	176	517	1771	0.7	0.87	0.77




type matching
cor	inc	par	mis	spu	total	prec	recall	F1
1398	197	0	176	517	1771	0.66	0.79	0.72




SCORES 

In [0]:
nerc('drive/My Drive/UPC/Semester2/data/Test-NER/','task9.1_RBexternalTest_1.txt', entities, prefixes, suffixes, non_entities, 0.9)

Considering file 112/112: Terconazole.xmlGold drive/My Drive/UPC/Semester2/data/Test-NER/
Submission  task9.1_RBexternalTest_1.txt
Directory gold drive/My Drive/UPC/Semester2/data/Test-NER/
[transDirXMLToMapEntities] dir:drive/My Drive/UPC/Semester2/data/Test-NER/
log4j:WARN No appenders could be found for logger (org.castor.core.util.Configuration).
log4j:WARN Please initialize the log4j system properly.
 Gold standard saved in goldNER.txt

Gold loaded. Sentences=324, entities: 686
task9.1_RBexternalTest_1_scores.log created...
SCORES FOR THE GROUP: RBexternalTest RUN=1


Strict matching (boundaries + type)
cor	inc	par	mis	spu	total	prec	recall	F1
420	144	0	122	305	686	0.48	0.61	0.54




Exact matching
cor	inc	par	mis	spu	total	prec	recall	F1
518	46	0	122	305	686	0.6	0.76	0.67




Partial matching
cor	inc	par	mis	spu	total	prec	recall	F1
518	0	46	122	305	686	0.6	0.79	0.68




type matching
cor	inc	par	mis	spu	total	prec	recall	F1
443	121	0	122	305	686	0.51	0.65	0.57




SCORES FOR ENT

In [0]:
# Main function: parses the XML files, extracts the sentences, tokenizes them, labels each token, outputs the results, and evaluate the results.

def nerc(inputdir, outputfile, ents, pres, sufs, nes, sensitivity):
  output = open(outputfile, "w+")
  count = 1
  n_files = len(os.listdir(inputdir))
  for fil in os.listdir(inputdir):
    sys.stdout.write("\rConsidering file " + str(count) + "/" + str(n_files) + ": " + str(fil))
    sys.stdout.flush()
    count += 1
    fil = open(str(inputdir) + str(fil))
    tree = parse(fil)
    fil.close()
    sentences = tree.getElementsByTagName("sentence")
    for sentence in sentences:
      sid = sentence.attributes["id"].value
      stext = sentence.attributes["text"].value
      tokens = tokenize(stext)
      labelled = extract_entities(tokens, ents, pres, sufs, nes, sensitivity)
      output_entities(sid, labelled, output)
  output.close
  evaluate(inputdir, outputfile)

In [0]:
# Tokenizes the input text. 
# Returns the tokens, with their offsets from the beginning of the sentence.

def tokenize(input):
  s = input.replace('"', "'")
  tokens = TreebankWordTokenizer().tokenize(s)
  offsets = list(align_tokens(tokens, s))
  offsets = [tuple((i, j-1)) for i, j in offsets]
  output = [tuple((i, j)) for i, j in zip(tokens, offsets)]
  return output

In [0]:
# Classifies the set of tokens.
# Requires the set of known entities, their common suffixes and prefixes, and the non-entities.

def extract_entities(tokens, entities, prefixes, suffixes, non_entities, sensitivity):
  labelled = applyRules(tokens, entities, prefixes, suffixes, non_entities, sensitivity)
  string_output = []
  for (token, offsets, typ) in labelled:
    string_output.append(
        {
          "name": str(token),
          "offset": str(str(offsets[0]) + "-" + str(offsets[1])),
          "type": str(typ)
        })
  return string_output

In [0]:
# Prints the entities to a file in the required format.

def output_entities(id, ents, outf):
  for entity in ents:
    name = entity["name"]
    offset = entity["offset"]
    typ = entity["type"]
    outstring = str(id) + "|" + str(offset) + "|" + str(name) + "|" + str(typ) + "\n"
    outf.write(outstring)

In [0]:
# Runs the official evaluator on the results

def evaluate(inputdir, outputfile):
  !java -jar 'drive/My Drive/UPC/Semester2/eval/evaluateNER.jar' "$inputdir" "$outputfile"

# Rules
The following functions attempt to classify a given set of tokens. This is done by applying a set of rules to the token. If the token satisfies a rule for a certain class, it is given a score for that class. After applying all rules for all classes, these scores are summed for each class, and the token is classified according to the maximum scoring class.

The scores for each class are normalized to be between 0 and 1, and a threshold is applied: if no score is above the threshold, the token is not classified.

The rules applied are as follows:


*   Capitalised: is the token entirely capitalised?
*   Has Capitals: does the token contain any capitals?
*   Dashes: does the token contain dashes (-) ?
*   Numbers: does the token contain numbers?
*   Ends with S: does the token end with 's'?
*   Has suffix/prefix: does the token have a common prefix or suffix?


Finally, the token is also checked against a list of known entities. If it is known, then no other rules are checked and the token is classified according to the list. 

To ensure multi-word entities are also found by this method, the sliding-window method is applied. A sliding window of length 5 tokens is moved across the sentence, with each set of tokens in the window being checked against the list of known entities. Additionally, once classified, if there ny adjacent tokens with the same classification, they are joined together as one entity by the 'adjacent_join' function.


In [0]:
# Returns the list of classified entities in a sentence.
# Applies the set of rules to a sentence, given the set of known entities and non-entities.

def applyRules(tokens, entities, prefixes, suffixes, non_entities, sensitivity):

  labelled_drugs = []
  labelled_drugs_n = []
  labelled_groups = []
  labelled_brands = []


  #Functions to return the score for a token in each class.

  def drug_score(token, drugs, pres, suffs):
    score = 0
    score += is_capitalised(token)
    score += has_dashes(token)
    score += has_numbers(token)
    score += has_suffix(token, suffs)
    score += has_prefix(token, pres)
    return score / 5.0
  
  def group_score(token, groups, pres, suffs):
    score = 0
    score += 1 - is_capitalised(token)
    score += ends_with_s(token)
    score += has_suffix(token, suffs)
    score += has_prefix(token, pres)
    return score / 4.0

  def brand_score(token, brands, pres, suffs):
    score = 0
    score += is_capitalised(token)
    score += has_capitals(token)
    score += has_suffix(token, suffs)
    score += has_prefix(token, pres)
    return score / 4.0

  # Check for and remove known multi-word entities
  remaining, labelled = sliding_window(tokens, entities)

  for (token, offset, label) in labelled:
    if label == "drug":
      labelled_drugs.append((token, offset, "drug"))
    elif label == "drug_n":
      labelled_drugs_n.append((token, offset, "drug_n"))
    elif label == "group":
      labelled_groups.append((token, offset, "group"))
    elif label  == "brand":
      labelled_brands.append((token, offset, "brand"))



  for (token, offset) in remaining:

    # Check if known nonentity
    if in_dictionary(token, non_entities) is True:
      next

    # Check if known entity
    known, e_type = in_entities(token, entities)

    if known:
      if e_type == "drug":
        labelled_drugs.append((token, offset, "drug"))
      elif e_type == "drug_n":
        labelled_drugs_n.append((token, offset, "drug_n"))
      elif e_type == "group":
        labelled_groups.append((token, offset, "group"))
      elif e_type == "brand":
        labelled_brands.append((token, offset, "brand"))
      next

    # If not known entity or non-entity, check scores for each class
    else:
      d_score = drug_score(token, entities["drug"], prefixes["drug"], suffixes["drug"])
      dn_score = drug_score(token, entities["drug_n"], prefixes["drug_n"], suffixes["drug_n"])
      g_score = group_score(token, entities["group"], prefixes["group"], suffixes["group"])
      b_score = brand_score(token, entities["brand"], prefixes["brand"], suffixes["brand"])

    # If the maximum scoring class is above the threshold, classify the token
      if max(d_score, dn_score, g_score, b_score) >= sensitivity:
        maxScore = max(d_score, dn_score, g_score, b_score)
        if maxScore == d_score:
          labelled_drugs.append((token, offset, "drug"))
        elif maxScore == dn_score:
          labelled_drugs_n.append((token, offset, "drug_n"))
        elif maxScore == g_score:
          labelled_groups.append((token, offset, "group"))
        elif maxScore == b_score:
          labelled_brands.append((token, offset, "brand"))

  # Two passes of adjacent join - joins up to three adjacent words    
  joineddrugs1 = adjacentjoin(labelled_drugs)
  joineddrugs2 = adjacentjoin(joineddrugs1)

  joineddrugs_n_1 = adjacentjoin(labelled_drugs_n)
  joineddrugs_n_2 = adjacentjoin(joineddrugs_n_1)

  joinedgroups1 = adjacentjoin(labelled_groups)
  joinedgroups2 = adjacentjoin(joinedgroups1)

  joinedbrands1 = adjacentjoin(labelled_brands)
  joinedbrands2 = adjacentjoin(joinedbrands1)

  # Mix the data together again and sort by the sentence position - putting the tokens in order
  mixed = sorted(joineddrugs2 + joineddrugs_n_2 + joinedgroups2 + joinedbrands2, key = lambda x: x[1][0])
  
  return mixed

In [0]:
def sliding_window(tokens, entities):
  punctuations = '''![]{};,'"\<>./?@#$%^&*~'''
  no_punct = []
  joined = []
  cut_tokens = copy.deepcopy(tokens)

  for i in range(0, len(tokens)-1):
    tmp = tokens[i][0]
    start_offset = tokens[i][1][0]
    if any(x in tmp for x in punctuations):
      next
    for j in range(1, 5):
      if i+j == len(tokens):
        break
      end_offset = tokens[i+j][1][1]
      tmp = tmp + " " + tokens[i+j][0]
      if any(x in tmp for x in punctuations):
        break
      known, typ = in_entities(tmp.casefold(), entities)
      if known:
        for x in range(i, i + j + 1):
          cut_tokens[x] = None
        combined = [tmp, [start_offset, end_offset], typ]
        joined.append(combined)
        break


  final_tokens = []
  for elem in cut_tokens:
    if elem is not None:
      final_tokens.append(elem)
  
  return final_tokens, joined

def is_capitalised(token):
  if token.isupper():
    return 1
  return 0

def has_capitals(token):
  if (token.isupper() == False) and any(x.isupper() for x in token):
    return 1
  else: 
    return 0

def has_dashes(token):
  if ("-" in token):
    return 1
  return 0

def has_numbers(token):
  if any(i.isdigit() for i in token):
    return 1
  return 0

def ends_with_s(token):
  if token[-1:].casefold().strip() == 's':
    return 1
  else: 
    return 0

def has_suffix(token, suffs):
  for suffix in suffs:
    if (suffix == token.casefold()[-len(suffix.strip()):]):
      return 1
  return 0

def has_prefix(token, prefs):
  for prefix in prefs:
    if (prefix == token.casefold()[:len(prefix.strip())]):
      return 1
  return 0

def in_dictionary(token, dictionary):
  punctuations = '''![]{};'"\,<>./?@#$%^&*~'''

  no_punct = token
  for char in punctuations:
    no_punct = no_punct.replace(char, " ")
  token = no_punct.casefold().strip()

  if token in dictionary:
    return True
  return False

def in_entities(token, entities):

  for typ in entities.keys():
    if in_dictionary(token, entities[typ]):
      return True, typ

  return False, "unknown"

def adjacentjoin(tokens):
  def listit(t):
    return list(map(listit, t)) if isinstance(t, (list, tuple)) else t
  unjoined = listit(tokens)

  for i in range(1, len(unjoined)):
    prev = unjoined[i-1]
    curr = unjoined[i]

    prev_end = prev[1][1]
    curr_start = curr[1][0]

    if ((prev_end + 2) == curr_start):
      unjoined[i-1][0] = (str(prev[0]) + " " + str(curr[0]))
      unjoined[i-1][1][1] = curr[1][1]

      unjoined[i][0] = ""
      unjoined[i][1] = [0,0]

  joined = []
  for i in range (len(unjoined)):
    if unjoined[i][0] != "":
      joined.append(unjoined[i])
  return joined

# Reading and Writing
Functions to read the training data produced by the AHLT_DATA_PREP notebook.

1.   **entities.txt, prefixes.txt, suffixes.txt, non_entities.txt** Contains known entities extracted from the training set.
2.   **entities_large.txt, prefixes_large.txt, suffixes_large.txt, non_entities_large.txt** Contains known entities extracted from the training set and external sources.

**Sources**

The sources of the external data were the DrugBank annotated file, HSDB annotated file, and the 'EN' set of random English sentences, found in the lab directory.


In [0]:
def read_saved():
  prefixes = dict()
  prefixes["drug"] = set()
  prefixes["drug_n"] = set()
  prefixes["group"] = set()
  prefixes["brand"] = set()  

  suffixes = dict()
  suffixes["drug"] = set()
  suffixes["drug_n"] = set()
  suffixes["group"] = set()
  suffixes["brand"] = set()  

  entities = dict()
  entities["drug"] = set()
  entities["drug_n"] = set()
  entities["group"] = set()
  entities["brand"] = set()

  non_entities = set()

  prefix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/prefixes.txt")
  for line in prefix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    prefixes[e_type].add(name.casefold().strip())
  prefix_file.close()

  suffix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/suffixes.txt")
  for line in suffix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    suffixes[e_type].add(name.casefold().strip())
  suffix_file.close()

  entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/entities.txt")
  for line in entities_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    entities[e_type].add(name.casefold().strip())
  entities_file.close()

  non_entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/non_entities.txt")
  for line in non_entities_file.read().splitlines():
    non_entities.add(line.casefold().strip())
  non_entities_file.close()

  return entities, prefixes, suffixes, non_entities;

In [0]:
def read_saved_large():
  prefixes = dict()
  prefixes["drug"] = set()
  prefixes["drug_n"] = set()
  prefixes["group"] = set()
  prefixes["brand"] = set()  

  suffixes = dict()
  suffixes["drug"] = set()
  suffixes["drug_n"] = set()
  suffixes["group"] = set()
  suffixes["brand"] = set()  

  entities = dict()
  entities["drug"] = set()
  entities["drug_n"] = set()
  entities["group"] = set()
  entities["brand"] = set()

  non_entities = set()

  prefix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/prefixes_large.txt")
  for line in prefix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    prefixes[e_type].add(name.casefold().strip())
  prefix_file.close()

  suffix_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/suffixes_large.txt")
  for line in suffix_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    suffixes[e_type].add(name.casefold().strip())
  suffix_file.close()

  entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/entities_large.txt")
  for line in entities_file.read().splitlines():
    split = line.split(":")
    e_type = split[0]
    name = split[1]
    entities[e_type].add(name.casefold().strip())
  entities_file.close()

  non_entities_file = open("/content/drive/My Drive/UPC/Semester2/data/extracted/non_entities_large.txt")
  for line in non_entities_file.read().splitlines():
    non_entities.add(line.casefold().strip())
  non_entities_file.close()

  return entities, prefixes, suffixes, non_entities;