# Rule Based Drug-Name Classifier - Data Prep
This notebook prepares the data used by the rule-based drug-name classifier built for the AHLT course at UPC.

Author: Jake Watson, 22/03/2020

In [0]:
import xml.dom
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import string_span_tokenize
from xml.dom.minidom import parse
from nltk.metrics import *
from nltk.tokenize.util import align_tokens
from google.colab import drive
import copy
import os


def install_java():
  # Installs a headless OpenJDK 8 through apt, exports JAVA_HOME for this
  # process, and prints the Java version that is now on the PATH.
  # The text after '#' on the two shell lines is passed to the shell, where it
  # is treated as a shell comment.
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  # NOTE(review): presumably the directory the apt package installs into —
  # verify on the target machine image.
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
# Runs the installation as soon as this cell is executed.
install_java()

In [0]:
# Mount Google Drive under /content/drive; every data path used by this
# notebook lives below that mount point. force_remount=True requests a fresh
# mount even when one is already present.
drive.mount('/content/drive', force_remount=True)

In [0]:
# NOTE(review): `read_large` is not defined anywhere in the visible notebook
# (only `read_saved` and `read_saved_large` are) — confirm where it comes from
# and how many values it returns before relying on this cell.
entities, suffixes, non_entities = read_large()

In [0]:
# Reload the previously saved "large" data set. read_saved_large returns four
# values (entities, prefixes, suffixes, non_entities), so four names are needed
# here; unpacking into three raises ValueError.
entities, prefixes, suffixes, non_entities = read_saved_large()

In [0]:
# Build the entity, prefix, suffix and non-entity collections from the XML
# files of the training directory.
entities, prefixes, suffixes, non_entities = extract_training_data('/content/drive/My Drive/UPC/Semester2/data/Train/')

In [0]:
# Extend the collections with the external sources. The prefixes and suffixes
# are recomputed from the enlarged entity sets, replacing the ones built from
# the training data alone.
entities, prefixes, suffixes, non_entities = extract_external_data(entities, non_entities)

In [0]:
# Save the training-only data. write_training_data runs its own extraction and
# does not use the variables held in this notebook session.
write_training_data()

In [0]:
# Save the collections currently held in memory to the *_large.txt files.
write_large(entities, prefixes, suffixes, non_entities)

# Training Set Analysis
This section contains the functions used to extract the known entities and non-entities in the training set.

These entities are organized into a dict of sets. A set was chosen because a membership test takes O(1) time on average, which makes it cheap to check whether a candidate entity is already known. The keys of the dict are the entity classes, which keeps the entities of each class separate.

In [0]:
# Extracts the labelled entities, their common suffixes and prefixes, and the non-entities in the training set. All known
# entities are organized by their class, using a dict of sets.


def extract_training_data(inputdir):
  """Build every training-set resource from the XML files in `inputdir`.

  Args:
    inputdir: Directory that holds the annotated training files.

  Returns:
    The 4-tuple ``(entities, prefixes, suffixes, non_entities)``. The first
    three come from `extract_entities`, `extract_prefixes` and
    `extract_suffixes`; the last one from `extract_non_entities`.
  """
  known_entities = extract_entities(inputdir)

  # Suffixes are computed before prefixes, exactly as before; both are derived
  # from the same entity dict.
  common_suffixes = extract_suffixes(known_entities)
  common_prefixes = extract_prefixes(known_entities)

  unlabelled_tokens = extract_non_entities(inputdir, known_entities)

  return (known_entities, common_prefixes, common_suffixes, unlabelled_tokens)

In [0]:
# Extracts the set of labelled entities, organized by entity class, in the training set


def extract_entities(inputdir):
  """Collect the labelled entity names found in the XML files of `inputdir`.

  Every file in the directory is parsed as XML and each ``<entity>`` element
  is read. The element's ``text`` attribute is normalised (punctuation
  replaced by spaces, case-folded, surrounding whitespace stripped) and stored
  under the element's ``type`` attribute.

  Args:
    inputdir: Directory holding the XML files. A trailing path separator is
      optional.

  Returns:
    A dict with the keys "drug", "drug_n", "brand" and "group", each mapping
    to the set of normalised names of that type. Entities of any other type
    are ignored.
  """
  # Raw string: the backslash is a literal character to strip, not an escape.
  punctuations = r'''![]{};:'"\,<>./?@#$%^&*_~'''
  # Translation table that turns every punctuation character into a space.
  to_spaces = str.maketrans(punctuations, " " * len(punctuations))

  entities_by_type = {e_type: set() for e_type in ("drug", "drug_n", "brand", "group")}

  for filename in os.listdir(inputdir):
    # The file is opened in binary mode so the XML parser can honour the
    # encoding declared in the document, and it is closed as soon as the
    # document has been parsed.
    with open(os.path.join(inputdir, filename), 'rb') as xml_file:
      tagged = parse(xml_file)

    for entity in tagged.getElementsByTagName("entity"):
      xml_type = entity.getAttribute('type')
      name = entity.getAttribute('text').translate(to_spaces).casefold().strip()

      # set.add is already idempotent, so no membership test is needed.
      if xml_type in entities_by_type:
        entities_by_type[xml_type].add(name)

  return entities_by_type

In [0]:
# Extracts the set of common entity prefixes, organized by entity class, in the training set


def extract_prefixes(entities):
  """Find the common 5-character prefixes of the entity names of each class.

  A prefix (the first five characters of a name, or the whole name when it is
  shorter) is kept when it occurs more than 10 times within its class, or when
  it accounts for more than 0.1% of the names of that class.

  Args:
    entities: Dict mapping an entity class to a collection of names.

  Returns:
    A dict mapping each class to the set of kept prefixes, case-folded and
    stripped. The classes "drug", "drug_n", "brand" and "group" are always
    present, with an empty set when `entities` has no names for them.
  """
  prefixes = {e_type: set() for e_type in ("drug", "drug_n", "brand", "group")}

  for e_type, names in entities.items():
    # Count each prefix in a single pass; calling list.count once per element
    # made this step quadratic in the number of names.
    frequencies = dict()
    for ent in names:
      prefix = ent[:5]
      frequencies[prefix] = frequencies.get(prefix, 0) + 1

    # `frequencies` is empty when there are no names, so the division below is
    # never evaluated with a zero total.
    total = len(names)
    prefixes[e_type] = {
        key.casefold().strip()
        for key, val in frequencies.items()
        if val > 10 or val / total > 0.001
    }

  return prefixes

In [0]:
# Extracts the set of common entity suffixes, organized by entity class, in the training set

def extract_suffixes(entities):
  """Find the common 5-character suffixes of the entity names of each class.

  A suffix (the last five characters of a name, or the whole name when it is
  shorter) is kept when it occurs more than 10 times within its class, or when
  it accounts for more than 0.1% of the names of that class.

  Args:
    entities: Dict mapping an entity class to a collection of names.

  Returns:
    A dict mapping each class to the set of kept suffixes, case-folded and
    stripped. The classes "drug", "drug_n", "brand" and "group" are always
    present, with an empty set when `entities` has no names for them.
  """
  suffixes_by_type = {e_type: set() for e_type in ("drug", "drug_n", "brand", "group")}

  for e_type, names in entities.items():
    # Count each suffix in a single pass; calling list.count once per element
    # made this step quadratic in the number of names.
    frequencies = dict()
    for ent in names:
      suffix = ent[-5:]
      frequencies[suffix] = frequencies.get(suffix, 0) + 1

    # `frequencies` is empty when there are no names, so the division below is
    # never evaluated with a zero total.
    total = len(names)
    suffixes_by_type[e_type] = {
        key.casefold().strip()
        for key, val in frequencies.items()
        if val > 10 or val / total > 0.001
    }

  return suffixes_by_type

In [0]:
# Extracts all non-labelled words in the training set

def extract_non_entities(inputdir, entities):
  """Collect the tokens of the training sentences that are not known entities.

  The ``text`` attribute of every ``<sentence>`` element in the XML files of
  `inputdir` is normalised (punctuation replaced by spaces, case-folded,
  stripped) and tokenised with the Treebank tokenizer. A token is kept when it
  is neither a known entity name nor one of the tokens of a known entity name.

  Args:
    inputdir: Directory holding the XML files. A trailing path separator is
      optional.
    entities: Dict mapping an entity class to a collection of known names.

  Returns:
    The set of normalised tokens that do not belong to any known entity.
  """
  # Raw string: the backslash is a literal character to strip, not an escape.
  punctuations = r'''![]{};:'"\,<>./?@#$%^&*_~'''
  to_spaces = str.maketrans(punctuations, " " * len(punctuations))

  # One tokenizer instance is reused for every name and every sentence.
  tokenizer = TreebankWordTokenizer()

  # `known` holds the full entity names plus each of their individual tokens,
  # so that a word of a multi-word entity is not reported as a non-entity.
  known = set()
  for names in entities.values():
    known.update(names)
  for item in list(known):
    for token in tokenizer.tokenize(item):
      known.add(token.casefold().strip())

  non_entities = set()

  for filename in os.listdir(inputdir):
    # Binary mode lets the XML parser honour the declared encoding; the file
    # is closed as soon as the document has been parsed.
    with open(os.path.join(inputdir, filename), 'rb') as xml_file:
      tagged = parse(xml_file)

    for sentence in tagged.getElementsByTagName("sentence"):
      # getAttribute yields "" for a sentence without a text attribute, which
      # simply produces no tokens.
      stext = sentence.getAttribute("text").translate(to_spaces).casefold().strip()

      for token in tokenizer.tokenize(stext):
        # Store the same normalised form that is used for the lookup.
        token = token.casefold().strip()
        if token not in known:
          non_entities.add(token)

  return non_entities

In [0]:
# Modifies the set of known entities and non-entities with data from external knowledge sources.

def extract_external_data(entities, non_entities,
                          external_dir='drive/My Drive/UPC/Semester2/external/'):
  """Extend the known entities and non-entities with external resources.

  Three files are read from `external_dir`:

  * ``DrugBank.txt`` - lines of the form ``name|type``; each name is added to
    the entity set of its type.
  * ``HSDB.txt`` - one name per line; a name is added to "drug_n" unless it is
    already a known "drug", "brand" or "group".
  * ``en.txt`` - plain sentences; every token that is not a known entity name
    is added to the non-entities.

  Names are normalised the same way as in `extract_entities`: punctuation is
  replaced by spaces, then the text is case-folded and stripped, so a name
  ending in punctuation does not keep a stray trailing space.

  Args:
    entities: Dict mapping an entity class to a set of names; modified in
      place.
    non_entities: Set of non-entity tokens; modified in place.
    external_dir: Directory holding the three external files. The default is
      relative to the current working directory.

  Returns:
    The 4-tuple ``(entities, prefixes, suffixes, non_entities)``, where the
    prefixes and suffixes are recomputed from the extended entities.

  Raises:
    KeyError: If a DrugBank line names a type that is not a key of `entities`.
  """
  # Raw string: the backslash is a literal character to strip, not an escape.
  punctuations = r'''![]{};:'"\,<>./?@#$%^&*_~'''
  to_spaces = str.maketrans(punctuations, " " * len(punctuations))

  def normalise(text):
    # Strip AFTER removing punctuation so no leading/trailing space survives.
    return text.translate(to_spaces).casefold().strip()

  # All three files are opened up front (a missing file fails before anything
  # is modified) and are closed together, even if an error occurs.
  with open(os.path.join(external_dir, 'DrugBank.txt')) as drugbank, \
       open(os.path.join(external_dir, 'en.txt')) as en, \
       open(os.path.join(external_dir, 'HSDB.txt')) as hsdb:

    for line in drugbank.read().splitlines():
      if not line.strip():
        continue  # tolerate blank lines
      fields = line.split("|")
      e_type = fields[1]
      entities[e_type].add(normalise(fields[0]))

    for line in hsdb.read().splitlines():
      name = normalise(line)
      if not name:
        continue  # blank line, or nothing left after removing punctuation
      if name not in entities["drug"] and name not in entities["brand"] and name not in entities["group"]:
        entities["drug_n"].add(name)

    # A string cannot be both an entity and a non-entity.
    for names in entities.values():
      non_entities.difference_update(names)

    known = set()
    for names in entities.values():
      known.update(names)

    tokenizer = TreebankWordTokenizer()
    for line in en.read().splitlines():
      for token in tokenizer.tokenize(normalise(line)):
        token = token.casefold().strip()
        if token not in known:
          non_entities.add(token)

  prefixes = extract_prefixes(entities)
  suffixes = extract_suffixes(entities)

  return entities, prefixes, suffixes, non_entities

# Data Analysis
These functions are utilities to perform basic data analysis, or to check for data consistency.

In [0]:
# Function to check the consistency of the saved training files.
# Checks that the saved entity file contains no upper-case characters or punctuation,
# and that no known entity also appears in the set of non-entities.

def is_data_correct():
  """Check the saved "large" data files for consistency.

  Two checks are made, and the first failure is printed and reported:

  1. No line of ``entities_large.txt`` contains an upper-case character or one
     of the checked punctuation characters.
  2. No entity returned by `read_saved_large` is also in the non-entities.

  Returns:
    True when both checks pass, False otherwise.
  """
  # ':' and '_' are not checked: every line has the form "<type>:<name>" and
  # the type "drug_n" contains an underscore.
  punctuations = r'''![]{};'"\,<>./?@#$%^&*~'''

  # The file is closed on every exit path, including the early returns.
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/entities_large.txt') as ents:
    for line in ents.read().splitlines():
      if line.isupper():
        print("Line is upper")
        print(line)
        return False
      if any(x.isupper() for x in line):
        print("Line has upper")
        print(line)
        return False

      if any(char in punctuations for char in line):
        print("Line has punc")
        print(line)
        return False

  # read_saved_large returns four values; the prefixes and suffixes are not
  # needed for this check.
  entities, _prefixes, _suffixes, non_entities = read_saved_large()

  offenders = set()
  for names in entities.values():
    offenders.update(name for name in names if name in non_entities)

  if offenders:
    print("Entity also in nonentities")
    print(len(offenders))
    print(offenders)
    return False

  return True

In [0]:
# Computes the fraction of entities in a list that end with 's'

def ends_with_s(entities):
  """Return the fraction of names whose last character is 's' or 'S'.

  Args:
    entities: Sized collection of strings.

  Returns:
    A float between 0 and 1; 0.0 for an empty collection (instead of raising
    ZeroDivisionError).
  """
  if not entities:
    return 0.0

  # entity[-1:] is "" for an empty string, so empty names simply do not match.
  count = sum(1 for entity in entities if entity[-1:].casefold().strip() == "s")
  return count / len(entities)


In [0]:
# Computes the fraction of entities in a list that are mixed-case (at least one upper-case character, but not entirely upper-case)

def one_capital(entity):
  """Return the fraction of names that are mixed-case.

  A name counts when it contains at least one upper-case character but is not
  entirely upper-case (e.g. "Aspirin" or "pH", but not "IBUPROFEN").

  Args:
    entity: Sized collection of strings to inspect. The parameter keeps its
      original name, but it is the whole collection, not a single name.

  Returns:
    A float between 0 and 1; 0.0 for an empty collection.
  """
  # Work on the argument itself: the previous code looped over a global and
  # divided by the length of a single string.
  names = entity
  if not names:
    return 0.0

  count = sum(
      1 for name in names
      if not name.isupper() and any(ch.isupper() for ch in name)
  )
  return count / len(names)


In [0]:
# Prints, for each entity class, the shortest and the longest name together with their lengths

def min_max_lengths(entities):
  """Print the shortest and longest name of every entity class.

  For each class five lines are printed: the class, the shortest name, its
  length, the longest name, and its length. When several names share the
  extreme length, the first one encountered is reported. A class without
  names prints an empty name and a length of 0 for both extremes.

  Args:
    entities: Dict mapping an entity class to a collection of names.

  Returns:
    None; the result is only printed.
  """
  for e_type, names in entities.items():
    # The minimum and the maximum are found independently, so a name can be
    # both (e.g. the only name of a class), and no length sentinel is needed.
    minvalue = min(names, key=len, default="")
    maxvalue = max(names, key=len, default="")

    print(e_type)
    print(minvalue)
    print(len(minvalue))
    print(maxvalue)
    print(len(maxvalue))

In [0]:
# Finds the set of entity classes in the training data

def find_entity_types(inputdir='drive/My Drive/UPC/Semester2/data/Train/'):
  """List the distinct entity types used in the XML files of a directory.

  Args:
    inputdir: Directory holding the XML files. The default is the training
      directory, relative to the current working directory.

  Returns:
    A list of the distinct values of the ``type`` attribute of all
    ``<entity>`` elements, in order of first appearance.
  """
  types = []
  for filename in os.listdir(inputdir):
    # Binary mode lets the XML parser honour the declared encoding; the file
    # is closed as soon as the document has been parsed.
    with open(os.path.join(inputdir, filename), 'rb') as xml_file:
      tagged = parse(xml_file)

    for entity in tagged.getElementsByTagName("entity"):
      types.append(entity.getAttribute('type'))

  # dict.fromkeys removes duplicates while keeping the first-seen order.
  return list(dict.fromkeys(types))

# Reading and writing training files
These functions are for reading or saving training data to files, for later use.

1.   **entities.txt, prefixes.txt, suffixes.txt, non_entities.txt** Contains known entities extracted from the training set.
2.   **entities_large.txt, prefixes_large.txt, suffixes_large.txt, non_entities_large.txt** Contains known entities extracted from the training set and external sources.

**Sources**

The sources of the external data were the DrugBank annotated file, HSDB annotated file, and the 'EN' set of random English sentences, found in the lab directory.

In [0]:
def write_training_data():
  # Extracts the training data and writes each collection to its own text file
  # in the "extracted" directory, echoing every file to the cell output with
  # `cat` right after it has been written.
  # NOTE(review): this input path is relative to the working directory, whereas
  # the output paths below are absolute — confirm the notebook is always run
  # from /content.
  inputdir = 'drive/My Drive/UPC/Semester2/data/Train/'
  entities, prefixes, suffixes, non_entities = extract_training_data(inputdir)

  # Entities: one "<type>:<name>" line per known entity. 'w+' truncates any
  # existing file.
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/entities.txt', 'w+') as entities_file:
    for e_type in entities.keys():
      for name in entities[e_type]:
        entities_file.write(e_type + ":" + name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/entities.txt

  # Prefixes: one "<type>:<prefix>" line per common prefix.
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/prefixes.txt', 'w+') as prefixes_file:
    for e_type in prefixes.keys():
      for name in prefixes[e_type]:
        prefixes_file.write(e_type + ":" + name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/prefixes.txt
  
  # Suffixes: one "<type>:<suffix>" line per common suffix.
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/suffixes.txt', 'w+') as suffixes_file:
    for e_type in suffixes.keys():
      for name in suffixes[e_type]:
        suffixes_file.write(e_type + ":" + name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/suffixes.txt

  # Non-entities have no type, so each line is just the token.
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/non_entities.txt', 'w+') as non_entities_file:
    for name in non_entities:
      non_entities_file.write(name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/non_entities.txt

In [0]:
def write_large(entities, prefixes, suffixes, non_entities):
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/entities_large.txt', 'w+') as entities_file:
    for e_type in entities.keys():
      for name in entities[e_type]:
        entities_file.write(e_type + ":" + name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/entities_large.txt

  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/prefixes_large.txt', 'w+') as prefixes_file:
    for e_type in prefixes.keys():
      for name in prefixes[e_type]:
        prefixes_file.write(e_type + ":" + name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/prefixes.txt
  
  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/suffixes_large.txt', 'w+') as suffixes_file:
    for e_type in suffixes.keys():
      for name in suffixes[e_type]:
        suffixes_file.write(e_type + ":" + name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/suffixes_large.txt

  with open('/content/drive/My Drive/UPC/Semester2/data/extracted/non_entities_large.txt', 'w+') as non_entities_file:
    for name in non_entities:
      non_entities_file.write(name + "\n") 
  !cat /content/drive/My\ Drive/UPC/Semester2/data/extracted/non_entities_large.txt

In [0]:
def read_saved(directory="/content/drive/My Drive/UPC/Semester2/data/extracted/"):
  """Load the training-only data set saved by `write_training_data`.

  ``prefixes.txt``, ``suffixes.txt`` and ``entities.txt`` hold one
  "<type>:<value>" line per element; ``non_entities.txt`` holds one token per
  line. Every value is case-folded and stripped while loading.

  Args:
    directory: Directory holding the four files. Defaults to the "extracted"
      directory on the mounted drive.

  Returns:
    The 4-tuple ``(entities, prefixes, suffixes, non_entities)``. The first
    three are dicts with the keys "drug", "drug_n", "group" and "brand"
    mapping to sets; the last one is a set.

  Raises:
    KeyError: If a line names a type other than the four known ones.
    IndexError: If a non-blank line contains no ':' separator.
  """
  e_types = ("drug", "drug_n", "group", "brand")
  prefixes = {e_type: set() for e_type in e_types}
  suffixes = {e_type: set() for e_type in e_types}
  entities = dict((e_type, set()) for e_type in e_types)

  non_entities = set()

  # The three typed files share one format, so they share one loading loop.
  typed_files = (
      ("prefixes.txt", prefixes),
      ("suffixes.txt", suffixes),
      ("entities.txt", entities),
  )
  for filename, target in typed_files:
    # `with` closes the file even when a malformed line raises.
    with open(os.path.join(directory, filename)) as typed_file:
      for line in typed_file.read().splitlines():
        if not line.strip():
          continue  # tolerate blank lines
        # Split on the first ':' only, so the value is kept whole.
        split = line.split(":", 1)
        e_type = split[0]
        name = split[1]
        target[e_type].add(name.casefold().strip())

  with open(os.path.join(directory, "non_entities.txt")) as non_entities_file:
    for line in non_entities_file.read().splitlines():
      non_entities.add(line.casefold().strip())

  return entities, prefixes, suffixes, non_entities

In [0]:
def read_saved_large(directory="/content/drive/My Drive/UPC/Semester2/data/extracted/"):
  """Load the extended data set saved by `write_large`.

  ``prefixes_large.txt``, ``suffixes_large.txt`` and ``entities_large.txt``
  hold one "<type>:<value>" line per element; ``non_entities_large.txt`` holds
  one token per line. Every value is case-folded and stripped while loading.

  Args:
    directory: Directory holding the four files. Defaults to the "extracted"
      directory on the mounted drive.

  Returns:
    The 4-tuple ``(entities, prefixes, suffixes, non_entities)``. The first
    three are dicts with the keys "drug", "drug_n", "group" and "brand"
    mapping to sets; the last one is a set.

  Raises:
    KeyError: If a line names a type other than the four known ones.
    IndexError: If a non-blank line contains no ':' separator.
  """
  e_types = ("drug", "drug_n", "group", "brand")
  prefixes = {e_type: set() for e_type in e_types}
  suffixes = {e_type: set() for e_type in e_types}
  entities = dict((e_type, set()) for e_type in e_types)

  non_entities = set()

  # The three typed files share one format, so they share one loading loop.
  typed_files = (
      ("prefixes_large.txt", prefixes),
      ("suffixes_large.txt", suffixes),
      ("entities_large.txt", entities),
  )
  for filename, target in typed_files:
    # `with` closes the file even when a malformed line raises.
    with open(os.path.join(directory, filename)) as typed_file:
      for line in typed_file.read().splitlines():
        if not line.strip():
          continue  # tolerate blank lines
        # Split on the first ':' only, so the value is kept whole.
        split = line.split(":", 1)
        e_type = split[0]
        name = split[1]
        target[e_type].add(name.casefold().strip())

  with open(os.path.join(directory, "non_entities_large.txt")) as non_entities_file:
    for line in non_entities_file.read().splitlines():
      non_entities.add(line.casefold().strip())

  return entities, prefixes, suffixes, non_entities