<a href="https://colab.research.google.com/github/hitharr/Relation-Extraction-Using-NLP/blob/main/NLP_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**SETUP**
---



In [None]:
#@title Imports

from google.colab import files
from IPython.display import display

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 

import spacy
from spacy import displacy
from nltk.corpus import wordnet as wn
import itertools
import networkx as nx
import matplotlib.pyplot as plt

import re
import pickle
import time
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
#@title Save and Load Pickle File Functions

# Save pickle files
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

# Load pickle files
def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [None]:
#@title Feature Selection and Addition to Dataframe

# Perform feature selection
def feature_selection(dataset, t = 0.16):
    sel = VarianceThreshold(threshold=t)
    sel.fit(dataset)
    return (sel.transform(dataset), sel.get_support(indices=True))

# Convert counts to columns
def toColumn(counts):
    i = 0
    data = dict()
    for d in counts:
        for k in d:
            if k in data: data[k].append(d[k])
            else: data.update({k: [0]*i + [d[k]]})
        i += 1

    # Append lists with zeros if needed
    for key in data:
        data[key] += [0]*(corr_len - len(data[key]))

    return data

# Add columns to pandas dataframe
def addKeys(dataset, data):
    start = time.time()
    for key in data:
        dataset = np.append(dataset, np.expand_dims(np.array(data[key]), -1), 1)
        
    # Remove useless features
    dataset, columns = feature_selection(dataset)
    return (dataset, columns)

def addAllKeys(dataset, data):
    start = time.time()
    cols = []
    for key in data:
        dataset = np.append(dataset, np.expand_dims(np.array(data[key]), -1), 1)
        cols.append(key)
    return dataset, cols   

In [None]:
#@title Parse Label Function
def parse_label(dataset):
  rel = []
  dir = []
  i = 0;
  for row in dataset['label']:
    if row == "Other":
      rel.append("Other")
      dir.append("Other")
    else:
      rel.append(row[0:-7])
      dir.append(row[-7:])

  dataset.insert(1, 'rel', rel)
  dataset.insert(2, 'dir', dir)
  return dataset

In [None]:
#@title Model Metrics, Predictions, and Accuracy Functions

def print_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    macro_precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')
    fB = fbeta_score(y_true, y_pred, average='macro', beta=0.5)
    print("Accuracy:", accuracy)
    print("Macro Precision: ", macro_precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("FBeta Score:", fB)
    print()

def model_predictions(X_test, y_test, stack, label):
    print("\n\nMaking Predictions for", label)
    print("=======================================")
    start_time = time.time()
    y_pred = stack.predict(X_test)
    print("Time to make predictions:", time.time()-start_time, "seconds")
    print_metrics(y_test, y_pred)
    return y_test, y_pred

def combined_model_accuracy(y_pred_rel, y_pred_dir, y_test_label):
    inverse_rel = le_rel.inverse_transform(y_pred_rel.astype(int))
    inverse_dir = le_dir.inverse_transform(y_pred_dir.astype(int))
    combined_relation = []
    for rel in inverse_rel:
      if rel == "Other" or inverse_dir[0] == "Other":
        combined_relation.append("Other")
      else:
        combined_relation.append(rel+inverse_dir[0])
    print("\n\nCombined Relation and Direction Metrics")
    print("=======================================")
    print("Predicted:", combined_relation[0])
    print()
    y_test_label = le_label.transform([y_test_label])
    combined = le_label.transform(combined_relation)
    
    print_metrics(y_test_label, combined)

def combined_model_accuracy_test(y_pred_rel, y_pred_dir, y_test_label):
    inverse_rel = le_rel.inverse_transform(y_pred_rel.astype(int))
    inverse_dir = le_dir.inverse_transform(y_pred_dir.astype(int))
    combined_relation = []
    i = 0
    while i < len(inverse_rel):
      rel = inverse_rel[i]

      if rel == "Other" or inverse_dir[i] == "Other":
        combined_relation.append("Other")

      else:
        combined_relation.append(rel+inverse_dir[i])
      i += 1

    combined = le_label.transform(combined_relation)
    print("\n\nCombined Relation and Direction Metrics")
    print("=======================================")
    print_metrics(y_test_label, combined)

**UPLOAD FILES** *(expand to upload)*
---

In [None]:
#@title Upload Files

#Only uncomment the below line if this is your first time running
#upload semeval_train.txt and .pkl files
#upload dataset.csv file for features
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

**CORPUS READER, DATA PROCESSOR, NLP PIPELINE, AND GET SENTENCE LIST**
---



In [None]:
#@title Corpus Reader

###############################################################################
# CorpusReader
#
#   Input:
#       lines       - entire training/test set file content
#
#   Output: (sentence, e1, e2, relation)
#       sentence    - sentence in between quotes excluding e1 and e2 tags
#       e1          - name of e1
#       e2          - name of e2
#       relation    - name of relation with direction
#       e1_loc      - index of first word from e1 in sentence
#       e2_loc      - index of first word from e2 in sentence
#
#   Description:
#       Return examples one at a time
def CorpusReader(lines):
    
    # Loop through lines
    i = 0
    while i + 1 < len(lines):
            
        #######################################################################
        # Get sentence line
        sentence_line = lines[i]
        sentence_no_quotes = sentence_line.split('"', 1)[1][:-2]
        
        # Get sentence by removing tags, quotes, and newline characters
        sentence = sentence_no_quotes
        for r in ['<e1>', '</e1>', '<e2>', '</e2>']:
            sentence = sentence.replace(r, " ")
        
        # Replace multiple spaces with single spaces
        sentence = re.sub(' +', ' ', sentence)

        # Split sentence into words (separate e1 and e2 tags into separate words)
        sentence_split = sentence_no_quotes
        sentence_split = sentence_split.replace('<', " <")
        sentence_split = sentence_split.replace('>', "> ")
        sentence_split = sentence_split.split()
        
        # Loop through words in sentence and find entities
        j = 0
        while j < len(sentence_split):
            word = sentence_split[j]
            
            # Get e1
            if word == '<e1>':
                e1 = ""
                e1_loc = j - 1
                j += 1
                while sentence_split[j] != '</e1>':
                    e1 += " " + sentence_split[j]
                    j += 1
            
            # Get e2
            elif word == '<e2>':
                e2 = ""
                e2_loc = j - 3
                j += 1
                while sentence_split[j] != '</e2>':
                    e2 += " " + sentence_split[j]
                    j += 1
                break
            
            # Increment to next word
            j += 1
                
        #######################################################################
        # Get relation line
        relation_line = lines[i + 1]
            
        #######################################################################
        # Increment to next example
        #   skip sentence line, relation line, two new lines
        i += 4
        
        # Yield return example details
        yield (sentence.strip(), e1.strip(), e2.strip(), relation_line.strip(), e1_loc, e2_loc)

In [None]:
#@title NLP Pipeline
###############################################################################
# NLPPipeline
#
#   Input: (sentence, e1, e2)
#       sentence              - sentence in between quotes excluding e1 and e2 tags
#       e1                    - name of e1
#       e2                    - name of e2
#
#   Output: (dep_tree, ner_tags, wordNet_extractions, wordNet_holo_meronymy_relations, wordnet_entity_relations, pipeline_out, e_tags, framenet_extractions)
#       dep_tree                          - dependency parsing tree from sentence
#       ner_tags                          - named entity tags
#       wordNet_extractions               - hypernymns, hyponyms, meronyms, and holonyms Wordnet extractions
#       wordNet_holo_meronymy_relations   - holonyms and meronym matches within the sentence
#       wordnet_entity_relations          - direct holonyms and meronyms between the entities
#       pipeline_out                      - output of NLP pipeline on sentence
#       e_tags                            - entity tags as result of NLP pipeline
#   Description:
#       Implements a NLP pipeline to extract NLP based features from natural
#       language statements
def NLPPipeline(sentence, e1, e2, display=False):
  nlp=spacy.load('en_core_web_sm')

  # From https://ryan-cranfill.github.io/sentiment-pipeline-sklearn-3/
  def pipelinize(function, active=True):
    def list_comprehend_a_function(list_or_series, active=True):
        if active:
            return [function(i) for i in list_or_series]
        else: # if it's not active, just pass it right back
            return list_or_series
    return FunctionTransformer(list_comprehend_a_function, validate=False, kw_args={'active':active})

  def word_tokenizer(sentence):
    out = nltk.word_tokenize(sentence)
    if display: print("Tokenized words:", out)
    return (out)

  def token_lemmatizer(tokens):
    lemmatizer = WordNetLemmatizer() 
    lemmatized_tokens=[]
    for token in tokens:
      lemmatized_tokens.append(lemmatizer.lemmatize(token))

    if display: print('Lemmatized words:', lemmatized_tokens)
    return lemmatized_tokens

  def tag_lemmatized(lemmatized_tokens):
    out = nltk.pos_tag(lemmatized_tokens)
    if display: print("POS tagged:", out)
    return (out)

  def dep_parse(sentence, e1, e2):
    e = []
    for token in nlp(sentence):
      # print(token.text,'=>',token.dep_,'=>',token.head.text)
      for child in token.children:
        e.append(('{0}'.format(token.lower_), '{0}'.format(child.lower_)))
    g = nx.Graph(e)

    #nx.draw(g, with_labels=True)
    #plt.show()  
    try:
      path_length_between_entities = nx.shortest_path_length(g, source=e1, target=e2)    
      path_between_entities =  nx.shortest_path(g, source=e1, target=e2)
      path_between_entities.remove(e1)
      path_between_entities.remove(e2)
    except:
      path_length_between_entities=np.nan
      path_between_entities=np.nan
    
    if display: displacy.render(nlp(sentence),jupyter=True)
    return (path_length_between_entities -1, path_between_entities)
    

  def print_tags(text):
    entText = nlp(text)
    for entity in entText.ents:
      print(entity.text, entity.label_)

  def get_tags(text):
    entText = nlp(text)
    ner = []
    tags = []
    for entity in entText.ents:
      ner.append(entity.text)
      tags.append(entity.label_)
    return (ner, tags)

  def ner_tags(e1, e2):
    e1_ner, e1_tag = get_tags(e1)
    e2_ner, e2_tag = get_tags(e2)
    if display:
      print('e1 ner tags:', e1_ner)
      print('e1 tag:', e1_tag)
      print('e2 ner tags:', e2_ner)
      print('e2 tag:', e2_tag)
    return (e1_ner, e1_tag, e2_ner,e2_tag)

  def get_wordnet_pos(tag):
    if tag.startswith('J'):
      return 'a'
    elif tag.startswith('V'):
      return 'v'
    elif tag.startswith('N'):
      return 'n'
    elif tag.startswith('R'):
      return 'r'
    else:
      return '' 

  def wordNet_extractions(word_pos_list):
    wordnet_extractions = []
    for word, nltk_pos in word_pos_list[0]:
      wordnet_pos = get_wordnet_pos(nltk_pos)
      if wn.synsets(word) and not wordnet_pos=='':
        synset = wn.synsets(word)[0]
        wordnet_extractions.append((word, nltk_pos, synset.hypernyms(), synset.hyponyms(), synset.member_meronyms(), synset.member_holonyms()))    
      else:
        wordnet_extractions.append((word, nltk_pos, wordnet_pos))

    if display: print('WordNet Extractions:', wordnet_extractions)
    return wordnet_extractions

  def wordNet_holo_meronymy_relations(word_pos_list):
    wordnet_holo_mero = []
    synsets_list = []
    for word, nltk_pos in word_pos_list[0]:
      wordnet_pos = get_wordnet_pos(nltk_pos)
      if wn.synsets(word) and not wordnet_pos=='':
        synset = wn.synsets(word)[0]
        synsets_list.append(synset)
        if synset.member_holonyms():
          wordnet_holo_mero.append(synset.member_holonyms())
        if synset.member_meronyms():
          wordnet_holo_mero.append(synset.member_meronyms())        
    wordnet_cleaned_list = list(itertools.chain(*wordnet_holo_mero))
    check_sentence_relations =  any(item in wordnet_cleaned_list for item in synsets_list)
    return check_sentence_relations

  #Note: this did not return many matches from the entities
  def wordnet_entity_relations(e1, e2):
    if not wn.synsets(e1) or not wn.synsets(e2):
      return False;
    wordnet_holo_mero_e1=[]
    wordnet_holo_mero_e2=[]
    for synset_e1 in wn.synsets(e1):
      if synset_e1.member_holonyms():
        wordnet_holo_mero_e1.append(synset_e1.member_holonyms())
      if synset_e1.member_meronyms():
        wordnet_holo_mero_e1.append(synset_e1.member_meronyms())   
    for synset_e2 in wn.synsets(e2):
      if synset_e2.member_holonyms():
        wordnet_holo_mero_e2.append(synset_e2.member_holonyms())
      if synset_e2.member_meronyms():
        wordnet_holo_mero_e2.append(synset_e2.member_meronyms())   
    wordnet_holo_mero_e1 = list(itertools.chain(*wordnet_holo_mero_e1))
    wordnet_holo_mero_e2 = list(itertools.chain(*wordnet_holo_mero_e2))
    if not wordnet_holo_mero_e1 or not wordnet_holo_mero_e2:
      return False
    else:     
      check1 =  any(item in wn.synsets(e1) for item in wordnet_holo_mero_e2)
      check2 =  any(item in wn.synsets(e2) for item in wordnet_holo_mero_e1)
      return (check1 or check2)

  def wordnet_similarity(e1, e2):
    if not wn.synsets(e1) or not wn.synsets(e2):
        return 'NA'
    else:
      return wn.synsets(e1)[0].path_similarity(wn.synsets(e2)[0])

  estimators = [('tokenizer', pipelinize(word_tokenizer)), ('lemmatizer', pipelinize(token_lemmatizer)), ('posTagger', pipelinize(tag_lemmatized)) ] 
  pipeline = Pipeline(estimators)
  new_sentence = pipeline.transform([sentence])
  new_e1 = pipeline.transform([e1])
  new_e2 = pipeline.transform([e2])

  return (dep_parse(sentence, e1, e2), ner_tags(e1,e2), wordNet_extractions(new_sentence), wordNet_holo_meronymy_relations(new_sentence), wordnet_entity_relations(e1,e2), new_sentence, (new_e1, new_e2), wordnet_similarity(e1, e2))

In [None]:
#@title DataProcessor

###############################################################################
# DataProcessor
#
#   Input:
#       lines       - lines from training/test set file content
#
#   Output: (data, pos_counts, lem_counts, bw_pos_counts, pos_bigram, lem_bigram, pos_trigram)
#       data    - sentence in between quotes excluding e1 and e2 tags
#       pos_counts          - part of speech counts
#       lem_counts          - lemma counts
#       bw_pos_counts       - bw parts of speech counts
#       pos_bigram          - parts of speech bigram
#       lem_bigram          - lemma bigrams
#       pos_trigram         - parts of speech trigrams
#
#   Description:
#       Return examples processed data
start_time = time.time()

def DataProcessor(lines):
  # List columns for dataset
  columns = ['label', 'e1_ner_tag_len', 'e2_ner_tag_len', 'e1_ner_tag', 'e2_ner_tag',
            'sentence_len', 'entity_distance', 'wordNet_holo_meronymy_relations',
            'wordnet_entity_relations','path_len_bw_ent']

  # Initialize data dict
  data = dict()
  for key in columns:
      data.update({key: []})

  # Lists for unigram counts
  #token_counts = []
  pos_counts = []
  lem_counts = []
  bw_pos_counts = []
  #bw_lem_counts = []

  # Lists for bigram counts
  pos_bigram = []
  lem_bigram = []

  # Lists for trigram counts
  pos_trigram = []

  # Lists for counts of dependency path between entitity
  dep_counts = []

  # Loop through examples in corpus
  i = 1
  for sentence, e1, e2, relation, e1_loc_orig, e2_loc_orig in CorpusReader(lines):
      dep_tree, ner_tags, wordNet_extractions, wordNet_holo_meronymy_relations, wordnet_entity_relations, pipeline_out, e_tags, wordnet_similarity = NLPPipeline(sentence, e1, e2)
      e1_ner, e1_tags, e2_ner, e2_tags = ner_tags
      e1_tag, e2_tag = e_tags
      path_len_bw_ent, path_bw_ents = dep_tree

      ############################################################################
      # Get token counts
      sentence_split = sentence.split()
      """token_dict = dict()
      for word in sentence_split:
          if "token_" + word in token_dict: token_dict["token_" + word] += 1
          else: token_dict.update({"token_" + word: 1})
        
      token_counts.append(token_dict)"""

      ############################################################################
      # Get POS counts, lemmatized token counts, e1 and e2 tag locations
      # Initialize values
      pos_dict = dict()
      lem_dict = dict()
      bw_pos_dict = dict()
      #bw_lem_dict = dict()
      pos_bigram_dict = dict()
      lem_bigram_dict = dict()
      pos_trigram_dict = dict()
      dep_words = dict()
      j = 0
      e1_loc = None
      e2_loc = None
      prev_pos = None
      prev_prev_pos = None
      prev_lem = None

      # Loop through pipeline
      while j < len(pipeline_out[0]):
          word, pos = pipeline_out[0][j]

          # Add POS count
          if "pos_" + pos in pos_dict: pos_dict["pos_" + pos] += 1
          else: pos_dict.update({"pos_" + pos: 1})

          # Add lemmatized token count
          if "lem_" + word in lem_dict: lem_dict["lem_" + word] += 1
          else: lem_dict.update({"lem_" + word: 1})

          # Add bigram POS count and bigram lemmatized token count
          if prev_pos != None:
              if "bigram_pos_" + prev_pos + "_" + pos in pos_bigram_dict: pos_bigram_dict["bigram_pos_" + prev_pos + "_" + pos] += 1
              else: pos_bigram_dict.update({"bigram_pos_" + prev_pos + "_" + pos: 1})

              if "bigram_lem_" + prev_lem + "_" + word in lem_bigram_dict: lem_bigram_dict["bigram_lem_" + prev_lem + "_" + word] += 1
              else: lem_bigram_dict.update({"bigram_lem_" + prev_lem + "_" + word: 1})

          # Add trigram POS count
          if prev_prev_pos != None:
              if "trigram_pos_" + prev_prev_pos + "_" + prev_pos + "_" + pos in pos_trigram_dict: pos_trigram_dict["trigram_pos_" + prev_prev_pos + "_" + prev_pos + "_" + pos] += 1
              else: pos_trigram_dict.update({"trigram_pos_" + prev_prev_pos + "_" + prev_pos + "_" + pos: 1})

          # Find entity token locations
          if e1_loc == None and e1_loc_orig <= j and len(e1_tag[0]) > 0 and word == e1_tag[0][0][0]: e1_loc = j
          elif e2_loc == None and e2_loc_orig <= j and len(e2_tag[0]) > 0 and word == e2_tag[0][0][0]: e2_loc = j
          
          # Update POS counts and lemmatized token counts between e1 and e2
          elif len(e1_tag[0]) > 0 and len(e2_tag[0]) > 0 and e1_loc != None and e2_loc == None: 
              if "bw_pos_" + pos in bw_pos_dict: bw_pos_dict["bw_pos_" + pos] += 1
              else: bw_pos_dict.update({"bw_pos_" + pos: 1})

              #if "bw_lem_" + word in bw_lem_dict: bw_lem_dict["bw_lem_" + word] += 1
              #else: bw_lem_dict.update({"bw_lem_" + word: 1})

          j += 1
          prev_prev_pos = prev_pos
          prev_pos = pos
          prev_lem = word

      # Print if e1/e2 are not found
      if e1_loc == None: print(i, "\t: e1_loc = NONE")
      if e2_loc == None: print(i, "\t: e2_loc = NONE")

      pos_counts.append(pos_dict)
      lem_counts.append(lem_dict)
      bw_pos_counts.append(bw_pos_dict)
      #bw_lem_counts.append(bw_lem_dict)
      pos_bigram.append(pos_bigram_dict)
      lem_bigram.append(lem_bigram_dict)
      pos_trigram.append(pos_trigram_dict)

      ############################################################################
      # Add relation as label to data
      data['label'].append(relation)

      # Add sentence length to data
      data['sentence_len'].append(len(sentence_split))

      # Add distance between entities
      if e1_loc != None and e2_loc != None: data['entity_distance'].append(e2_loc - e1_loc)
      else: data['entity_distance'].append(np.nan)

      ############################################################################
      # Add WordNet relations
      data['wordNet_holo_meronymy_relations'].append(wordNet_holo_meronymy_relations)
      data['wordnet_entity_relations'].append(wordnet_entity_relations)

      #Add Dependency Path Length Between Entities
      data['path_len_bw_ent'].append(path_len_bw_ent)
      for word in path_bw_ents:
          if "dep_" + word in dep_words: dep_words["dep_" + word] += 1
          else: dep_words.update({"dep_" + word: 1})
      dep_counts.append(dep_words)
      
      ############################################################################
      # Add first ner tag for e1 if available
      if len(e1_tags) == 0: data['e1_ner_tag'].append(np.nan)
      elif len(e1_tags) == 1: data['e1_ner_tag'].append(e1_tags[0])
      else:
          # Look for PERSON or ORG tags
          found = False
          for tag in e1_tags:
              if tag == 'PERSON' or tag == 'ORG':
                  data['e1_ner_tag'].append(tag)
                  found = True
                  break
          
          # No PERSON or ORG tags -> add first tag
          if not found: data['e1_ner_tag'].append(e1_tags[0])

      # Add first ner tag for e2 if available
      if len(e2_tags) == 0: data['e2_ner_tag'].append(np.nan)
      elif len(e2_tags) == 1: data['e2_ner_tag'].append(e2_tags[0])
      else:
          # Look for PERSON or ORG tags
          found = False
          for tag in e2_tags:
              if tag == 'PERSON' or tag == 'ORG':
                  data['e2_ner_tag'].append(tag)
                  found = True
                  break
          
          # No PERSON or ORG tags -> add first tag
          if not found: data['e2_ner_tag'].append(e2_tags[0])

      # Add ner tag len for e1 -> likely not useful for ml, but good to know
      data['e1_ner_tag_len'].append(len(e1_tags))

      # Add ner tag len for e2
      data['e2_ner_tag_len'].append(len(e2_tags))

      ############################################################################
      #if i%100 == 0: print(i, "\t:", (time.time() - start_time)/60.0, "minutes")
      i += 1

  #print("Total time:", (time.time() - start_time)/60.0, "minutes")
  return data, pos_counts, lem_counts, bw_pos_counts, pos_bigram, lem_bigram, pos_trigram, dep_counts

In [None]:
#@title Get Sentence List

def get_sentence_list(file_name):
    # Get file lines
    file_ = open(file_name)
    lines = file_.readlines()
    file_.close()

    list_of_sentences = []

    # Loop through examples in corpus
    i = 1
    for sentence, e1, e2, relation, e1_loc, e2_loc in CorpusReader(lines):
        list_of_sentences.append(sentence)

    return list_of_sentences

**CREATE ORIGINAL TRAIN DATAFRAME**
---


In [None]:
#@title Loop through examples in corpus for training file
###############################################################################
# Hard-coded train file
train_file = "semeval_train.txt"

# Get train file lines
train = open(train_file)
train_lines = train.readlines()
train.close()

# Loop through examples in corpus
i = 1
for sentence, e1, e2, relation, e1_loc, e2_loc in CorpusReader(train_lines):
    print()
    print(i)
    print("sentence:", sentence)
    dep_tree, ner_tags, wordNet_extractions, wordNet_holo_meronymy_relations, wordnet_entity_relations, pipeline_out, e_tags, wordnet_similarity = NLPPipeline(sentence, e1, e2, display=True)
    print("dep_tree:", dep_tree)
    print("ner_tags", ner_tags)
    print("wordNet_extractions:", wordNet_extractions)
    print("wordNet_holo_meronymy_relations:", wordNet_holo_meronymy_relations)
    print("wordnet_entity_relations:", wordnet_entity_relations)
    print("wordnet_similarity:", wordnet_similarity)
    print("pipeline_out:", pipeline_out)
    print("e_tags", e_tags)
    print("e1:", e1)
    print("e2:", e2)
    print("e1_loc:", e1_loc)
    print("e2_loc:", e2_loc)
    print("relation:", relation)
    if i==4:  break
    i += 1


1
sentence: The system as described above has its greatest application in an arrayed configuration of antenna elements .
Tokenized words: ['The', 'system', 'as', 'described', 'above', 'has', 'its', 'greatest', 'application', 'in', 'an', 'arrayed', 'configuration', 'of', 'antenna', 'elements', '.']
Lemmatized words: ['The', 'system', 'a', 'described', 'above', 'ha', 'it', 'greatest', 'application', 'in', 'an', 'arrayed', 'configuration', 'of', 'antenna', 'element', '.']
POS tagged: [('The', 'DT'), ('system', 'NN'), ('a', 'DT'), ('described', 'VBN'), ('above', 'IN'), ('ha', 'NN'), ('it', 'PRP'), ('greatest', 'JJS'), ('application', 'NN'), ('in', 'IN'), ('an', 'DT'), ('arrayed', 'JJ'), ('configuration', 'NN'), ('of', 'IN'), ('antenna', 'JJ'), ('element', 'NN'), ('.', '.')]
Tokenized words: ['configuration']
Lemmatized words: ['configuration']
POS tagged: [('configuration', 'NN')]
Tokenized words: ['elements']
Lemmatized words: ['element']
POS tagged: [('element', 'NN')]


e1 ner tags: []
e1 tag: []
e2 ner tags: []
e2 tag: []
WordNet Extractions: [('The', 'DT', ''), ('system', 'NN', [Synset('instrumentality.n.03')], [Synset('audio_system.n.01'), Synset('communication_system.n.01'), Synset('computer_system.n.01'), Synset('containment.n.02'), Synset('control_system.n.01'), Synset('data_system.n.01'), Synset('drainage_system.n.01'), Synset('exhaust.n.02'), Synset('explosive_detection_system.n.01'), Synset('explosive_trace_detection.n.01'), Synset('guidance_system.n.01'), Synset('hookup.n.02'), Synset('inertial_guidance_system.n.01'), Synset('lockage.n.02'), Synset('maze.n.01'), Synset('mechanical_system.n.01'), Synset('navigational_system.n.01'), Synset('network.n.04'), Synset('network.n.05'), Synset('propulsion_system.n.01'), Synset('resonator.n.03'), Synset('scaffolding.n.01'), Synset('security_system.n.01'), Synset('selsyn.n.01'), Synset('shipboard_system.n.01'), Synset('solar_thermal_system.n.01'), Synset('sprinkler_system.n.01'), Synset('synchromesh.n.

e1 ner tags: []
e1 tag: []
e2 ner tags: []
e2 tag: []
WordNet Extractions: [('The', 'DT', ''), ('child', 'NN', [Synset('juvenile.n.01')], [Synset('bairn.n.01'), Synset('buster.n.02'), Synset('changeling.n.02'), Synset('child_prodigy.n.01'), Synset('foster-child.n.01'), Synset('imp.n.02'), Synset('kiddy.n.01'), Synset('orphan.n.01'), Synset('peanut.n.03'), Synset('pickaninny.n.01'), Synset('poster_child.n.01'), Synset('preschooler.n.01'), Synset('silly.n.01'), Synset('sprog.n.02'), Synset('toddler.n.01'), Synset('urchin.n.01'), Synset('waif.n.01')], [], []), ('wa', 'NN', [], [], [], []), ('carefully', 'RB', [], [], [], []), ('wrapped', 'VBD', [Synset('cover.v.01')], [Synset('cere.v.01'), Synset('do_up.v.01'), Synset('gift-wrap.v.01'), Synset('parcel.v.03'), Synset('shrinkwrap.v.01'), Synset('shroud.v.03')], [], []), ('and', 'CC', ''), ('bound', 'VBD', [Synset('line.n.04')], [Synset('brink.n.01'), Synset('lower_bound.n.01'), Synset('margin.n.01'), Synset('periphery.n.01'), Synset('rim.n.

e1 ner tags: []
e1 tag: []
e2 ner tags: []
e2 tag: []
WordNet Extractions: [('The', 'DT', ''), ('author', 'NN', [Synset('communicator.n.01')], [Synset('abstractor.n.01'), Synset('alliterator.n.01'), Synset('authoress.n.01'), Synset('biographer.n.01'), Synset('coauthor.n.01'), Synset('commentator.n.02'), Synset('compiler.n.01'), Synset('contributor.n.02'), Synset('cyberpunk.n.02'), Synset('drafter.n.01'), Synset('dramatist.n.01'), Synset('essayist.n.01'), Synset('folk_writer.n.01'), Synset('framer.n.02'), Synset('gagman.n.02'), Synset('ghostwriter.n.01'), Synset('gothic_romancer.n.01'), Synset('hack.n.03'), Synset('journalist.n.01'), Synset('librettist.n.01'), Synset('lyricist.n.01'), Synset('novelist.n.01'), Synset('pamphleteer.n.01'), Synset('paragrapher.n.01'), Synset('poet.n.01'), Synset('polemicist.n.01'), Synset('rhymer.n.01'), Synset('scenarist.n.01'), Synset('scriptwriter.n.01'), Synset('space_writer.n.01'), Synset('speechwriter.n.01'), Synset('tragedian.n.01'), Synset('word-pai

e1 ner tags: []
e1 tag: []
e2 ner tags: []
e2 tag: []
WordNet Extractions: [('A', 'DT', ''), ('misty', 'JJ', [], [], [], []), ('ridge', 'NN', [Synset('natural_elevation.n.01')], [Synset('bank.n.03'), Synset('bar.n.08'), Synset('dune.n.01'), Synset('esker.n.01'), Synset('ledge.n.01'), Synset('reef.n.01'), Synset('ripple_mark.n.01')], [], []), ('uprises', 'NNS', [Synset('become.v.03')], [Synset('come.v.05'), Synset('come_forth.v.02'), Synset('head.v.06'), Synset('resurge.v.01'), Synset('well_up.v.01')], [], []), ('from', 'IN', ''), ('the', 'DT', ''), ('surge', 'NN', [Synset('flow.n.01')], [Synset('debris_surge.n.01'), Synset('onrush.n.02')], [], []), ('.', '.', '')]
dep_tree: (2, ['uprises', 'from'])
ner_tags ([], [], [], [])
wordNet_extractions: [('A', 'DT', ''), ('misty', 'JJ', [], [], [], []), ('ridge', 'NN', [Synset('natural_elevation.n.01')], [Synset('bank.n.03'), Synset('bar.n.08'), Synset('dune.n.01'), Synset('esker.n.01'), Synset('ledge.n.01'), Synset('reef.n.01'), Synset('ripple

In [None]:
#@title Run DataProcessor
train_file = open("semeval_train.txt")
train_file_lines = train_file.readlines()
train_file.close()
data, pos_counts, lem_counts, bw_pos_counts, pos_bigram, lem_bigram, pos_trigram, dep_counts = DataProcessor(train_file_lines)

In [None]:
#@title Save and download training data pkl files
"""
#Only uncomment if saving is needed
# Save all objects
save_obj(data, 'data')
save_obj(pos_counts, 'pos_counts')
save_obj(lem_counts, 'lem_counts')
save_obj(bw_pos_counts, 'bw_pos_counts')
save_obj(pos_bigram, 'pos_bigram')
save_obj(lem_bigram, 'lem_bigram')
save_obj(pos_trigram, 'pos_trigram')
save_obj(dep_counts, 'dep_counts')

files.download('data.pkl')
files.download('pos_counts.pkl')
files.download('lem_counts.pkl')
files.download('bw_pos_counts.pkl')
files.download('pos_bigram.pkl')
files.download('lem_bigram.pkl')
files.download('pos_trigram.pkl')
files.download('dep_counts.pkl')
"""

In [None]:
#@title Load training data from pkl files
# Load all objects
data = load_obj('data')
pos_counts = load_obj('pos_counts')
lem_counts = load_obj('lem_counts')
bw_pos_counts = load_obj('bw_pos_counts')
pos_bigram = load_obj('pos_bigram')
lem_bigram = load_obj('lem_bigram')
pos_trigram = load_obj('pos_trigram')

In [None]:
#@title Create Pandas DataFrame for Training
dataset = pd.DataFrame(data)
display(dataset)

Unnamed: 0,label,e1_ner_tag_len,e2_ner_tag_len,e1_ner_tag,e2_ner_tag,sentence_len,entity_distance,wordNet_holo_meronymy_relations,wordnet_entity_relations,path_len_bw_ent,path_bw_ents
0,"Component-Whole(e2,e1)",0,0,,,17,3,False,False,2.0,[of]
1,Other,0,0,,,15,8,False,False,2.0,[the]
2,"Instrument-Agency(e2,e1)",0,0,,,15,6,False,False,2.0,[uses]
3,Other,0,0,,,8,4,False,False,3.0,"[uprises, from]"
4,"Member-Collection(e1,e2)",0,0,,,20,1,False,False,1.0,[]
...,...,...,...,...,...,...,...,...,...,...,...
7995,Other,0,0,,,20,4,False,False,3.0,"[sent, by]"
7996,"Entity-Origin(e1,e2)",0,0,,,19,6,False,False,3.0,"[derived, from]"
7997,"Entity-Destination(e1,e2)",0,0,,,26,3,False,False,2.0,[in]
7998,Other,0,0,,,26,1,False,False,1.0,[]


In [None]:
#@title Parse Label Parsing

dataset = parse_label(dataset)  
display(dataset)

Unnamed: 0,label,rel,dir,e1_ner_tag_len,e2_ner_tag_len,e1_ner_tag,e2_ner_tag,sentence_len,entity_distance,wordNet_holo_meronymy_relations,wordnet_entity_relations,path_len_bw_ent,path_bw_ents
0,"Component-Whole(e2,e1)",Component-Whole,"(e2,e1)",0,0,,,17,3,False,False,2.0,[of]
1,Other,Other,Other,0,0,,,15,8,False,False,2.0,[the]
2,"Instrument-Agency(e2,e1)",Instrument-Agency,"(e2,e1)",0,0,,,15,6,False,False,2.0,[uses]
3,Other,Other,Other,0,0,,,8,4,False,False,3.0,"[uprises, from]"
4,"Member-Collection(e1,e2)",Member-Collection,"(e1,e2)",0,0,,,20,1,False,False,1.0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,Other,Other,Other,0,0,,,20,4,False,False,3.0,"[sent, by]"
7996,"Entity-Origin(e1,e2)",Entity-Origin,"(e1,e2)",0,0,,,19,6,False,False,3.0,"[derived, from]"
7997,"Entity-Destination(e1,e2)",Entity-Destination,"(e1,e2)",0,0,,,26,3,False,False,2.0,[in]
7998,Other,Other,Other,0,0,,,26,1,False,False,1.0,[]


In [None]:
#@title Convert string columns to numeric and fill nans
le_label = preprocessing.LabelEncoder()
le_rel = preprocessing.LabelEncoder()
le_dir = preprocessing.LabelEncoder()
le_e1 = preprocessing.LabelEncoder()
le_e2 = preprocessing.LabelEncoder()
lb_wordNet_holo_meronymy_relations = preprocessing.LabelBinarizer()
lb_wordnet_entity_relations = preprocessing.LabelBinarizer()
dataset['label'] = le_label.fit_transform(dataset['label'])
dataset['rel'] = le_rel.fit_transform(dataset['rel'])
dataset['dir'] = le_dir.fit_transform(dataset['dir'])
dataset['e1_ner_tag'] = dataset['e1_ner_tag'].fillna("NAN")
dataset['e1_ner_tag'] = le_e1.fit_transform(dataset['e1_ner_tag'])
dataset['e2_ner_tag'] = dataset['e2_ner_tag'].fillna("NAN")
dataset['e2_ner_tag'] = le_e2.fit_transform(dataset['e2_ner_tag'])
dataset['wordNet_holo_meronymy_relations'] = lb_wordNet_holo_meronymy_relations.fit_transform(dataset['wordNet_holo_meronymy_relations'])
dataset['wordnet_entity_relations'] = lb_wordnet_entity_relations.fit_transform(dataset['wordnet_entity_relations'])


In [None]:
#@title Saving encoders
save_obj(le_label, 'le_label')
save_obj(le_rel, 'le_rel')
save_obj(le_dir, 'le_dir')
save_obj(le_e1, 'le_e1')
save_obj(le_e2, 'le_e2')
save_obj(lb_wordNet_holo_meronymy_relations, 'lb_wordNet_holo_meronymy_relations')
save_obj(lb_wordnet_entity_relations, 'lb_wordnet_entity_relations')

In [None]:
#@title Convert to numpy array
# Set correct length of columns
corr_len = len(data['label'])

# Convert pandas dataframe to numpy array
dataset = dataset.to_numpy()

display(dataset)
print(dataset.shape)

array([[ 3,  1,  1,  2, 17,  3],
       [16,  8,  2,  2, 15,  8],
       [11,  5,  1,  2, 15,  6],
       ...,
       [ 6,  3,  0,  2, 26,  3],
       [16,  8,  2,  2, 26,  1],
       [18,  9,  1,  2, 20,  4]])

(8000, 6)


In [None]:
#@title Remove useless features - 8 columns left
# label, rel, dir, e2_ner_tag	sentence_len	entity_distance
dataset, columns = feature_selection(dataset)
all_columns = ['label', 'rel', 'dir', 'e1_ner_tag','e2_ner_tag', 'sentence_len', 'entity_distance', 'path_len_bw_ent']
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)

array([[ 3,  1,  1,  2, 17,  3],
       [16,  8,  2,  2, 15,  8],
       [11,  5,  1,  2, 15,  6],
       ...,
       [ 6,  3,  0,  2, 26,  3],
       [16,  8,  2,  2, 26,  1],
       [18,  9,  1,  2, 20,  4]])

(8000, 6)
[0 1 2 3 4 5]


In [None]:
#@title Dependency Counts
start_time = time.time()
data = toColumn(dep_counts)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

In [None]:
#@title POS counts - 18 columns
#   Time: 0.11 seconds
start_time = time.time()
data = toColumn(pos_counts)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

array([[ 3,  1,  1, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 6,  3,  0, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0]])

(8000, 24)
[ 0  1  2  3  4  5  6  7  8  9 10 12 14 15 16 17 18 20 21 22 25 26 27 28]
0.08659195899963379 seconds


In [None]:
#@title Lemmatized token counts - 12 columns
#   Time: 8772.74 seconds -> 2.5 hours
start_time = time.time()
data = toColumn(lem_counts)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

array([[ 3,  1,  1, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 6,  3,  0, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0]])

(8000, 36)
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  26  33  37  42  45  48  50  57  67  71 188]
3533.60116147995 seconds


In [None]:
#@title Between entity POS counts - 6 columns
#   Time: 0.1 seconds
start_time = time.time()
data = toColumn(bw_pos_counts)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

array([[ 3,  1,  1, ...,  0,  0,  0],
       [16,  8,  2, ...,  2,  1,  0],
       [11,  5,  1, ...,  1,  2,  0],
       ...,
       [ 6,  3,  0, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0]])

(8000, 42)
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 40 42 50]
2.2414872646331787 seconds


In [None]:
#@title POS bigram counts - 19 columns
#   Time: 28.78 seconds
start_time = time.time()
data = toColumn(pos_bigram)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

array([[ 3,  1,  1, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 6,  3,  0, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0]])

(8000, 61)
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  45  46  50  51  52  53  54  55  56  64  68
  71  73  92  94 119 127 129]
12.818054437637329 seconds


In [None]:
#@title Lemmatized token bigram counts - Not done
start_time = time.time()
data = toColumn(lem_bigram)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

In [None]:
#@title POS trigram counts - 11 columns
#   Time: 1199.95 seconds -> 20 minutes
start_time = time.time()
data = toColumn(pos_trigram)
dataset, columns = addKeys(dataset, data)
all_columns += list(np.array(list(data))[columns[col_len:] - col_len])
col_len = len(all_columns)
display(dataset)
print(dataset.shape)
print(columns)
print(time.time()-start_time, "seconds")

array([[ 3,  1,  1, ...,  0,  0,  0],
       [16,  8,  2, ...,  1,  0,  0],
       [11,  5,  1, ...,  1,  1,  0],
       ...,
       [ 6,  3,  0, ...,  0,  0,  0],
       [16,  8,  2, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0]])

(8000, 72)
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  69  70  71  72  74  76  83  84  85  95 137]
489.0403461456299 seconds


In [None]:
#@title Save Dataset and Columns
save_obj(all_columns, 'all_columns')
save_obj(dataset, 'dataset')
files.download('all_columns.pkl')
files.download('dataset.pkl')

In [None]:
#@title Clean Columns
all_columns = load_obj('all_columns')
dataset = load_obj('dataset')

print(all_columns)

cleaned_col_strings = []
for col_name in all_columns:
  new_col = col_name.replace(",","#")
  new_col = new_col.replace(".","*")
  cleaned_col_strings.append(new_col)

['label', 'rel', 'dir', 'e2_ner_tag', 'sentence_len', 'entity_distance', 'pos_DT', 'pos_NN', 'pos_VBN', 'pos_IN', 'pos_PRP', 'pos_JJ', 'pos_RB', 'pos_VBD', 'pos_CC', 'pos_TO', 'pos_VB', 'pos_VBZ', 'pos_NNP', 'pos_VBG', 'pos_VBP', 'pos_CD', 'pos_PRP$', 'pos_,', 'lem_The', 'lem_a', 'lem_in', 'lem_of', 'lem_wa', 'lem_and', 'lem_the', 'lem_by', 'lem_to', 'lem_from', 'lem_is', 'lem_,', 'bw_pos_IN', 'bw_pos_JJ', 'bw_pos_NN', 'bw_pos_VBD', 'bw_pos_DT', 'bw_pos_VBN', 'bigram_pos_DT_NN', 'bigram_pos_VBN_IN', 'bigram_pos_IN_NN', 'bigram_pos_NN_IN', 'bigram_pos_IN_DT', 'bigram_pos_DT_JJ', 'bigram_pos_JJ_NN', 'bigram_pos_IN_JJ', 'bigram_pos_NN_.', 'bigram_pos_NN_NN', 'bigram_pos_TO_VB', 'bigram_pos_NN_VBZ', 'bigram_pos_NNP_NNP', 'bigram_pos_IN_NNP', 'bigram_pos_NN_CC', 'bigram_pos_NN_VBD', 'bigram_pos_NN_,', 'bigram_pos_CC_NN', 'bigram_pos_VBD_DT', 'trigram_pos_NN_IN_DT', 'trigram_pos_IN_DT_JJ', 'trigram_pos_DT_JJ_NN', 'trigram_pos_JJ_NN_IN', 'trigram_pos_IN_JJ_NN', 'trigram_pos_DT_NN_NN', 'trigra

In [None]:
#@title Download csv
col_names_string = ",".join(cleaned_col_strings)
np.savetxt('dataset.csv', dataset, delimiter=",", header=col_names_string)
files.download('dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#@title Load csv
dataset = "dataset.csv"
df = pd.read_csv(dataset, header='infer')
display(df)

Unnamed: 0,# label,rel,dir,e2_ner_tag,sentence_len,entity_distance,pos_DT,pos_NN,pos_VBN,pos_IN,pos_PRP,pos_JJ,pos_RB,pos_VBD,pos_CC,pos_TO,pos_VB,pos_VBZ,pos_NNP,pos_VBG,pos_VBP,pos_CD,pos_PRP$,pos_#,lem_The,lem_a,lem_in,lem_of,lem_wa,lem_and,lem_the,lem_by,lem_to,lem_from,lem_is,lem_#,bw_pos_IN,bw_pos_JJ,bw_pos_NN,bw_pos_VBD,bw_pos_DT,bw_pos_VBN,bigram_pos_DT_NN,bigram_pos_VBN_IN,bigram_pos_IN_NN,bigram_pos_NN_IN,bigram_pos_IN_DT,bigram_pos_DT_JJ,bigram_pos_JJ_NN,bigram_pos_IN_JJ,bigram_pos_NN_#,bigram_pos_NN_NN,bigram_pos_TO_VB,bigram_pos_NN_VBZ,bigram_pos_NNP_NNP,bigram_pos_IN_NNP,bigram_pos_NN_CC,bigram_pos_NN_VBD,bigram_pos_NN_#.1,bigram_pos_CC_NN,bigram_pos_VBD_DT,trigram_pos_NN_IN_DT,trigram_pos_IN_DT_JJ,trigram_pos_DT_JJ_NN,trigram_pos_JJ_NN_IN,trigram_pos_IN_JJ_NN,trigram_pos_DT_NN_NN,trigram_pos_IN_DT_NN,trigram_pos_DT_NN_IN,trigram_pos_NN_IN_NN,trigram_pos_JJ_NN_NN,trigram_pos_VBN_IN_DT
0,3.0,1.0,1.0,2.0,17.0,3.0,3.0,5.0,1.0,3.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,16.0,8.0,2.0,2.0,15.0,8.0,3.0,5.0,1.0,3.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,0.0,3.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0
2,11.0,5.0,1.0,2.0,15.0,6.0,4.0,5.0,1.0,2.0,3.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,16.0,8.0,2.0,2.0,8.0,4.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,3.0,2.0,1.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0
4,12.0,6.0,0.0,2.0,20.0,1.0,4.0,5.0,1.0,4.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,3.0,2.0,1.0,1.0,2.0,0.0,2.0,1.0,1.0,1.0,2.0,5.0,5.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,1.0,4.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,16.0,8.0,2.0,2.0,20.0,4.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7996,8.0,4.0,0.0,2.0,19.0,6.0,0.0,11.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7997,6.0,3.0,0.0,2.0,26.0,3.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7998,16.0,8.0,2.0,2.0,26.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**CREATE COMBINED TRAIN DATAFRAME**
---



In [None]:
#@title Get Sentence List
list_of_sentences = get_sentence_list('semeval_train.txt')

In [None]:
#@title Create Unigram Vectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(list_of_sentences).toarray()

In [None]:
#@title Unigram Feature Selection
X, u_columns = feature_selection(X, t = 0.001)

In [None]:
#@title Create Bigram Vectorizer
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b', min_df=1)
X2 = bigram_vectorizer.fit_transform(list_of_sentences).toarray()

In [None]:
#@title Bigram Feature Selection (1)

# Split feature selection into increments (to prevent out of RAM errors)
results1 = feature_selection(X2[:, :20000], t=0.0003)

In [None]:
#@title Bigram Feature Selection (2)
results2 = feature_selection(X2[:, 20000:40000], t=0.0003)

In [None]:
#@title Bigram Feature Selection (3)
results3 = feature_selection(X2[:, 40000:60000], t=0.0003)

In [None]:
#@title Bigram Feature Selection (4)
results4 = feature_selection(X2[:, 60000:80000], t=0.0003)

In [None]:
#@title Bigram Feature Selection (5)
results5 = feature_selection(X2[:, 80000:], t=0.0003)

In [None]:
#@title Bigram Feature Selection (6)

# Merge all feature selection results
X2 = np.hstack((results1[0], results2[0], results3[0], results4[0], results5[0]))

In [None]:
#@title Merge Unigram and Bigram Counts Together
X3 = np.hstack((X, X2))

In [None]:
#@title Get All Unigram Bigram Column Names
ub_all_columns = list(u_columns) + list(results1[1]) + list(results2[1]) + list(results3[1]) + list(results4[1]) + list(results5[1])

In [None]:
#@title Create Combined DataFrame
train_dataset = pd.read_csv('dataset.csv')
train_X = train_dataset.to_numpy()
combined = np.hstack((train_X, X3))

# To DataFrame
combined_df = pd.DataFrame(combined, columns=list(train_dataset.columns) + ub_all_columns)

In [None]:
#@title Save csv
combined_df.to_csv('combined_train.csv') 
files.download('combined_train.csv')

**CREATE ORIGINAL TEST DATAFRAME**
---



In [None]:
#@title Loop through test examples
# Hard-coded test file
test_file = "semeval_test.txt"

# Get test file lines
test = open(test_file)
test_lines = test.readlines()
test.close()

# Loop through examples in corpus
i = 1
for sentence, e1, e2, relation, e1_loc, e2_loc in CorpusReader(test_lines):
    print()
    print(i)
    print("sentence:", sentence)
    dep_tree, ner_tags, wordNet_extractions, wordNet_holo_meronymy_relations, wordnet_entity_relations, pipeline_out, e_tags, wordnet_similarity = NLPPipeline(sentence, e1, e2)
    print("dep_tree:", dep_tree)
    print("ner_tags", ner_tags)
    print("wordNet_extractions:", wordNet_extractions)
    print("wordNet_holo_meronymy_relations:", wordNet_holo_meronymy_relations)
    print("wordnet_entity_relations:", wordnet_entity_relations)
    print("wordnet_similarity:", wordnet_similarity)
    print("pipeline_out:", pipeline_out)
    print("e_tags", e_tags)
    print("e1:", e1)
    print("e2:", e2)
    print("e1_loc:", e1_loc)
    print("e2_loc:", e2_loc)
    print("relation:", relation)
    if i==4:  break
    i += 1

In [None]:
#@title Run DataProcessor
test_file = open("semeval_test.txt")
test_file_lines = test_file.readlines()
test_file.close()
test_data, test_pos_counts, test_lem_counts, test_bw_pos_counts, test_pos_bigram, test_lem_bigram, test_pos_trigram, test_dep_counts = DataProcessor(test_file_lines)

100 	: 1.1318065087000528 minutes
200 	: 2.220451323191325 minutes
300 	: 3.309110422929128 minutes
400 	: 4.395928374926249 minutes
500 	: 5.488763578732809 minutes
600 	: 6.597280609607696 minutes
700 	: 7.713918081919352 minutes
800 	: 8.805193734169006 minutes
900 	: 9.8951860109965 minutes
1000 	: 10.99754974047343 minutes
1100 	: 12.104061901569366 minutes
1200 	: 13.209910742441814 minutes
1300 	: 14.317969167232514 minutes
1400 	: 15.53614310423533 minutes
1500 	: 16.87615841627121 minutes
1600 	: 18.16016369263331 minutes
1700 	: 19.44098145167033 minutes
1800 	: 20.732945545514426 minutes
1900 	: 21.850903578599294 minutes
2000 	: 22.95852600336075 minutes
2100 	: 24.062557860215506 minutes
2200 	: 25.193063922723134 minutes
2300 	: 26.302772402763367 minutes
2400 	: 27.471490816275278 minutes
2500 	: 28.59602326552073 minutes
2600 	: 29.732338154315947 minutes
2700 	: 30.846593364079794 minutes
Total time: 31.05245924393336 minutes


In [None]:
#@title Save all objects
"""
Only uncomment if saving is needed
Save all objects
save_obj(test_data, 'test_data')
save_obj(test_pos_counts, 'test_pos_counts')
save_obj(test_lem_counts, 'test_lem_counts')
save_obj(test_bw_pos_counts, 'test_bw_pos_counts')
save_obj(test_pos_bigram, 'test_pos_bigram')
save_obj(test_lem_bigram, 'test_lem_bigram')
save_obj(test_pos_trigram, 'test_pos_trigram')
save_obj(test_pos_trigram, 'test_pos_trigram')
save_obj(test_dep_counts, 'test_dep_counts')

from google.colab import files
files.download('test_data.pkl')
files.download('test_pos_counts.pkl')
files.download('test_lem_counts.pkl')
files.download('test_bw_pos_counts.pkl')
files.download('test_pos_bigram.pkl')
files.download('test_lem_bigram.pkl')
files.download('test_pos_trigram.pkl')
save_obj(pos_trigram, 'test_pos_trigram')
save_obj(dep_counts, 'test_dep_counts')
"""

"\nOnly uncomment if saving is needed\nSave all objects\nsave_obj(data, 'test_data')\nsave_obj(pos_counts, 'test_pos_counts')\nsave_obj(lem_counts, 'test_lem_counts')\nsave_obj(bw_pos_counts, 'test_bw_pos_counts')\nsave_obj(pos_bigram, 'test_pos_bigram')\nsave_obj(lem_bigram, 'test_lem_bigram')\nsave_obj(pos_trigram, 'test_pos_trigram')\n\nfrom google.colab import files\nfiles.download('test_data.pkl')\nfiles.download('test_pos_counts.pkl')\nfiles.download('test_lem_counts.pkl')\nfiles.download('test_bw_pos_counts.pkl')\nfiles.download('test_pos_bigram.pkl')\nfiles.download('test_lem_bigram.pkl')\nfiles.download('test_pos_trigram.pkl')\n"

In [None]:
#@title Load all objects
test_data = load_obj('test_data')
test_pos_counts = load_obj('test_pos_counts')
test_lem_counts = load_obj('test_lem_counts')
test_bw_pos_counts = load_obj('test_bw_pos_counts')
test_pos_bigram = load_obj('test_pos_bigram')
test_lem_bigram = load_obj('test_lem_bigram')
test_pos_trigram = load_obj('test_pos_trigram')
test_dep_counts = load_obj('test_dep_counts')

In [None]:
#@title Create Pandas DataFrame for Test
test_dataset = pd.DataFrame(test_data)
display(test_dataset)

Unnamed: 0,label,e1_ner_tag_len,e2_ner_tag_len,e1_ner_tag,e2_ner_tag,sentence_len,entity_distance,wordNet_holo_meronymy_relations,wordnet_entity_relations
0,"Message-Topic(e1,e2)",0,0,,,9,3,False,False
1,"Product-Producer(e2,e1)",0,0,,,6,3,False,False
2,"Instrument-Agency(e2,e1)",0,0,,,10,6,False,False
3,"Entity-Destination(e1,e2)",0,0,,,11,4,False,False
4,"Cause-Effect(e2,e1)",1,0,ORG,,18,15,False,False
...,...,...,...,...,...,...,...,...,...
2712,"Instrument-Agency(e2,e1)",0,0,,,20,7,False,False
2713,"Product-Producer(e1,e2)",0,0,,,17,4,False,False
2714,"Component-Whole(e2,e1)",0,0,,,21,1,False,False
2715,"Product-Producer(e1,e2)",0,0,,,19,6,False,False


In [None]:
#@title Parse labels
test_dataset = parse_label(test_dataset)
display(test_dataset)

Unnamed: 0,label,rel,dir,e1_ner_tag_len,e2_ner_tag_len,e1_ner_tag,e2_ner_tag,sentence_len,entity_distance,wordNet_holo_meronymy_relations,wordnet_entity_relations
0,"Message-Topic(e1,e2)",Message-Topic,"(e1,e2)",0,0,,,9,3,False,False
1,"Product-Producer(e2,e1)",Product-Producer,"(e2,e1)",0,0,,,6,3,False,False
2,"Instrument-Agency(e2,e1)",Instrument-Agency,"(e2,e1)",0,0,,,10,6,False,False
3,"Entity-Destination(e1,e2)",Entity-Destination,"(e1,e2)",0,0,,,11,4,False,False
4,"Cause-Effect(e2,e1)",Cause-Effect,"(e2,e1)",1,0,ORG,,18,15,False,False
...,...,...,...,...,...,...,...,...,...,...,...
2712,"Instrument-Agency(e2,e1)",Instrument-Agency,"(e2,e1)",0,0,,,20,7,False,False
2713,"Product-Producer(e1,e2)",Product-Producer,"(e1,e2)",0,0,,,17,4,False,False
2714,"Component-Whole(e2,e1)",Component-Whole,"(e2,e1)",0,0,,,21,1,False,False
2715,"Product-Producer(e1,e2)",Product-Producer,"(e1,e2)",0,0,,,19,6,False,False


In [None]:
#@title Clean DataFrame
test_dataset['label'] = le_label.fit_transform(test_dataset['label'])
test_dataset['rel'] = le_rel.fit_transform(test_dataset['rel'])
test_dataset['dir'] = le_dir.fit_transform(test_dataset['dir'])
test_dataset['e1_ner_tag'] = test_dataset['e1_ner_tag'].fillna("NAN")
test_dataset['e1_ner_tag'] = le_e1.fit_transform(test_dataset['e1_ner_tag'])
test_dataset['e2_ner_tag'] = test_dataset['e2_ner_tag'].fillna("NAN")
test_dataset['e2_ner_tag'] = le_e2.fit_transform(test_dataset['e2_ner_tag'])
test_dataset['wordNet_holo_meronymy_relations'] = lb_wordNet_holo_meronymy_relations.fit_transform(test_dataset['wordNet_holo_meronymy_relations'])
test_dataset['wordnet_entity_relations'] = lb_wordnet_entity_relations.fit_transform(test_dataset['wordnet_entity_relations'])
display(test_dataset)

Unnamed: 0,label,rel,dir,e1_ner_tag_len,e2_ner_tag_len,e1_ner_tag,e2_ner_tag,sentence_len,entity_distance,wordNet_holo_meronymy_relations,wordnet_entity_relations
0,14,7,0,0,0,2,1,9,3,0,0
1,18,9,1,0,0,2,1,6,3,0,0
2,11,5,1,0,0,2,1,10,6,0,0
3,6,3,0,0,0,2,1,11,4,0,0
4,1,0,1,1,0,4,1,18,15,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2712,11,5,1,0,0,2,1,20,7,0,0
2713,17,9,0,0,0,2,1,17,4,0,0
2714,3,1,1,0,0,2,1,21,1,0,0
2715,17,9,0,0,0,2,1,19,6,0,0


In [None]:
#@title Convert to numpy array
# Set correct length of columns
corr_len = len(test_data['label'])
all_test_columns = test_dataset.columns.tolist()
print(all_test_columns)
# Convert pandas dataframe to numpy array
test_dataset = test_dataset.to_numpy()

display(test_dataset)
print(test_dataset.shape)

['label', 'rel', 'dir', 'e1_ner_tag_len', 'e2_ner_tag_len', 'e1_ner_tag', 'e2_ner_tag', 'sentence_len', 'entity_distance', 'wordNet_holo_meronymy_relations', 'wordnet_entity_relations']


array([[14,  7,  0, ...,  3,  0,  0],
       [18,  9,  1, ...,  3,  0,  0],
       [11,  5,  1, ...,  6,  0,  0],
       ...,
       [ 3,  1,  1, ...,  1,  0,  0],
       [17,  9,  0, ...,  6,  0,  0],
       [ 7,  3,  1, ...,  3,  0,  0]])

(2717, 11)


In [None]:
#@title Dependency Count Columns
start_time = time.time()
test_data = toColumn(dep_counts)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")
print(all_test_columns)

In [None]:
#@title POS Unigram Count Columns
#   Time: 0.035 seconds
start_time = time.time()
test_data = toColumn(test_pos_counts)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")
print(all_test_columns)

array([[14,  7,  0, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 3,  1,  1, ...,  0,  0,  0],
       [17,  9,  0, ...,  0,  0,  0],
       [ 7,  3,  1, ...,  0,  0,  0]])

(2717, 54)
0.03530716896057129 seconds
['label', 'rel', 'dir', 'e1_ner_tag_len', 'e2_ner_tag_len', 'e1_ner_tag', 'e2_ner_tag', 'sentence_len', 'entity_distance', 'wordNet_holo_meronymy_relations', 'wordnet_entity_relations', 'pos_DT', 'pos_RBS', 'pos_JJ', 'pos_NN', 'pos_VBD', 'pos_IN', 'pos_CC', 'pos_.', 'pos_VBZ', 'pos_VBP', 'pos_VBN', 'pos_NNP', 'pos_RB', 'pos_JJR', 'pos_:', 'pos_VBG', 'pos_CD', 'pos_PRP$', 'pos_VB', 'pos_,', 'pos_TO', 'pos_WP', 'pos_POS', 'pos_PRP', 'pos_(', 'pos_)', 'pos_WDT', 'pos_RP', 'pos_JJS', 'pos_EX', 'pos_WRB', 'pos_NNS', 'pos_PDT', 'pos_``', "pos_''", 'pos_MD', 'pos_RBR', 'pos_FW', 'pos_NNPS', 'pos_WP$', 'pos_$', 'pos_UH', 'pos_SYM']


In [None]:
#@title Lemmatized Token Counts Columns
# Time: 382.745 seconds
start_time = time.time()
test_data = toColumn(test_lem_counts)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")

array([[14,  7,  0, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 3,  1,  1, ...,  0,  0,  0],
       [17,  9,  0, ...,  1,  0,  0],
       [ 7,  3,  1, ...,  0,  1,  1]])

(2717, 10410)
379.39559602737427 seconds


In [None]:
#@title Between Entity POS Counts Columns
#   Time: 2.701 seconds
start_time = time.time()
test_data = toColumn(test_bw_pos_counts)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")

array([[14,  7,  0, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 3,  1,  1, ...,  0,  0,  0],
       [17,  9,  0, ...,  0,  0,  0],
       [ 7,  3,  1, ...,  0,  0,  0]])

(2717, 10448)
3.2069780826568604 seconds


In [None]:
#@title POS Bigram Counts Columns
#   Time: 50.87088 seconds
start_time = time.time()
test_data = toColumn(test_pos_bigram)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")

array([[14,  7,  0, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 3,  1,  1, ...,  0,  0,  0],
       [17,  9,  0, ...,  0,  0,  1],
       [ 7,  3,  1, ...,  0,  0,  0]])

(2717, 11268)
50.30026984214783 seconds


In [None]:
#@title Lemmatized Token Bigram Counts Columns - Not done
start_time = time.time()
test_data = toColumn(test_lem_bigram)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")

In [None]:
#@title POS Trigram Counts Columns
#   Time: 320.5216 seconds
start_time = time.time()
test_data = toColumn(test_pos_trigram)
test_dataset, cols = addAllKeys(test_dataset, test_data)
all_test_columns += cols
display(test_dataset)
print(test_dataset.shape)
print(time.time()-start_time, "seconds")

array([[14,  7,  0, ...,  0,  0,  0],
       [18,  9,  1, ...,  0,  0,  0],
       [11,  5,  1, ...,  0,  0,  0],
       ...,
       [ 3,  1,  1, ...,  0,  0,  0],
       [17,  9,  0, ...,  0,  1,  0],
       [ 7,  3,  1, ...,  0,  0,  1]])

(2717, 15699)
388.69193863868713 seconds


In [None]:
#@title Save Dataset and Columns
save_obj(test_dataset, 'test_dataset')
save_obj(all_test_columns, 'all_test_columns')
files.download('test_dataset.pkl')
files.download('all_test_columns.pkl')

In [None]:
#@title Load Dataset and Columns
test_dataset = load_obj('test_dataset')
all_test_columns= load_obj('all_test_columns')

In [None]:
#@title Download csv
cleaned_col_strings = []
for col_name in all_test_columns:
  new_col = col_name.replace(",","#")
  new_col = new_col.replace(".","#")
  cleaned_col_strings.append(new_col)

col_names_string = ",".join(cleaned_col_strings)
np.savetxt('test_dataset.csv', test_dataset, delimiter=",", header=col_names_string)
files.download('test_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#@title Load csv
test_dataset = "test_dataset.csv"
test_df = pd.read_csv(test_dataset, header='infer')
display(test_df)

Unnamed: 0,# label,rel,dir,e1_ner_tag_len,e2_ner_tag_len,e1_ner_tag,e2_ner_tag,sentence_len,entity_distance,wordNet_holo_meronymy_relations,wordnet_entity_relations,pos_DT,pos_RBS,pos_JJ,pos_NN,pos_VBD,pos_IN,pos_CC,pos_#,pos_VBZ,pos_VBP,pos_VBN,pos_NNP,pos_RB,pos_JJR,pos_:,pos_VBG,pos_CD,pos_PRP$,pos_VB,pos_#.1,pos_TO,pos_WP,pos_POS,pos_PRP,pos_(,pos_),pos_WDT,pos_RP,pos_JJS,...,trigram_pos_CC_VB_JJR,trigram_pos_IN_RB_VBZ,trigram_pos_JJ_NNS_PRP,trigram_pos_NNS_PRP_MD,trigram_pos_VB_IN_VBD,trigram_pos_IN_VBD_PRP$,trigram_pos_VBP_VBG_#.1,trigram_pos_NNP_POS_VBZ,trigram_pos_POS_VBZ_DT,trigram_pos_WRB_EX_VBZ,trigram_pos_JJ_WRB_VBD,trigram_pos_WRB_VBD_RB,trigram_pos_VBN_CC_NN,trigram_pos_DT_``_JJS,trigram_pos_VB_DT_JJR,trigram_pos_JJ_TO_NNP,trigram_pos_NN_#_VBP.1,trigram_pos_#_VBP_DT.1,trigram_pos_TO_JJ_#,trigram_pos_:_CD_VBP,trigram_pos_CD_VBP_CC,trigram_pos_VBP_CC_CD,trigram_pos_NN_MD_VBD,trigram_pos_MD_VBD_VBN,trigram_pos_IN_PRP$_VBN,trigram_pos_PRP_IN_PRP,trigram_pos_)_#_VBZ,trigram_pos_VBZ_VBN_WRB,trigram_pos_NN_VBD_WDT,trigram_pos_VBD_WDT_VBD,trigram_pos_JJR_NNS_VBP,trigram_pos_#_WP_MD,trigram_pos_WP_MD_VB,trigram_pos_RBR_IN_PRP$,trigram_pos_)_VBZ_#,trigram_pos_POS_NN_WRB,trigram_pos_#_WDT_PRP,trigram_pos_WDT_PRP_VBP,trigram_pos_NNS_:_DT,trigram_pos_POS_NN_PRP$
0,14.0,7.0,0.0,0.0,0.0,2.0,1.0,9.0,3.0,0.0,0.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18.0,9.0,1.0,0.0,0.0,2.0,1.0,6.0,3.0,0.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.0,5.0,1.0,0.0,0.0,2.0,1.0,10.0,6.0,0.0,0.0,3.0,1.0,2.0,4.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.0,3.0,0.0,0.0,0.0,2.0,1.0,11.0,4.0,0.0,0.0,3.0,1.0,2.0,3.0,1.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,1.0,0.0,4.0,1.0,18.0,15.0,0.0,0.0,2.0,1.0,1.0,7.0,1.0,3.0,1.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712,11.0,5.0,1.0,0.0,0.0,2.0,1.0,20.0,7.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2713,17.0,9.0,0.0,0.0,0.0,2.0,1.0,17.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2714,3.0,1.0,1.0,0.0,0.0,2.0,1.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2715,17.0,9.0,0.0,0.0,0.0,2.0,1.0,19.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
#@title Clean Columns
all_columns = load_obj('all_columns')

cleaned_col_strings = []
for col_name in all_columns:
  new_col = col_name.replace(",","#")
  new_col = new_col.replace(".","*")
  cleaned_col_strings.append(new_col)

cleaned_col_strings.pop(0) 
cleaned_col_strings.insert(0, "# label") 
test_df = test_df.reindex(columns = cleaned_col_strings)  

display(test_df)

Unnamed: 0,# label,rel,dir,e2_ner_tag,sentence_len,entity_distance,pos_DT,pos_NN,pos_VBN,pos_IN,pos_PRP,pos_JJ,pos_RB,pos_VBD,pos_CC,pos_TO,pos_VB,pos_VBZ,pos_NNP,pos_VBG,pos_VBP,pos_CD,pos_PRP$,pos_#,lem_The,lem_a,lem_in,lem_of,lem_wa,lem_and,lem_the,lem_by,lem_to,lem_from,lem_is,lem_#,bw_pos_IN,bw_pos_JJ,bw_pos_NN,bw_pos_VBD,bw_pos_DT,bw_pos_VBN,bigram_pos_DT_NN,bigram_pos_VBN_IN,bigram_pos_IN_NN,bigram_pos_NN_IN,bigram_pos_IN_DT,bigram_pos_DT_JJ,bigram_pos_JJ_NN,bigram_pos_IN_JJ,bigram_pos_NN_#,bigram_pos_NN_NN,bigram_pos_TO_VB,bigram_pos_NN_VBZ,bigram_pos_NNP_NNP,bigram_pos_IN_NNP,bigram_pos_NN_CC,bigram_pos_NN_VBD,bigram_pos_NN_#.1,bigram_pos_CC_NN,bigram_pos_VBD_DT,trigram_pos_NN_IN_DT,trigram_pos_IN_DT_JJ,trigram_pos_DT_JJ_NN,trigram_pos_JJ_NN_IN,trigram_pos_IN_JJ_NN,trigram_pos_DT_NN_NN,trigram_pos_IN_DT_NN,trigram_pos_DT_NN_IN,trigram_pos_NN_IN_NN,trigram_pos_JJ_NN_NN,trigram_pos_VBN_IN_DT
0,14.0,7.0,0.0,1.0,9.0,3.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18.0,9.0,1.0,1.0,6.0,3.0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11.0,5.0,1.0,1.0,10.0,6.0,3.0,4.0,0.0,1.0,0.0,2.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,3.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3,6.0,3.0,0.0,1.0,11.0,4.0,3.0,3.0,0.0,3.0,0.0,2.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,1.0,5.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,1.0,18.0,15.0,2.0,7.0,1.0,3.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712,11.0,5.0,1.0,1.0,20.0,7.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2713,17.0,9.0,0.0,1.0,17.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2714,3.0,1.0,1.0,1.0,21.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2715,17.0,9.0,0.0,1.0,19.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
#@title Download csv
col_names_string = ",".join(cleaned_col_strings)
np.savetxt('test_df.csv', test_df, delimiter=",", header=col_names_string)
files.download('test_df.csv')

**CREATE COMBINED TEST DATAFRAME**
---

In [None]:
#@title Get Sentence List
list_of_sentences = get_sentence_list('semeval_test.txt')

In [None]:
#@title Use Vectorizers
unigram_word_counts = vectorizer.transform(list_of_sentences).toarray()
bigram_word_counts = bigram_vectorizer.transform(list_of_sentences).toarray()

In [None]:
#@title Create DataFrames
num_orig_cols = 338
num_unigram_cols = 19115

unigram_df = pd.DataFrame(unigram_word_counts)
bigram_df = pd.DataFrame(bigram_word_counts)
unigram_df = unigram_df[all_columns[num_orig_cols:num_unigram_cols+num_orig_cols]]
bigram_df = bigram_df[all_columns[num_unigram_cols+num_orig_cols:]]

KeyError: ignored

In [None]:
#@title Combine unigram and bigram DataFrames
unigram_bigram_df = pd.concat((unigram_df, bigram_df), axis=1)

In [None]:
#@title Create Combined DataFrame
test_df = pd.read_csv('test_df.csv')
combined_test_df = pd.concat((test_df, unigram_bigram_df),axis=1)

In [None]:
#@title Save csv
combined_test_df.to_csv('combined_test_df.csv')

**CREATE MODELS**
---



In [None]:
#@title Model Training Functions

# Random Forest Model
def create_rf_model(X_train, y_train):
    print("\nCreating Random Forest...")
    start_time = time.time()
    rf_parameters = {'bootstrap': [True, False],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_leaf': [1, 2, 4],
                    'min_samples_split': [2, 5, 10],
                    'n_estimators': [100, 200, 400]
                    }
    rf = GridSearchCV(RandomForestClassifier(), rf_parameters)
    rf.fit(X_train, y_train)
    print("Created Random Forest in", time.time() - start_time, "seconds")
    print('Random Forest Best parameters: %s' % rf.best_params_)
    print('Random Forest Accuracy: %.2f' % rf.best_score_)
    return rf

# K-Nearest Neighbor Model
def create_knn_model(X_train, y_train):
    print("\nCreating KNN...")
    start_time = time.time()
    knn_parameters={'n_neighbors': [3,5,7], 
                    'weights':['uniform', 'distance']
                    }
    knn = GridSearchCV(KNeighborsClassifier(), knn_parameters)
    knn.fit(X_train, y_train)
    print("Created KNN in", time.time() - start_time, "seconds")
    print('KNN Best parameters: %s' % knn.best_params_)
    print('KNN Accuracy: %.2f' % knn.best_score_)
    return knn

# Gaussian Naive Bayes Model
def create_gnb_model(X_train, y_train):
    print("\nCreating Gaussian Naive Bayes...")
    start_time = time.time()
    gnb_parameters={'var_smoothing': [0.00000001, 0.000000001, 0.00000001]}
    gnb = GridSearchCV(GaussianNB(), gnb_parameters)
    gnb.fit(X_train, y_train)
    print("Created Gaussian Naive Bayes in", time.time() - start_time, "seconds")
    print('Gaussian Naive Bayes Best parameters: %s' % gnb.best_params_)
    print('Gaussian Naive Bayes Accuracy: %.2f' % gnb.best_score_)
    return gnb

# Ada Boost Model
def create_ada_model(X_train, y_train):
    print("\nCreating Ada Boost...")
    start_time = time.time()
    ada_parameters={
        'n_estimators': [50,100, 200, 400]
        }
    ada = GridSearchCV(AdaBoostClassifier(), ada_parameters)
    ada.fit(X_train, y_train)
    print("Created Ada Boost in", time.time() - start_time, "seconds")
    print('Ada Boost Best parameters: %s' % ada.best_params_)
    print('Ada Boost Accuracy: %.2f' % ada.best_score_)
    return ada

# SVC Model
def create_svc_model(X_train, y_train):
    print("\nCreating SVC...")
    start_time = time.time()
    svc_parameters={'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
    svc = GridSearchCV(SVC(), svc_parameters)
    svc.fit(X_train, y_train)
    print("Created SVC in", time.time() - start_time, "seconds")
    print('SVC Best parameters: %s' % svc.best_params_)
    print('SVC Accuracy: %.2f' % svc.best_score_)
    return svc

# Stack Classifier Model
def create_model(X_train, y_train, label):
    print("\n\nMaking models for", label)
    print("=======================================")
    rf = create_rf_model(X_train, y_train)
    knn = create_knn_model(X_train, y_train)
    gnb = create_gnb_model(X_train, y_train)
    print("\n\nMaking Stack Classifier model for", label)
    print("=======================================")
    start_time = time.time()
    estimators = [
                  ('rf',  rf),
                  ('knn', knn),
                  ('gnb', gnb)             
                  ]

    stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    stack.fit(X_train, y_train)
    print("Created Stack Classifier in", time.time() - start_time, "seconds")
    return [rf, knn, gnb, stack]

# Stack Classifier 2 Model
def create_model_2(X_train, y_train, label):
    print("\n\nMaking models for", label)
    print("=======================================")
    ada = create_ada_model(X_train, y_train)
    svc = create_svc_model(X_train, y_train)
    print("\n\nMaking Stack Classifier model for", label)
    print("=======================================")
    start_time = time.time()
    estimators = [
                  ('ada', ada),
                  ('svc', svc)           
                  ]

    stack = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
    stack.fit(X_train, y_train)
    print("Created Stack Classifier in", time.time() - start_time, "seconds")
    return [ada, svc, stack]   

In [None]:
#@title Load Dataframes
df = pd.read_csv("combined_train2.csv", header='infer', index_col=0)
test_df = pd.read_csv("combined_test_df2.csv", header='infer', index_col=0)

y_train_label = df.iloc[:,0]
y_train_rel = df.iloc[:,1]
y_train_dir = df.iloc[:,2]
# x_train = df.drop(['# label', 'rel', 'dir', 'bigram_pos_CC_NN','bigram_pos_VBD_DT'], axis=1)
X_train = df.drop(['# label', 'rel', 'dir'], axis=1)

y_test_label = test_df.iloc[:,0]
y_test_rel = test_df.iloc[:,1]
y_test_dir = test_df.iloc[:,2]
X_test = test_df.drop(['# # label', 'rel', 'dir'], axis=1)

#Temp split the training data to use for testing
# X_train, X_test, y_train_label, y_test_label = train_test_split(x_train, y_train_label, random_state=42)
# X_train, X_test, y_train_rel, y_test_rel = train_test_split(x_train, y_train_rel, random_state=42)
# X_train, X_test, y_train_dir, y_test_dir = train_test_split(x_train, y_train_dir, random_state=42)

In [None]:
#@title Model Training For Label
rf_label, knn_label, gnb_label, stack_label = create_model(X_train, y_train_label, "Label")
save_obj(stack_label, 'stack_label')
save_obj(rf_label, 'rf_label')
save_obj(knn_label, 'knn_label')
save_obj(gnb_label, 'gnb_label')

ada_label_ada_svc, svc_label_ada_svc, stack_label_ada_svc = create_model_2(X_train, y_train_label, "Label")
save_obj(stack_label_ada_svc, 'stack_label_ada_svc')
save_obj(ada_label_ada_svc, 'ada_label_ada_svc')
save_obj(svc_label_ada_svc, 'svc_label_ada_svc')

In [None]:
#@title Model Training For Relation
rf_rel, knn_rel, gnb_rel, stack_rel = create_model(X_train, y_train_rel, "Relation")
save_obj(stack_rel, 'stack_rel')
save_obj(rf_rel, 'rf_rel')
save_obj(knn_rel, 'knn_rel')
save_obj(gnb_rel, 'gnb_rel')

ada_rel_ada_svc, svc_rel_ada_svc, stack_rel_ada_svc = create_model_2(X_train, y_train_rel, "Relation")
save_obj(stack_rel_ada_svc, 'stack_rel_ada_svc')
save_obj(ada_rel_ada_svc, 'ada_rel_ada_svc')
save_obj(svc_rel_ada_svc, 'svc_rel_ada_svc')

In [None]:
#@title Model Training For Direction
rf_dir, knn_dir, gnb_dir, stack_dir = create_model(X_train, y_train_dir, "Direction")
save_obj(stack_dir, 'stack_dir')
save_obj(rf_dir, 'rf_dir')
save_obj(knn_dir, 'knn_dir')
save_obj(gnb_dir, 'gnb_dir')

ada_dir_ada_svc, svc_dir_ada_svc, stack_dir_ada_svc = create_model_2(X_train, y_train_dir, "Direction")
save_obj(stack_dir_ada_svc, 'stack_dir_ada_svc')
save_obj(ada_dir_ada_svc, 'ada_dir_ada_svc')
save_obj(svc_dir_ada_svc, 'svc_dir_ada_svc')



Making models for Direction

Creating Random Forest...
Created Random Forest in 2607.4676423072815 seconds
Random Forest Best parameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 0.31

Creating KNN...
Created KNN in 19.291343450546265 seconds
KNN Best parameters: {'n_neighbors': 5, 'weights': 'uniform'}
KNN Accuracy: 0.42

Creating Gaussian Naive Bayes...
Created Gaussian Naive Bayes in 0.2406620979309082 seconds
Gaussian Naive Bayes Best parameters: {'var_smoothing': 1e-08}
Gaussian Naive Bayes Accuracy: 0.30


Making Stack Classifier model for Direction


In [None]:
#@title Save Model
files.download('stack_label.pkl')
files.download('rf_label.pkl')
files.download('knn_label.pkl')
files.download('gnb_label.pkl')

files.download('stack_rel.pkl')
files.download('rf_rel.pkl')
files.download('knn_rel.pkl')
files.download('gnb_rel.pkl')

files.download('stack_dir.pkl')
files.download('rf_dir.pkl')
files.download('knn_dir.pkl')
files.download('gnb_dir.pkl')

files.download('stack_label_ada_svc.pkl')
files.download('ada_label_ada_svc.pkl')
files.download('svc_label_ada_svc.pkl')

files.download('stack_rel_ada_svc.pkl')
files.download('ada_rel_ada_svc.pkl')
files.download('svc_rel_ada_svc.pkl')

files.download('stack_dir_ada_svc.pkl')
files.download('ada_dir_ada_svc.pkl')
files.download('svc_dir_ada_svc.pkl')

**MODEL PREDICTIONS**
---



In [None]:
#@title Loading Models
rel_model = load_obj('stack_rel')
dir_model = load_obj('stack_dir')

In [None]:
#@title Load Dataframes
df = pd.read_csv("combined_train2.csv", header='infer', index_col=0)
test_df = pd.read_csv("combined_test_df2.csv", header='infer', index_col=0)

y_train_label = df.iloc[:,0]
y_train_rel = df.iloc[:,1]
y_train_dir = df.iloc[:,2]
X_train = df.drop(['# label', 'rel', 'dir'], axis=1)

y_test_label = test_df.iloc[:,0]
y_test_rel = test_df.iloc[:,1]
y_test_dir = test_df.iloc[:,2]
X_test = test_df.drop(['# # label', 'rel', 'dir'], axis=1)

In [None]:
#@title Model Predictions
#y_test_label, y_pred_label = model_predictions(X_test, y_test, stack_label, "Label")
y_test_rel, y_pred_rel = model_predictions(X_test, y_test_rel, rel_model, "Relation")
y_test_dir, y_pred_dir = model_predictions(X_test, y_test_dir, dir_model, "Direction")
combined_model_accuracy_test(y_pred_rel, y_pred_dir, y_test_label)



Making Predictions for Relation
Time to make predictions: 5.900059223175049 seconds
Accuracy: 0.5038645564961355
Macro Precision:  0.5523399374240769
Recall: 0.5034284641896806
F1 Score: 0.5205874141963094
FBeta Score: 0.5374817530989455



Making Predictions for Direction
Time to make predictions: 1.9193644523620605 seconds
Accuracy: 0.5726904674273096
Macro Precision:  0.39387102046868866
Recall: 0.4729322461845258
F1 Score: 0.4172238040274389
FBeta Score: 0.40053786136409436



Combined Relation and Direction Metrics
Accuracy: 0.38866396761133604
Macro Precision:  0.3609123854531386
Recall: 0.33053192727978403
F1 Score: 0.32826571484102457
FBeta Score: 0.3425819033253867



**TEST USER INPUT**
---



In [None]:
#@title Loading
test_df = pd.read_csv("combined_test_df2.csv", index_col=0)
rel_model = load_obj('stack_rel')
dir_model = load_obj('stack_dir')
vectorizer = load_obj('vectorizer')
bigram_vectorizer = load_obj('bigram_vectorizer')
le_label = load_obj('le_label')
le_rel = load_obj('le_rel')
le_dir = load_obj('le_dir')
le_e1 = load_obj('le_e1')
le_e2 = load_obj('le_e2')
lb_wordNet_holo_meronymy_relations = load_obj('lb_wordNet_holo_meronymy_relations')
lb_wordnet_entity_relations = load_obj('lb_wordnet_entity_relations')

In [None]:
#@title Initialization
num_orig_cols = 338
num_unigram_cols = 19115
all_columns = test_df.columns.tolist()

In [None]:
#@title Test User Input
while True:
  line = input("Enter the sentence that you would like to test or type exit: ")
  if line=="exit":
    break
  else:
    relation = input("Enter the relation: ")

    # Use bag of words vectorizers
    unigram_word_counts = vectorizer.transform([line]).toarray()
    bigram_word_counts = bigram_vectorizer.transform([line]).toarray()

    # Create DataFrames
    unigram_df = pd.DataFrame(unigram_word_counts)
    bigram_df = pd.DataFrame(bigram_word_counts)
    unigram_df = unigram_df[num_orig_cols:num_unigram_cols+num_orig_cols]
    bigram_df = bigram_df.reindex(columns=all_columns[num_unigram_cols+num_orig_cols:])

    # Combine unigram and bigram DataFrames
    unigram_bigram_df = pd.concat((unigram_df, bigram_df), axis=1)

    # Create DataFrame for original data
    line_data, line_pos_counts, line_lem_counts, line_bw_pos_counts, line_pos_bigram, line_lem_bigram, line_pos_trigram, line_dep_counts= DataProcessor([line, relation])
   
    # Create Pandas Dataframe for Line
    line_dataset = pd.DataFrame(line_data)
    # Parse labels
    line_dataset = parse_label(line_dataset)
    # Clean Dataframe
    line_dataset['label'] = le_label.transform(line_dataset['label'])
    line_dataset['rel'] = le_rel.transform(line_dataset['rel'])
    line_dataset['dir'] = le_dir.transform(line_dataset['dir'])
    line_dataset['e1_ner_tag'] = line_dataset['e1_ner_tag'].fillna("NAN")
    line_dataset['e1_ner_tag'] = le_e1.transform(line_dataset['e1_ner_tag'])
    line_dataset['e2_ner_tag'] = line_dataset['e2_ner_tag'].fillna("NAN")
    line_dataset['e2_ner_tag'] = le_e2.transform(line_dataset['e2_ner_tag'])
    line_dataset['wordNet_holo_meronymy_relations'] = lb_wordNet_holo_meronymy_relations.transform(line_dataset['wordNet_holo_meronymy_relations'])
    line_dataset['wordnet_entity_relations'] = lb_wordnet_entity_relations.transform(line_dataset['wordnet_entity_relations'])

    # Convert to numpy array
    corr_len = len(line_dataset['label'])
    all_line_columns = line_dataset.columns.tolist()
    line_dataset = line_dataset.to_numpy()

    # Dependency Count Columns
    line_data = toColumn(line_dep_counts)
    line_dataset, cols = addAllKeys(line_dataset, line_data)
    all_line_columns += cols

    # POS Unigram Count Columns
    line_data = toColumn(line_pos_counts)
    line_dataset, cols = addAllKeys(line_dataset, line_data)
    all_line_columns += cols

    #Lemmatized Token Counts Columns
    line_data = toColumn(line_lem_counts)
    line_dataset, cols = addAllKeys(line_dataset, line_data)
    all_line_columns += cols

    #Between Entity POS Counts Columns
    line_data = toColumn(line_bw_pos_counts)
    line_dataset, cols = addAllKeys(line_dataset, line_data)
    all_line_columns += cols

    #POS Bigram Counts Columns
    line_data = toColumn(line_pos_bigram)
    line_dataset, cols = addAllKeys(line_dataset, line_data)
    all_line_columns += cols

    #POS Trigram C
    line_data = toColumn(line_pos_trigram)
    line_dataset, cols = addAllKeys(line_dataset, line_data)
    all_line_columns += cols

    cleaned_col_strings = []
    for col_name in all_line_columns:
      new_col = col_name.replace(",","#")
      new_col = new_col.replace(".","*")
      cleaned_col_strings.append(new_col)

    col_names_string = ",".join(cleaned_col_strings)
    line_df = pd.DataFrame(data = line_dataset,  
                  columns = np.array(cleaned_col_strings))

    cleaned_col_strings = []
    for col_name in all_columns[:338]:
      new_col = col_name.replace(",","#")
      new_col = new_col.replace(".","*")
      cleaned_col_strings.append(new_col)

    cleaned_col_strings.pop(0) 
    cleaned_col_strings.insert(0, "# label") 
    line_df = line_df.reindex(columns = cleaned_col_strings)  

    # Combine columns
    X_pred = pd.concat((line_df, unigram_bigram_df), axis=1)
    X_pred = X_pred.fillna(0)

    # Make prediction
    y_test_rel, y_pred_rel = model_predictions(X_pred.iloc[:,3:], X_pred.iloc[:,1], rel_model, "Relation")
    y_test_dir, y_pred_dir = model_predictions(X_pred.iloc[:,3:], X_pred.iloc[:,2], dir_model, "Direction")

    combined_model_accuracy(y_pred_rel, y_pred_dir, relation)