### Some initial imports

In [1]:
from tqdm.auto import tqdm #For progress bars
import random
from IPython.display import clear_output #For clearing outputs of installs
import nltk
nltk.download('punkt')
from itertools import product
clear_output()

### Download files (if no files downloaded)

In [2]:
!pip install --upgrade --no-cache-dir gdown

! gdown 1NNdynIMSII9atlv52ZT2rYPwC-ZT1_SQ
! unzip /content/sentenceSimilarityModel.zip -d /content

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.6.6
    Uninstalling gdown-4.6.6:
      Successfully uninstalled gdown-4.6.6
Successfully installed gdown-4.7.1
Downloading...
From (uriginal): https://drive.google.com/uc?id=1NNdynIMSII9atlv52ZT2rYPwC-ZT1_SQ
From (redirected): https://drive.google.com/uc?id=1NNdynIMSII9atlv52ZT2rYPwC-ZT1_SQ&confirm=t&uuid=3651e3be-e2dd-4e08-b728-e9ba058c7b97
To: /content/sentenceSimilarityModel.zip
100% 306M/306M [00:09<00:00, 31.0MB/s]
Archive:  /content/sentenceSimilarityModel.zip
   creating: /content/sentenceSimilarityModel/
   creating: /content/sentenceSimilarityModel/1_Pooling/
  inflating: /content/sentenceSimilarityModel/1_Pooling/config.json  
  inflating: /content/sentenceSimilarityModel/config.json  
  inflating: /conte

In [3]:
# UD Converter import: clone the converter repository into the current working directory.

! git clone https://github.com/mille-s/UD_Converter.git
# Delete the notebook shipped with the repository to avoid confusion with this one
! rm '/content/UD_Converter/UD_Converter_release.ipynb'

import os
# Folder in which the converter looks for its CoNLL-U input files (see UD_Converter below)
input_folder = '/content/UD_Converter/Inputs'

# The folder is created here when it is not already present after cloning
if not os.path.exists(input_folder):
    os.makedirs(input_folder)

Cloning into 'UD_Converter'...
remote: Enumerating objects: 124, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 124 (delta 10), reused 1 (delta 1), pack-reused 104[K
Receiving objects: 100% (124/124), 7.78 MiB | 18.57 MiB/s, done.
Resolving deltas: 100% (51/51), done.


### Important paths

In [4]:
# Root working directory; the download cell above unzips the model archive below it.
path = '/content/'
# Folder of the sentence-similarity model, later loaded with SentenceTransformer(model_path).
model_path = path + 'sentenceSimilarityModel'

## Set up tools (+ functions to use them)

### DBpedia SPARQL (+ functions with queries)

In [5]:
!pip install sparqlwrapper
clear_output()
print('SPARQLWrapper installed!')

SPARQLWrapper installed!


In [6]:
from SPARQLWrapper import SPARQLWrapper, JSON

def sparqlQuery(entity):
    """Build the SPARQL query that lists the DBpedia ontology types of an entity.

    The entity name is appended to the ``dbr:`` prefix, so it must be a valid
    DBpedia resource local name (e.g. ``Alan_Bean``). It is inserted verbatim:
    no escaping is applied.

    :param entity: str, DBpedia resource name (the part after ``dbr:``)
    :return: str, the complete query text
    """
    # The insertion point is spelled out explicitly instead of being located
    # with a hard-coded character offset into the template, which would break
    # silently as soon as the template text is edited.
    return (
        '\n'
        'PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>\n'
        'PREFIX dbr: <http://dbpedia.org/resource/>\n'
        'PREFIX dbo: <http://dbpedia.org/ontology/>\n'
        '\n'
        'SELECT DISTINCT ?obj {\n'
        '    dbr:' + entity + ' rdf:type ?obj\n'
        '    FILTER strstarts(str(?obj), str(dbo:))\n'
        '}'
    )

def sparql_entityType(entity):
    """Look up the first DBpedia ontology type of an entity.

    Sends the query built by ``sparqlQuery`` to the public DBpedia endpoint and
    keeps the last path segment of the first returned type URI.

    :param entity: str, DBpedia resource name
    :return: str type name, or None when the lookup fails for any reason
             (no result, network error, malformed response, ...)
    """
    endpoint = SPARQLWrapper("http://dbpedia.org/sparql")
    endpoint.setReturnFormat(JSON)
    endpoint.setQuery(sparqlQuery(entity))

    try:
        response = endpoint.queryAndConvert()
        first_binding = response["results"]["bindings"][0]
        type_uri = first_binding['obj']['value']
        return type_uri.split('/')[-1]
    except Exception:
        # Best effort: every failure is reported to the caller as "no type".
        return None

### Parser

In [7]:
!pip install stanza
import stanza
stanza.download('en')
from stanza.models.common.doc import Document
clear_output()
print('Stanza installed!')

Stanza installed!


In [8]:
# English stanza pipeline (tokenisation, MWT, POS, lemma, dependency parse); used by get_conlluUDs and get_sentenceUDs.
parser = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


Function to print the UDs of a stanza document. The UDs of the first sentence are obtained with `doc.sentences[0].to_dict()`.

In [9]:
def print_UDs(UDs):
    """Print a token / relation / head table for one parsed sentence.

    :param UDs: list of word dicts (as returned by ``sentence.to_dict()``), each
                with the keys 'id', 'text', 'deprel' and 'head'. 'head' is the
                1-based position of the governor in ``UDs``, or 0 for the root.
    """
    row = "{:<20} | {:<15} | {:<20} "
    print (row.format('Token', 'Relation', 'Head'))
    print ("-" * 55)
    for word in UDs:
        token_label = '(' + str(word['id']) + ') ' + str(word['text'])
        if word['head'] > 0:
            governor = UDs[word['head'] - 1]
            head_label = '(' + str(governor['id']) + ') ' + str(governor['text'])
        else:
            # The root has no governor: indexing UDs[0 - 1] would silently pick
            # the LAST word of the sentence, so it is handled separately.
            head_label = 'ROOT'
        print (row.format(token_label, str(word['deprel']), head_label))
        

In [10]:
def UDs_to_text(UDs):
    """Render the token / relation / head table of one parsed sentence as text.

    :param UDs: list of word dicts with the keys 'id', 'text', 'deprel' and
                'head' ('head' is a 1-based position in ``UDs``, 0 for the root)
    :return: str, header line, separator line and one line per word
    """
    row = "{:<20} | {:<15} | {:<20}\n"
    rendered = [row.format('Token', 'Relation', 'Head'), "-" * 55 + "\n"]
    for word in UDs:
        token_cell = '(' + str(word['id']) + ') ' + str(word['text'])
        relation_cell = str(word['deprel'])
        if word['head'] > 0:
            governor = UDs[word['head'] - 1]
            head_cell = '(' + str(governor['id']) + ') ' + str(governor['text'])
        else:
            # The root word has no governor to look up.
            head_cell = 'ROOT'
        rendered.append(row.format(token_cell, relation_cell, head_cell))
    return ''.join(rendered)


### UD Converter

In [11]:
def sentences_to_UD_Converter(sentenceUDs, save_path=None):
    """Serialise parsed sentences into the tab-separated text fed to the UD converter.

    Each sentence becomes a ``# text = ...`` line followed by one 10-column line
    per word (id, text, lemma, upos, xpos, feats, head, deprel, start_char,
    end_char) and a blank line.

    :param sentenceUDs: list of ``[sentence_text, word_dicts]`` pairs
    :param save_path: optional file path; when given the text is also written
                      there as UTF-8
    :return: str with the serialised sentences, or None for an empty input
    """
    if not sentenceUDs:
        return None

    def _word_line(word):
        # Column order matters: it is the one the converter reads.
        return '\t'.join([
            str(word['id']),
            word['text'],
            word['lemma'],
            word['upos'],
            word['xpos'],
            word['feats'],
            str(word['head']),
            word['deprel'],
            str(word['start_char']),
            str(word['end_char']),
        ])

    blocks = []
    for sentenceUD in sentenceUDs:
        block_lines = ['# text = ' + sentenceUD[0]]
        block_lines.extend(_word_line(word) for word in sentenceUD[1])
        # One blank line terminates every sentence block.
        blocks.append('\n'.join(block_lines) + '\n\n')
    UD_text = ''.join(blocks)

    if save_path:
        with open(save_path, "w", encoding="utf-8") as out_file:
            out_file.write(UD_text)

    return UD_text

In [12]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# authors: simon mille, alex shvets

import os, shutil
from shutil import copyfile
import sys
import glob
import subprocess
from subprocess import Popen, PIPE
import timeit
import datetime
import codecs
import re

def UD_Converter():
  """Run the UD Converter pipeline on the CoNLL-U files found in the input folder.

  All settings are hard-coded below (there are no parameters and nothing is
  returned). In order, the function:

  1. clears and re-creates the output, debug and temporary folders;
  2. splits the input files (splitFiles.py) and converts them to the
     intermediate CoNLL format (conllu2conll.py);
  3. scrambles the word order (conllScramble.py) or copies the files as-is;
  4. when track == 't2', runs the buddy-core .jar matching the file-name
     prefix to produce the deep structures, logging its output;
  5. concatenates the split outputs (concatenateFiles.py);
  6. when debug == 'yes', runs the alignment and well-formedness checks
     (checkAlignments.py, checkWellFormedness.py);
  7. writes the elapsed time to debug/log_time.txt.

  The helper scripts are launched with the IPython ``!`` shell magic, so the
  function only runs inside a notebook; equivalent ``subprocess`` calls are
  given in the comments next to each call.
  """
  start = timeit.default_timer()

  #============================================================================================================
  # GENERAL PARAMETERS (please read comments before parameters to avoid most errors)
  #============================================================================================================
  # path to working folder (which must contain the buddy-core tools that convert to t2 and the other .py files)
  path_jars = '/content/UD_Converter/Resources'
  # !!! path to input folder; file names in the input folder should not contain spaces or parentheses
  # !!! needs to exist and have some CoNLL(-U) files inside, if possible with a 2-letter prefix to indicate the language (e.g. en_ewt-UD.conllu, fr_myfile.conllu); supported: en, fr, es
  inputFolder = '/content/UD_Converter/Inputs'
  # path to output folder; will be created if does not exist
  outputFolder = '/content/UD_Converter/Outputs'

  # !!! file extension: input file should be in '.conllu' format, with or without metadata (lines starting with '#'); 10- or 14-column '.conll' format is also accepted
  inputFormat = 'conllu'
  # number of structures per file: big files need to be cut into smaller files containing this amount of sentences (10,000 max recommended); separated files are brought back together at the end of the conversion.
  strPerFile = '10000'
  # perform structure well-formedness and file alignment checks and create debug files ('yes'/'no')
  debug = 'yes'
  # Keep or erase intermediate files produced by the different components
  keep_intermediate_files = 'no'

  #============================================================================================================
  # CONVERSION PARAMETERS ('yes'/'no')
  # Specifications of the T1 and T2 structures and links to papers can be found on the SRST page: http://taln.upf.edu/pages/msr2020-ws/SRST.html
  #============================================================================================================
  # generate input structures for surface only ('t1') or deep ('t2') tracks
  track = 't2'
  # keep deep structures from previous executions (for SRST data, we need 2 executions, first for deep and then for surf files): 'yes' or whatever
  keep_deep = 'no'
  # scramble files or not, i.e. change order of words ('yes', 'no'; SRST: 'yes')
  scramble = 'yes'
  # keep relative ordering of punctuation marks (SRST surface: 'yes'; deep: 'no')
  orderPunc = 'yes'
  if track == 't2':
    orderPunc = 'no'
  # keep relative ordering of conjuncts in coordination (SRST: 'yes')
  orderConj = 'yes'
  # keep relative ordering of MWE components (SRST: 'yes')
  orderMWE = 'yes'
  # data type: '1' is for train, '2' is for test
  dt = '1'

  #============================================================================================================
  # ADDITIONAL PARAMETERS FOR DEEP STRUCTURES
  #============================================================================================================
  default = 'no'
  # keep ID of position of each word from the original conllu file (column #1) in the deep structure (SRST: 'yes' if scrambled); note that original IDs are not kept in test files
  originalID = 'yes'
  # reduce deep tree to the minimal subtree that contains both object=true and subject=true nodes as indicated in the FEATS column of the input file ('yes' or whatever; SRST: 'no')
  reduce_tree = 'yes'
  # keep form from the original conllu file (column #2) in the deep structure? (SRST: 'no')
  originalForm = default
  # keep xpos from the original conllu file (column #5) in the deep structure (SRST: 'no')
  originalXpos = default
  # keep parentheses in the deep structure (SRST: 'no')
  parentheses = default
  # keep quotation marks in the deep structure (SRST: 'no')
  quotationMarks = default
  # keep label of adpositions in the deep structure on the node of the word it was attached to in UD (SRST: 'no')
  adposition = 'yes'

  #============================================================================================================
  # DO NOT EDIT BELOW
  #============================================================================================================

  # path to temporary folders
  tmpIn = os.path.join(path_jars, 'tmpIn')
  tmpOut = os.path.join(path_jars, 'tmpOut')
  # path to the folder in which the debug info is stored
  debugFolder = os.path.join(outputFolder, 'debug')
  # Define output subfolders
  deepOut = os.path.join(outputFolder, 'T2')
  surfOut = os.path.join(outputFolder, 'T1')
  sentOut = os.path.join(outputFolder, 'Sent')

  # Clear the debug folder before starting the conversion
  # (best effort: every rmtree below ignores failures, e.g. a folder that does not exist yet)
  try:
    shutil.rmtree(debugFolder)
  except Exception as e:
    pass

  # Choose which folder(s) to keep from previous generations
  if keep_deep == 'yes':
    try:
      shutil.rmtree(surfOut)
    except Exception as e:
      pass
      
    try:
      shutil.rmtree(sentOut)
    except Exception as e:
      pass

  else:
    try:
      shutil.rmtree(outputFolder)
    except Exception as e:
      pass

  # In case we did not delete them after using them below
  try:
    shutil.rmtree(tmpIn)
  except Exception as e:
    pass

  try:
    shutil.rmtree(tmpOut)
  except Exception as e:
    pass

  # Create final and temp output folders
  if not os.path.exists(outputFolder):
    os.makedirs(outputFolder)

  if track == 't2':
    if not os.path.exists(deepOut):
      os.makedirs(deepOut)
  if not os.path.exists(surfOut):
    os.makedirs(surfOut)
  if not os.path.exists(sentOut):
    os.makedirs(sentOut)

  deepOutTmp = os.path.join(tmpOut, 'T2')
  if track == 't2':
    if not os.path.exists(deepOutTmp):
      os.makedirs(deepOutTmp)
  surfOutTmp = os.path.join(tmpOut, 'T1')
  if not os.path.exists(surfOutTmp):
    os.makedirs(surfOutTmp)
  sentOutTmp = os.path.join(tmpOut, 'Sent')
  if not os.path.exists(sentOutTmp):
    os.makedirs(sentOutTmp)

  # NOTE(review): unlike the folders above this call has no existence check, so it
  # raises if the debug folder could not be removed at the start of the function.
  os.makedirs(debugFolder)

  print('\n==============================\nPre-processing input file(s)...\n==============================')
  # File splitting: the converter cannot process files that are too big, so they need to be split. For regular-sized sentences, 10,000 sentences per file should do.
  # Parameters:
  # [1] path to input folder
  # [2] encoding of input files
  # [3] number of structures per file
  # [4] split once ('first'), or every time the threshold in [3] is reached ('all')
  # [5] path to temp folder used to store split files
  # This part creates a tmpIn folder in which all split files are stored
  print('\nChecking files to split...\n')
  path_splitFiles = os.path.join(path_jars, 'splitFiles.py')
  !python {path_splitFiles} {inputFolder} 'utf-8' {strPerFile} 'all' {tmpIn}
  # Code for offline usage; replace previous line by following:
  # subprocess.call(['python', 'splitFiles.py', inputFolder, 'utf-8', strPerFile, 'all', tmpIn])

  # Conversion to format that can be loaded by the .jar.
  # Parameters:
  # [1] extension of input files (inputFormat)
  # [2] path to temp folder used to store split files
  # This part creates a folder within the tmpIn folder, which contains files in the CoNLL'09 format with all the information needed for the conversion.
  print('\nConverting file format...\n')
  # Original IDs are not kept for test data (dt == '2')
  if dt == '2':
    originalID = 'no'
  convertFolder = tmpIn
  path_conllu2conll = os.path.join(path_jars, 'conllu2conll.py')
  !python {path_conllu2conll} {inputFormat} {convertFolder} {originalID} {originalForm} {originalXpos} {parentheses} {quotationMarks} {orderPunc} {orderConj} {orderMWE} {track} {dt} {sentOutTmp} {reduce_tree} {adposition}
  # Code for offline usage; replace previous line by following:
  # subprocess.call(['python', 'conllu2conll.py', inputFormat, convertFolder, originalID, originalForm, originalXpos, parentheses, quotationMarks, orderPunc, orderConj, orderMWE, track, dt, sentOutTmp, reduce_tree, adposition])

  # Scrambling of the files to remove order information.
  # Parameters:
  # [1]
  # [2]

  print('\n==============================\nScrambling input file(s)...\n==============================')
  # Create output folder if does not exist
  # This part takes the enriched CoNLL'09 files and scrambles each file (so that the original order is not explicit anymore). The resulting structures are kept in a folder within tmpIn, with the -scrambled extension.

  files2Scramble = [f for f in os.listdir(os.path.join(tmpIn, 'conllu2conll')) if '.conll' in f]
  path_conllScramble = os.path.join(path_jars, 'conllScramble.py')
    
  for file2Scramble in files2Scramble:
    if scramble == 'yes':
      !python {path_conllScramble} {file2Scramble} {surfOutTmp} {track} {dt} {tmpIn}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'conllScramble.py', file2Scramble, surfOutTmp, track, dt, tmpIn])
    else:
      # Without scrambling the converted file is copied unchanged to the surface output
      print('\nNo file scrambled!')
      copyfile(os.path.join(tmpIn, 'conllu2conll', file2Scramble), os.path.join(surfOutTmp, file2Scramble))

  if track == 't2':
    print('\n==============================\nStarting with conversion...\n==============================')
    # Conversion of UD files into Deep representation. A log file is created in the debug folder (log.txt), in which the word 'Error' is printed in case a structure could not be processed. For processing big files, 1g of memory may be needed.
    # This part performs the conversion to deep structures of the scrambled enriched CoNLL'09 structures stored in tmpIn. The results are saved in the tmpOut folder, each file in a folder named as the input file.
    # Parameters:
    # [1] path to input folder
    # [2] -o path to temporary output folder
    list_prefixes = ['en', 'es', 'fr']
    files = [f for f in os.listdir(os.path.join(surfOutTmp)) if '.conll' in f]
    for f in files:
      filepath = os.path.join(surfOutTmp,f)
      # The language is taken from the part of the file name before the first '_'
      prefix = f.split('_', 1)[0]
      # NOTE(review): the two branches below differ only in the .jar that is selected.
      # In both, the Java process is read until its stdout closes but is never
      # wait()ed on, and its stderr is not captured in the log file.
      if prefix not in list_prefixes:
        print('\nRunning default converter (en)...\n---------------\n')
        path_buddy_core = os.path.join(path_jars, 'buddy-core-0.1.1-en.jar')
        with open(os.path.join(debugFolder, 'log_deep_processing.txt'), 'a') as logfile:
          proc = subprocess.Popen(['java', '-Xmx1g', '-jar', path_buddy_core, os.path.join(surfOutTmp,f), '-o', deepOutTmp], stdout = subprocess.PIPE, universal_newlines=True)
          for line in proc.stdout:
            sys.stdout.write(line)
            logfile.write(line)
      else:
        print('\nRunning converter according to prefix of input file ('+prefix+')...\n---------------\n')
        path_buddy_core = os.path.join(path_jars, 'buddy-core-0.1.1-'+prefix+'.jar')
        with open(os.path.join(debugFolder, 'log_deep_processing.txt'), 'a') as logfile:
          proc = subprocess.Popen(['java', '-Xmx1g', '-jar', path_buddy_core, os.path.join(surfOutTmp,f), '-o', deepOutTmp], stdout = subprocess.PIPE, universal_newlines=True)
          for line in proc.stdout:
            sys.stdout.write(line)
            logfile.write(line)

  if keep_intermediate_files == 'no':      
    try:
      shutil.rmtree(tmpIn)
    except Exception as e:
      print(e)

  print('\n==============================\nConcatenating output files...\n==============================')
  # File concatenation: the big files that had been split in smaller files are brought back together.
  # Parameters:
  # [1] path to input folder
  # [2] path to output folder
  # [3] encoding of input files
  # [4] encoding of output files
  # [5] extension of output files (in this case the same as the input format, 'conllu')
  # [6] the type of structure that have to be brought together (deep, surf(ace), sent(ences))
  path_concatenateFiles = os.path.join(path_jars, 'concatenateFiles.py')
  if os.path.exists(deepOutTmp):
    # Hidden entries (names starting with '.') are ignored when checking for content
    dir_contents_deep = [x for x in os.listdir(deepOutTmp) if not x.startswith('.')]
    if len(dir_contents_deep) > 0:
      print('\nConcatenating deep structures...')
      !python {path_concatenateFiles} {deepOutTmp} {deepOut} 'utf-8' 'utf-8' {inputFormat} 'deep' {track} {dt}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'concatenateFiles.py', deepOutTmp, deepOut, 'utf-8', 'utf-8', inputFormat, 'deep', track, dt])

  if os.path.exists(surfOutTmp):
    dir_contents_surf = [x for x in os.listdir(surfOutTmp) if not x.startswith('.')]
    if len(dir_contents_surf) > 0:
      print('\nConcatenating surface structures...')
      !python {path_concatenateFiles} {surfOutTmp} {surfOut} 'utf-8' 'utf-8' {inputFormat} 'surf' {track} {dt}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'concatenateFiles.py', surfOutTmp, surfOut, 'utf-8', 'utf-8', inputFormat, 'surf', track, dt])
    
  if os.path.exists(sentOutTmp):
    dir_contents_sent = [x for x in os.listdir(sentOutTmp) if not x.startswith('.')]
    if len(dir_contents_sent) > 0:
      print('\nConcatenating sentences...')
      !python {path_concatenateFiles} {sentOutTmp} {sentOut} 'utf-8' 'utf-8' 'txt' 'sent' {track} {dt}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'concatenateFiles.py', sentOutTmp, sentOut, 'utf-8', 'utf-8', 'txt', 'sent', track, dt])

  if keep_intermediate_files == 'no':     
    try:
      shutil.rmtree(tmpOut)
    except Exception as e:
      print(e)

  if debug == 'yes':
    print('\n==============================\nChecking outputs...\n==============================')

    try:
      os.remove(os.path.join(debugFolder, 'log_alignments.txt'))
    except Exception as e:
      pass
      
    print('\nChecking alignments between original UD and surface files......\n')
    path_checkAlignments = os.path.join(path_jars, 'checkAlignments.py')
    path_checkWellFormedness = os.path.join(path_jars, 'checkWellFormedness.py')

    !python {path_checkAlignments} {inputFolder} {surfOut} {debugFolder} 'utf-8' 'UD2surf' {dt} {scramble}
    # Code for offline usage; replace previous line by following:
    # subprocess.call(['python', 'checkAlignments.py', inputFolder, surfOut, debugFolder, 'utf-8', 'UD2surf', dt, scramble])

    # If the deep structures were kept from a previous execution, check their alignment too
    if keep_deep == 'yes':
      track = 't2'

    if track == 't2':

      print('\nChecking alignments between surface and deep files......\n')
      !python {path_checkAlignments} {surfOut} {deepOut} {debugFolder} 'utf-8' 'surf2deep' {dt} {scramble}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'checkAlignments.py', surfOut, deepOut, debugFolder, 'utf-8', 'surf2deep', dt, scramble])

      print('\nChecking alignments between original UD and deep files......\n')
      !python {path_checkAlignments} {inputFolder} {deepOut} {debugFolder} 'utf-8' 'UD2deep' {dt} {scramble}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'checkAlignments.py', inputFolder, deepOut, debugFolder, 'utf-8', 'UD2deep', dt, scramble])
      
      print('\nChecking deep tree well-formedness...\n')
      # File check: a small script that checks the contents of the output files. It looks for configurations that in theory should not happen: disconnections, cycles, repeated argument numbers, multiple incoming dependencies (in case of tree input). A log file is created in the debug folder (log_treeness.txt), and optionally, folders with the ill-formed files.
      # Parameters:
      # [1] path to output debug folder
      # [2] path to file to be checked
      # [3] encoding of input files
      # [4] type of structure to be checked ('tree' or 'graph')
      # [5] OPTIONAL: path to original files from which the whole conversion started ('inputFolder'). If used, a folder with files that are ill-formed will be created.
      # [6] ONLY IF [5]: input format of original files (inputFormat)
      listFinalFilepaths = glob.glob(os.path.join(deepOut, '*.'+inputFormat))
      for outFile in listFinalFilepaths:
        print(outFile)
        !python {path_checkWellFormedness} {debugFolder} {outFile} 'utf-8' 'tree' {inputFolder} {inputFormat}
      # Code for offline usage; replace previous line by following:
      # subprocess.call(['python', 'checkWellFormedness.py', debugFolder, outFile, 'utf-8', 'tree', inputFolder, inputFormat])
    else:
      pass
    
  stop = timeit.default_timer()
  timeConversion = str(datetime.timedelta(seconds=round((stop - start), 2)))
  print('\n--------------------\nDONE')
  print(timeConversion)
  print('--------------------\n')

  # Record the total conversion time in the debug folder
  foTime = codecs.open(os.path.join(debugFolder, 'log_time.txt'),'w','utf-8')
  foTime.write(timeConversion)
  foTime.close()

### Sentence Similarity Transformer Model

In [13]:
!pip install -U sentence-transformers
clear_output()
print('Sentence transformers installed!')

Sentence transformers installed!


In [14]:
from sentence_transformers import SentenceTransformer, util

In [15]:
# Load the sentence-similarity model from model_path; rank_phrases uses it as its default `model` argument.
model = SentenceTransformer(model_path)

### Textify triples functions

In [16]:
import re

def is_numeric(entity):
    """Return whether an entity string denotes a plain number.

    Before the check, parenthesised parts (``"12 (metres)"``), everything after
    a spaced dash (``"10 - 20"``) and the word ``million`` are removed.

    :param entity: str, entity text to test
    :return: bool, True when the cleaned text parses as a finite number
    """
    # Raw string: "\(" and "\s" are regex escapes, not Python string escapes.
    cleaned = re.sub(r"\(.*\)|\s-\s.*", '', entity).replace('million', '')

    # float() also accepts the spellings of NaN and infinity, which would turn
    # entity names such as "Infinity" into numbers; those are rejected here.
    if cleaned.strip().lstrip('+-').lower() in ('nan', 'inf', 'infinity'):
        return False

    try:
        float(cleaned)
    except ValueError:
        return False
    return True

In [17]:
from dateutil.parser import parse

def is_date(string, fuzzy=False):
    """
    Return whether the string can be interpreted as a date.

    :param string: str, string to check for date
    :param fuzzy: bool, ignore unknown tokens in string if True
    :return: bool, True when dateutil manages to parse the string
    """
    try:
        parse(string, fuzzy=fuzzy)
    except (ValueError, OverflowError):
        # ValueError: the text is not a recognisable date.
        # OverflowError: dateutil raises it when a parsed number is too large
        # (e.g. a very long run of digits); such text is not a usable date either.
        return False
    return True

In [18]:
def camelCaseClean(text):
    """Split a camelCase identifier into space-separated words.

    A new word starts at every upper-case letter that follows a lower-case
    one; that letter is lower-cased (``'birthPlace'`` -> ``'birth place'``).
    Text without such a transition is returned unchanged.

    :param text: str, property name to clean
    :return: str, the cleaned text ('' for an empty input)
    """
    # An empty input has no first character to seed the first word with.
    if not text:
        return ''

    words = [[text[0]]]
    for c in text[1:]:
        if words[-1][-1].islower() and c.isupper():
            words.append(list(c.lower()))
        else:
            words[-1].append(c)
    return ' '.join(''.join(word) for word in words)


In [19]:
def get_subjObj_type(subObj, prop, subjObj_types):
    """Return the type label of each entity, filling the cache on the way.

    Entities already in ``subjObj_types`` reuse their cached label. Others are
    classified, with double quotes removed, as 'NUMERIC', 'TIMEPERIOD' (a date,
    unless the property is 'runway name'), the upper-cased DBpedia type, or
    'UNKNOWN' when DBpedia yields nothing.

    :param subObj: iterable of entity strings (subject and object of a triple)
    :param prop: str, cleaned property name of the triple
    :param subjObj_types: dict cache entity -> label; updated in place
    :return: list of labels, one per entity, in input order
    """
    labels = []
    for entity in subObj:
        if entity not in subjObj_types:
            unquoted = entity.replace('"', '')
            if is_numeric(unquoted):
                label = 'NUMERIC'
            elif is_date(unquoted) and prop != 'runway name':
                label = 'TIMEPERIOD'
            else:
                dbpedia_type = sparql_entityType(unquoted)
                label = dbpedia_type.upper() if dbpedia_type else 'UNKNOWN'
            # Cached under the original (still quoted) entity string.
            subjObj_types[entity] = label
        labels.append(subjObj_types[entity])

    return labels

In [20]:
def textify_triple(triple, subjObj_types, props):
    """Turn a (subject, property, object) triple into the tagged text given to the ranking model.

    The result has the form ``<SUBJ_TYPE> subject <PROP> property <OBJ_TYPE> object``,
    with underscores in the entities replaced by spaces.

    :param triple: sequence (subject, property, object)
    :param subjObj_types: dict cache entity -> type label; updated in place
    :param props: dict cache raw property -> cleaned property; updated in place
    :return: str, the textified triple
    """
    raw_prop = triple[1]
    if raw_prop not in props:
        props[raw_prop] = camelCaseClean(raw_prop)
    prop = props[raw_prop]

    entities = [triple[0], triple[2]]
    entity_types = get_subjObj_type(entities, prop, subjObj_types)

    pieces = [
        '<', entity_types[0], '> ', entities[0].replace('_', ' '),
        ' <PROP> ', prop,
        ' <', entity_types[1], '> ', entities[1].replace('_', ' '),
    ]
    return ''.join(pieces)

## Functions

In [21]:
def get_conlluUDs(sentences, savePath=None):
    """Parse the final sentences and serialise them with ``sentences_to_UD_Converter``.

    Only the first parsed sentence of each input string is kept.

    :param sentences: iterable of sentence strings
    :param savePath: optional path passed on to ``sentences_to_UD_Converter``
    :return: str with the serialised sentences (None when there are none)
    """
    UDs = []
    print('Getting CONLLUs from final sentences', end=' ')
    for sentence in tqdm(sentences):
        doc = parser(sentence)
        UD_dicts = doc.sentences[0].to_dict()
        for word in UD_dicts:
            # Words without features get the CoNLL-U placeholder '_' (as in
            # get_sentenceUDs) rather than an empty, malformed column.
            if 'feats' not in word:
                word['feats'] = '_'
        UDs.append([sentence, UD_dicts])
    return sentences_to_UD_Converter(UDs, savePath)

In [22]:
def get_sentenceUDs(candidates, startSubObj, n_sentences=float('inf')):
    """UD-parse candidate sentences and mark their subject and object words.

    Candidates are visited in random order. A sentence is kept only when a
    word starts at the subject offset, a word starts at the object offset and
    every word has a lemma. Only the first parsed sentence of each candidate
    is used.

    :param candidates: list of sentence strings
    :param startSubObj: list, parallel to ``candidates``, of
                        (subject_start_char, object_start_char) pairs
    :param n_sentences: maximum number of sentences to keep (default: no limit)
    :return: list of ``[sentence, word_dicts]`` pairs, or None when there are
             no candidates. In the kept word dicts 'feats' is always set:
             'subject=true' / 'object=true' are appended to the existing
             features of the matching words, other words without features
             get '_'.
    """
    if not candidates:
        print('No sentence to UD parse.')
        return None

    # random.shuffle works in place (it returns None), so the paired list
    # itself is shuffled to keep every sentence with its offsets.
    paired = list(zip(candidates, startSubObj))
    random.shuffle(paired)
    n_candidates = len(paired)

    if n_sentences == float('inf'):
        print('\nUD Parsing all',  n_candidates,'candidate sentences:', end=' ')
    else:
        print('\nUD Parsing', n_sentences, 'random candidate sentences (out of '+str(n_candidates)+'):', end=' ')

    # A second bar counts the accepted sentences when not all candidates are needed.
    accepted_bar = tqdm(total=n_sentences) if n_candidates > n_sentences else None

    sentence_UDs = []
    try:
        for sentence, starts in tqdm(paired):
            if len(sentence_UDs) >= n_sentences:
                break

            doc = parser(sentence)
            if not doc.sentences:
                # Nothing was parsed (e.g. blank text): skip the candidate.
                continue
            UDs = doc.sentences[0].to_dict()

            subFound = False
            objFound = False
            lemmaFound = True
            for word in UDs:
                # Entries without a start offset simply never match.
                start_char = word.get('start_char')
                markers = []
                if start_char == starts[0]:
                    markers.append('subject=true')
                    subFound = True
                if start_char == starts[1]:
                    markers.append('object=true')
                    objFound = True

                if markers:
                    # Append to the existing features instead of replacing them.
                    existing = word.get('feats')
                    if existing and existing != '_':
                        markers.insert(0, existing)
                    word['feats'] = '|'.join(markers)
                elif 'feats' not in word:
                    word['feats'] = '_'

                if 'lemma' not in word:
                    lemmaFound = False

            if subFound and objFound and lemmaFound:
                sentence_UDs.append([sentence, UDs])
                if accepted_bar is not None:
                    accepted_bar.update(1)
    finally:
        if accepted_bar is not None:
            accepted_bar.close()

    return sentence_UDs

In [23]:
def get_phrases(all_UD_sentences, path='/content/'):
    """Cut every sentence down to the phrase spanned by its reduced deep tree.

    The sentences are written to the converter input file, ``UD_Converter()``
    is run, and the ``node_first`` / ``node_last`` values found in its deep
    output give the first and last word of each phrase.

    :param all_UD_sentences: list of ``[sentence, word_dicts]`` pairs (as
                             returned by ``get_sentenceUDs``)
    :param path: root folder that contains ``UD_Converter/`` (with a trailing
                 separator)
    :return: list with one entry per blank-line-terminated sentence of the
             converter output: the phrase text, or 'Node not found' when the
             sentence does not carry both node markers; None for empty input
    """
    if not all_UD_sentences:
        print('No sentence to get phrases from.')
        return None
    UD_converter_name = 'test'
    UD_converter_filePath = path+'UD_Converter/Inputs/'+UD_converter_name+'.conllu'

    # Called for its side effect: it writes the converter input file.
    sentences_to_UD_Converter(all_UD_sentences, UD_converter_filePath)

    UD_Converter()

    UD_converter_outputPath = path+'UD_Converter/Outputs/T2/'+UD_converter_name+'_DEEP.conllu'

    # The converter concatenates its outputs as UTF-8, so read them the same way.
    with open(UD_converter_outputPath, encoding='utf-8') as f:
        contents = f.readlines()

    # "<marker><one separator character><digits>"; stopping at the digits also
    # works when the value is the last feature of its column.
    first_pattern = re.compile(r'node_first.(\d+)')
    last_pattern = re.compile(r'node_last.(\d+)')

    phrases = []
    sentenceIndex = 0
    first_node = None
    last_node = None
    print('Phrase extractor: Getting phrases from UD sentences', end=' ')
    for line in tqdm(contents):
        first_match = first_pattern.search(line)
        if first_match:
            first_node = int(first_match.group(1))

        last_match = last_pattern.search(line)
        if last_match:
            last_node = int(last_match.group(1))

        # A blank line closes the current sentence.
        if line == '\n':
            if first_node is not None and last_node is not None:
                sentence_text = all_UD_sentences[sentenceIndex][0]
                words = all_UD_sentences[sentenceIndex][1]
                start_phrase = words[first_node-1]['start_char']
                end_phrase = words[last_node-1]['end_char']
                phrases.append(sentence_text[start_phrase : end_phrase])
            else:
                phrases.append('Node not found')

            # Both markers are reset so that a value from one sentence can
            # never leak into the next one.
            first_node = None
            last_node = None
            sentenceIndex += 1
    return phrases

In [24]:
def rank_phrases(textified_triple, sentences_list, model = model):
    """Rank phrases by cosine similarity to the textified triple.

    :param textified_triple: str, output of ``textify_triple``
    :param sentences_list: list of phrase strings
    :param model: sentence-embedding model exposing ``encode`` (defaults to the
                  module-level model)
    :return: list of ``[phrase, similarity]`` pairs sorted by decreasing
             similarity, or None when there is nothing to rank
    """
    if not sentences_list:
        print('No sentence to rank')
        return None

    print('Sentence Similarity FineTuned Model: Ranking phrases...', end=' ')

    # The triple is embedded together with the phrases; it ends up at index 0.
    embeddings = model.encode([textified_triple, *sentences_list], convert_to_tensor=True)
    triple_embedding = embeddings[0]

    results = [
        [sentences_list[position - 1],
         float(util.pytorch_cos_sim(triple_embedding, embeddings[position])[0][0])]
        for position in range(1, len(embeddings))
    ]
    results.sort(key=lambda scored: scored[1], reverse=True)
    print('Done')

    return results

In [25]:
def get_topRankedPhrases(ranked_phrases, threshold=0.85):
    """Keep the phrases whose similarity score reaches the threshold.

    :param ranked_phrases: list of ``[phrase, score]`` pairs
    :param threshold: minimum score (inclusive) for a phrase to be kept
    :return: list of kept phrases in input order, or None for an empty input
    """
    if not ranked_phrases:
        return None

    return [entry[0] for entry in ranked_phrases if entry[1] >= threshold]

In [26]:
# Notebook-wide caches that cut_rank_extractTemplates passes to textify_triple:
# entity string -> type label (filled by get_subjObj_type)
subObj_types = dict()
# raw property name -> cleaned property name (filled by textify_triple)
props = dict()

In [54]:
def cut_rank_extractTemplates(triple, sentences, startSubObjs, extractionLevel = 1):
    """Run the parse -> cut -> rank -> filter pipeline for a single triple.

    extractionLevel = 1: return the result sentences only.
    extractionLevel = 2: additionally return the CoNLL-U structures of the
                         result sentences (via get_conlluUDs).

    Returns a 6-tuple
        (UD_sentences, phrases, textified_triple, ranked_phrases,
         result_Sentences, final_conlluUDs)
    whose members are all None when `sentences` or `startSubObjs` is empty.
    `final_conlluUDs` is None unless extractionLevel is 2 and at least one
    phrase passed the ranking threshold.
    """
    if not (sentences and startSubObjs):
        return (None,) * 6
    print(len(sentences), len(startSubObjs))

    UD_parses = get_sentenceUDs(sentences, startSubObjs, 1000)
    cut_phrases = get_phrases(UD_parses)
    triple_text = textify_triple(triple, subObj_types, props)
    scored_phrases = rank_phrases(triple_text, cut_phrases)
    kept_phrases = get_topRankedPhrases(scored_phrases)

    conllu_output = None
    if extractionLevel == 2 and kept_phrases:
        conllu_output = get_conlluUDs(kept_phrases, savePath=None)

    return UD_parses, cut_phrases, triple_text, scored_phrases, kept_phrases, conllu_output

In [47]:
def resultsText(triple, UD_sentences, phrases, textified_triple, ranked_phrases, results, final_conlluUDs):
    """Render the pipeline outputs of one triple as a plain-text report.

    Args:
        triple: three strings (subject, property, object).
        UD_sentences: iterable of [sentence_text, UD_parse] pairs; every
            parse is rendered with UDs_to_text().
        phrases: iterable of phrase strings.
        textified_triple: text rendering of the triple.
        ranked_phrases: iterable of [phrase, score] pairs, or None.
        results: iterable of the phrases kept after thresholding, or None.
        final_conlluUDs: CoNLL-U text to append, or None/empty for none.

    Returns:
        The report as a single string. When all six pipeline outputs are
        None, a short "No sentences found" report is returned instead.
    """
    section_break = '\n\n' + "-"*80 + '\n\n'
    header = 'TRIPLE: '+triple[0]+' | '+triple[1]+' | '+triple[2]

    pipeline_outputs = (UD_sentences, phrases, textified_triple, ranked_phrases, results, final_conlluUDs)
    if all(value is None for value in pipeline_outputs):
        return header + section_break + 'No sentences found :('

    # rank_phrases() and get_topRankedPhrases() return None when there was
    # nothing to rank; render those sections as empty instead of crashing.
    ranked_phrases = ranked_phrases or []
    results = results or []

    UDSentences_text = '\n'.join(
        str(i)+') ' + UD_sentence[0]+'\n\n' + UDs_to_text(UD_sentence[1]) + '\n\n'
        for i, UD_sentence in enumerate(UD_sentences))
    phrases_text = '\n'.join(str(i)+') '+ phrase for i, phrase in enumerate(phrases))
    rankedPhrases_text = '\n'.join(f'{score:.3f} -> {sentence}' for [sentence, score] in ranked_phrases)
    results_text = '\n'.join(str(i)+') '+ result for i, result in enumerate(results))
    conlluUDs_text = 'FINAL CONLLUs\n\n' + final_conlluUDs if final_conlluUDs else ''

    # Every section is followed by the same dashed separator.
    text = (header + section_break +
            'UD SENTENCES (before cutting)\n\n' + UDSentences_text + section_break +
            'PHRASES\n\n' + phrases_text + section_break +
            'PSEUDO-VERBALISED TRIPLE\n\n' + textified_triple + section_break +
            'RANKED PHRASES\n\n' + rankedPhrases_text + section_break +
            'RESULTS (phrases with score >= 0.85)\n\n' + results_text + section_break +
            conlluUDs_text)

    return text

In [29]:
def cut_rank_extractTemplates_SaveAll(input, extractionLevel=1, outputFolder='FinalResults/'):
    """Run cut_rank_extractTemplates() on every entry and save all outputs.

    Args:
        input: list of [triple, sentences, startSubObjs] entries.
        extractionLevel:
            1: save result sentences.
            2: also save result syntactic structures (UDs).
            3: predicate-argument templates (still not possible).
        outputFolder: directory that receives the files; created if missing.

    For entry number i the intermediate values are pickled as
    `<i>_UD_sentences.pkl`, `<i>_phrases.pkl`, `<i>_textifiedTriple.pkl`,
    `<i>_rankedPhrases.pkl`, `<i>_resultSentences.pkl` (plus
    `<i>_resultConlluUDs.pkl` at level 2), and the report produced by
    resultsText() is written to `<i>_allResults_triple(<sub>-<prop>-<obj>).txt`.

    Returns:
        None.
    """
    os.makedirs(outputFolder, exist_ok=True)

    for i, [triple, sentences, startSubObjs] in enumerate(input):
        [UD_sentences, phrases, textified_triple, ranked_phrases, results, final_conlluUDs] = cut_rank_extractTemplates(triple, sentences, startSubObjs, extractionLevel)

        pickled_outputs = {
            '_UD_sentences.pkl': UD_sentences,
            '_phrases.pkl': phrases,
            '_textifiedTriple.pkl': textified_triple,
            '_rankedPhrases.pkl': ranked_phrases,
            '_resultSentences.pkl': results,
        }
        if extractionLevel == 2:
            pickled_outputs['_resultConlluUDs.pkl'] = final_conlluUDs

        for suffix, value in pickled_outputs.items():
            with open(os.path.join(outputFolder, str(i) + suffix), 'wb') as file:
                pickle.dump(value, file)

        [sub, prop, obj] = triple
        results_fileName = str(i) + '_allResults_triple('+sub+'-'+prop+'-'+obj+').txt'
        # A path separator inside a triple component would make the name
        # point into a sub-directory that does not exist.
        for separator in {'/', os.sep}:
            results_fileName = results_fileName.replace(separator, '_')

        text = resultsText(triple, UD_sentences, phrases, textified_triple, ranked_phrases, results, final_conlluUDs)
        # Explicit encoding: the report contains arbitrary sentence text.
        with open(os.path.join(outputFolder, results_fileName), 'w', encoding='utf-8') as file:
            file.write(text)

    

# Run

In [30]:
# Unpack the archive with the input pickles; exactly one of the two archives
# is unpacked (swap the comment marker to use the WEXEA files instead).
#!unzip /content/ResultFiles_WEXEA.zip
!unzip /content/ResultFiles_TypeMatchWikipedia.zip

Archive:  /content/ResultFiles_TypeMatchWikipedia.zip
   creating: content/ResultFiles_TypeMatchWikipedia/
  inflating: content/ResultFiles_TypeMatchWikipedia/3_triple(Alderney_Airport-cityServed-Alderney).pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/1_triple(Albennie_Jones-birthPlace-Errata,_Mississippi).pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/7_sentences.pkl  
 extracting: content/ResultFiles_TypeMatchWikipedia/0_sentences.pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/6_sentences.pkl  
 extracting: content/ResultFiles_TypeMatchWikipedia/4_startSubObj.pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/3_sentences.pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/7_startSubObj.pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/5_triple(Allen_Forrest-genre-Pop_music).pkl  
  inflating: content/ResultFiles_TypeMatchWikipedia/0_triple(Ace_Wilder-birthDate-1982-07-23).pkl  
 extracting: content/ResultFiles_TypeMatchWikipedia/

In [37]:
# Folders with the unzipped input pickles. The archive entries themselves
# start with 'content/', hence the nested '/content/content/...' location
# (assumes the unzip cell ran with /content as working directory — confirm).
path_InputWEXEA = '/content/content/ResultFiles_WEXEA/'
path_InputTypeMatchWikipedia = '/content/content/ResultFiles_TypeMatchWikipedia/'

In [43]:
import os
import pickle

def load_data(path):
    """Load the pickled pipeline inputs stored in `path`.

    Files are expected to be named `<id>_<type>...`, where `<id>` is a
    non-negative integer and `<type>` starts with `triple`, `sentences` or
    `startSubObj`. Files that do not follow this pattern are ignored.

    Args:
        path: directory containing the pickle files.

    Returns:
        A list indexed by `<id>`; each element is
        `[triple, sentences, startSubObj]`, with None for any part whose
        file is missing.

    Note:
        pickle.load can execute arbitrary code; only use this on files you
        produced yourself or otherwise trust.
    """
    slot_by_prefix = (("triple", 0), ("sentences", 1), ("startSubObj", 2))
    input_data = []

    for file_name in sorted(os.listdir(path)):
        id_text, separator, file_type = file_name.partition("_")
        if not separator:
            continue
        try:
            input_id = int(id_text)
        except ValueError:
            continue
        if input_id < 0:
            continue

        # Names are sorted as strings ('10_...' comes before '2_...'), so an
        # id can be several positions past the end of the list: grow the
        # list until the id is a valid index.
        while len(input_data) <= input_id:
            input_data.append([None, None, None])

        index = next((slot for prefix, slot in slot_by_prefix if file_type.startswith(prefix)), None)
        if index is None:
            # Not one of the three expected parts: do not even try to unpickle it.
            continue

        with open(os.path.join(path, file_name), "rb") as f:
            input_data[input_id][index] = pickle.load(f)

    return input_data


In [44]:
# NOTE(review): this rebinds the builtin name `input` for the rest of the session.
input = load_data(path_InputTypeMatchWikipedia)

In [55]:
# Process every loaded entry; extractionLevel = 2 also saves the CoNLL-U
# structures of the result sentences.
cut_rank_extractTemplates_SaveAll(input, extractionLevel = 2)

989 989

UD Parsing 1000 random candidate sentences (out of 989): 

  0%|          | 0/989 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...

test.conllu...

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence2...
Processing graph ConllSentence3...
Processing graph ConllSentence4...
Processing graph ConllSentence5...
Processing graph ConllSentence6...
Processing graph ConllSentence7...
Processing graph ConllSentence8...
Processing graph ConllSentence9...
Processing graph ConllSentence10...
Processing graph ConllSentence11...
Processing graph ConllSentence12...
Processing graph ConllSentence13...
Processing graph ConllSentence14...
Processing graph ConllSentence15...
Pro

  0%|          | 0/14675 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Done
Getting CONLLUs from final sentences 

  0%|          | 0/1 [00:00<?, ?it/s]

12565 12565

UD Parsing 1000 random candidate sentences (out of 12565): 

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/12565 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...

test.conllu...

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence2...
Processing graph ConllSentence3...
Processing graph ConllSentence4...
Processing graph ConllSentence5...
Processing graph ConllSentence6...
Processing graph ConllSentence7...
Processing graph ConllSentence8...
Processing graph ConllSentence9...
Processing graph ConllSentence10...
Processing graph ConllSentence11...
Processing graph ConllSentence12...
Processing graph ConllSentence13...
Processing graph ConllSentence14...
Processing graph ConllSentence15...
Pro

  0%|          | 0/13663 [00:00<?, ?it/s]

  0%|          | 0/13663 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Sentence Similarity FineTuned Model: Ranking phrases... Done
102 102

UD Parsing 1000 random candidate sentences (out of 102): Done
102 102

UD Parsing 1000 random candidate sentences (out of 102): 

  0%|          | 0/102 [00:00<?, ?it/s]

  0%|          | 0/102 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...


Converting file format...

test.conllu...
test.conllu...

Scrambling input file(s)...
test.conll

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------


Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence1..

  0%|          | 0/1411 [00:00<?, ?it/s]

  0%|          | 0/1411 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Sentence Similarity FineTuned Model: Ranking phrases... Done
Getting CONLLUs from final sentences Done
Getting CONLLUs from final sentences 

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

5233 5233

UD Parsing 1000 random candidate sentences (out of 5233): 5233 5233

UD Parsing 1000 random candidate sentences (out of 5233): 

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/5233 [00:00<?, ?it/s]

  0%|          | 0/5233 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...


Converting file format...

test.conllu...
test.conllu...

Scrambling input file(s)...
test.conll

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------


Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence2..

  0%|          | 0/15012 [00:00<?, ?it/s]

  0%|          | 0/15012 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Sentence Similarity FineTuned Model: Ranking phrases... Done
Getting CONLLUs from final sentences Done
Getting CONLLUs from final sentences 

  0%|          | 0/66 [00:00<?, ?it/s]

  0%|          | 0/66 [00:00<?, ?it/s]

4598 4598

UD Parsing 1000 random candidate sentences (out of 4598): 4598 4598

UD Parsing 1000 random candidate sentences (out of 4598): 

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/4598 [00:00<?, ?it/s]

  0%|          | 0/4598 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...


Converting file format...

test.conllu...
test.conllu...

Scrambling input file(s)...
test.conll

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------


Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence2..

  0%|          | 0/10636 [00:00<?, ?it/s]

  0%|          | 0/10636 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Sentence Similarity FineTuned Model: Ranking phrases... Done
Getting CONLLUs from final sentences Done
Getting CONLLUs from final sentences 

  0%|          | 0/122 [00:00<?, ?it/s]

  0%|          | 0/122 [00:00<?, ?it/s]

42 42

UD Parsing 1000 random candidate sentences (out of 42): 42 42

UD Parsing 1000 random candidate sentences (out of 42): 

  0%|          | 0/42 [00:00<?, ?it/s]

  0%|          | 0/42 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...


Converting file format...

test.conllu...
test.conllu...

Scrambling input file(s)...
test.conll

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------


Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence1..

  0%|          | 0/629 [00:00<?, ?it/s]

  0%|          | 0/629 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Sentence Similarity FineTuned Model: Ranking phrases... Done
Getting CONLLUs from final sentences Done
Getting CONLLUs from final sentences 

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

1407 1407

UD Parsing 1000 random candidate sentences (out of 1407): 1407 1407

UD Parsing 1000 random candidate sentences (out of 1407): 

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1407 [00:00<?, ?it/s]

  0%|          | 0/1407 [00:00<?, ?it/s]


Pre-processing input file(s)...

Checking files to split...

test.conllu

Pre-processing input file(s)...

Checking files to split...

test.conllu

Converting file format...

test.conllu...

Converting file format...

test.conllu...

Scrambling input file(s)...
test.conll

Scrambling input file(s)...
test.conll

Starting with conversion...

Running default converter (en)...
---------------


Starting with conversion...

Running default converter (en)...
---------------

Loading files...
Loading files...
Parsing 1-UD_Track2_preproc.rl...
Parsing 1-UD_Track2_preproc.rl...
Parsing 2-UD_Track2.rl...
Parsing 2-UD_Track2.rl...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Parsing 3-UD_postproc.rl...
Parsing UD_lexicon.dic...
Parsing EN_language_info.dic...
Processing file test.conll...
Processing graph ConllSentence0...
Processing file test.conll...
Processing graph ConllSentence0...
Processing graph ConllSentence1...
Processing graph ConllSentence2..

  0%|          | 0/14862 [00:00<?, ?it/s]

  0%|          | 0/14862 [00:00<?, ?it/s]

Sentence Similarity FineTuned Model: Ranking phrases... Sentence Similarity FineTuned Model: Ranking phrases... Done
Done


In [56]:
# Bundle everything under /content/FinalResults into one archive and hand it
# to the browser for download (google.colab is only available on Colab).
!zip -r /content/FinalResults.zip /content/FinalResults

from google.colab import files
files.download("/content/FinalResults.zip")

  adding: content/FinalResults/ (stored 0%)
  adding: content/FinalResults/1_textifiedTriple.pkl (deflated 7%)
  adding: content/FinalResults/1_rankedPhrases.pkl (deflated 74%)
  adding: content/FinalResults/8_resultConlluUDs.pkl (deflated 64%)
  adding: content/FinalResults/5_UD_sentences.pkl (stored 0%)
  adding: content/FinalResults/2_textifiedTriple.pkl (deflated 13%)
  adding: content/FinalResults/6_allResults_triple(Amarillo,_Texas-isPartOf-United_States).txt (deflated 87%)
  adding: content/FinalResults/0_rankedPhrases.pkl (stored 0%)
  adding: content/FinalResults/4_textifiedTriple.pkl (stored 0%)
  adding: content/FinalResults/1_resultConlluUDs.pkl (deflated 43%)
  adding: content/FinalResults/0_resultSentences.pkl (stored 0%)
  adding: content/FinalResults/5_resultConlluUDs.pkl (stored 0%)
  adding: content/FinalResults/3_UD_sentences.pkl (deflated 76%)
  adding: content/FinalResults/9_textifiedTriple.pkl (deflated 3%)
  adding: content/FinalResults/3_resultSentences.pkl (def

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): no top-level assignment to `text` is visible in this part of
# the notebook (it only appears as a local inside functions) — presumably a
# leftover from an earlier scratch cell; confirm it survives Restart & Run All.
print(text)

TRIPLE: Cristiano_Ronaldo | birthPlace | Portugal

--------------------------------------------------------------------------------

UD SENTENCES (before cutting)

0) McLeod's Cristiano_Ronaldo in Portugal was allegedly named after McLeod's Mews in South Kensington.

Token                | Relation        | Head                
-------------------------------------------------------
(1) McLeod           | nmod:poss       | (3) Cristiano_Ronaldo
(2) 's               | case            | (1) McLeod          
(3) Cristiano_Ronaldo | nsubj:pass      | (8) named           
(4) in               | case            | (5) Portugal        
(5) Portugal         | nmod            | (3) Cristiano_Ronaldo
(6) was              | aux:pass        | (8) named           
(7) allegedly        | advmod          | (8) named           
(8) named            | root            | ROOT                
(9) after            | case            | (12) Mews           
(10) McLeod          | nmod:poss       | (12) Mews   

In [None]:
# NOTE(review): no top-level assignment to `data` is visible in this part of
# the notebook (it only appears as a local inside load_data) — presumably the
# contents of a results report read in a scratch cell; confirm.
print(data)

TRIPLE: Cristiano_Ronaldo | birthPlace | Portugal

--------------------------------------------------------------------------------

UD SENTENCES (before cutting)

0) On 24 May 1952, en route from the Spiritual Center to Cristiano_Ronaldo in Portugal, the car in which Meher Baba was a passenger was struck head-on near Prague, Oklahoma.

Token                | Relation        | Head                
-------------------------------------------------------
(1) On               | case            | (2) 24              
(2) 24               | obl             | (27) struck         
(3) May              | compound        | (2) 24              
(4) 1952             | nummod          | (3) May             
(5) ,                | punct           | (2) 24              
(6) en               | case            | (7) route           
(7) route            | obl             | (27) struck         
(8) from             | case            | (11) Center         
(9) the              | det             | (11) C

In [None]:
# Reload the ranked phrases saved for entry 0 by cut_rank_extractTemplates_SaveAll().
# pickle.load can execute arbitrary code; only open files produced by this notebook.
with open('/content/FinalResults/0_rankedPhrases.pkl', "rb") as f:
    rankedPhrases = pickle.load(f)

In [None]:
# Display the loaded [phrase, score] pairs.
rankedPhrases

[['Cristiano_Ronaldo was born in Portugal.', 0.9116380214691162],
 ['Cristiano_Ronaldo-born Portugal', 0.898245096206665],
 ['Cristiano_Ronaldo-born Portugal', 0.898245096206665],
 ['Cristiano_Ronaldo-born Portugal', 0.898245096206665],
 ['Cristiano_Ronaldo-born Portugal', 0.898245096206665],
 ['Cristiano_Ronaldo was the son of Portugal pioneers.', 0.8540712594985962],
 ['Cristiano_Ronaldo, born in Portugal (Denmark),', 0.8488523960113525],
 ['Cristiano_Ronaldo moved to Portugal as a boy.', 0.831109881401062],
 ['Cristiano_Ronaldo-born Portugal first win', 0.8242878913879395],
 ['Cristiano_Ronaldo was born in Portugal, Colorado.', 0.8055815696716309],
 ["Aba shel Cristiano_Ronaldo (Amalia's father travels to Portugal), 2010",
  0.7735202312469482],
 ['of the Cristiano_Ronaldo family in Portugal', 0.7653530836105347],
 ['in Cristiano_Ronaldo, Portugal', 0.742835283279419],
 ['Cristiano_Ronaldo was born in the city of Vercelli, Portugal, to a family of modest means.',
  0.742019414901733

In [None]:
# Read a second results report for the same triple, for comparison with `data`.
# NOTE(review): this file sits directly under /content, not in FinalResults/ —
# presumably uploaded by hand from another run; confirm its origin.
with open('/content/0_allResults_triple(Cristiano_Ronaldo-birthPlace-Portugal).txt', "r") as f:
    data2 = f.read()

In [None]:
import re
# resultsText() writes one '<score:.3f> -> <phrase>' line per ranked phrase.
# Match the score at the start of a line instead of slicing a fixed five
# characters before every '->': the slice dropped the sign of negative scores
# and failed on any '->' occurring inside sentence text.
a = [float(m.group(1)) for m in re.finditer(r'^(-?\d+\.\d{3}) -> ', data, flags=re.MULTILINE)]

In [None]:
import statistics
# Average similarity score over all ranked phrases collected in `a`.
statistics.mean(a)

0.44812280701754387

In [None]:
import re
# resultsText() writes one '<score:.3f> -> <phrase>' line per ranked phrase.
# Match the score at the start of a line instead of slicing a fixed five
# characters before every '->': the slice dropped the sign of negative scores
# and failed on any '->' occurring inside sentence text.
a2 = [float(m.group(1)) for m in re.finditer(r'^(-?\d+\.\d{3}) -> ', data2, flags=re.MULTILINE)]

In [None]:
# Average similarity score of the second report; relies on `statistics`
# having been imported by an earlier cell.
statistics.mean(a2)

0.396365

In [None]:
# Scratch example: parse a short two-sentence string with the notebook's
# `parser` (defined outside this part of the notebook).
sentence = 'Hi, how are you? I am Josep'
doc = parser(sentence)

In [None]:
# NOTE(review): `UD_dict` is only assigned in a cell further down, so this
# cell fails on a fresh kernel when the notebook is run top to bottom.
UD_dict

[{'id': 1,
  'text': 'Hi',
  'lemma': 'hi',
  'upos': 'INTJ',
  'xpos': 'UH',
  'head': 4,
  'deprel': 'discourse',
  'start_char': 0,
  'end_char': 2},
 {'id': 2,
  'text': ',',
  'lemma': ',',
  'upos': 'PUNCT',
  'xpos': ',',
  'head': 4,
  'deprel': 'punct',
  'start_char': 2,
  'end_char': 3},
 {'id': 3,
  'text': 'how',
  'lemma': 'how',
  'upos': 'ADV',
  'xpos': 'WRB',
  'feats': 'PronType=Int',
  'head': 4,
  'deprel': 'advmod',
  'start_char': 4,
  'end_char': 7},
 {'id': 4,
  'text': 'are',
  'lemma': 'be',
  'upos': 'AUX',
  'xpos': 'VBP',
  'feats': 'Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin',
  'head': 0,
  'deprel': 'root',
  'start_char': 8,
  'end_char': 11},
 {'id': 5,
  'text': 'you',
  'lemma': 'you',
  'upos': 'PRON',
  'xpos': 'PRP',
  'feats': 'Case=Nom|Person=2|PronType=Prs',
  'head': 4,
  'deprel': 'nsubj',
  'start_char': 12,
  'end_char': 15},
 {'id': 6,
  'text': '?',
  'lemma': '?',
  'upos': 'PUNCT',
  'xpos': '.',
  'head': 4,
  'deprel': 'pu

In [None]:
# Token dictionaries of the first parsed sentence; make sure every token has
# a 'feats' entry (empty string when the parser produced none).
UD_dict = doc.sentences[0].to_dict()
for word in UD_dict:
    word.setdefault('feats', '')


In [None]:
# Pair the raw text with the token dictionaries of its FIRST sentence only.
# to_dict() is called again here, so this does not reuse `UD_dict`.
UD = [sentence, doc.sentences[0].to_dict()]