In [9]:
'''
    CORA dataset builder
    
    Hotfixes:
     1) Pages - Remove "pp." and "pages" around digits
     2) Volume - Remove "Vol."
'''
import pandas as pd
import os
import glob
import numpy as np
import multiprocessing
import string
import random
import time
import itertools
import re

# ----- Tokenizer Starts ----- #

'''
  This method takes an element and a label and produces the proper B-/I-/O label for each token.
  element -> Can be an XML-tagged fragment, a word token or a punctuation token
'''
def getTagLabelPairs(element, label=None):
  """Label the tokens of one citation fragment.

  Args:
    element: An XML-tagged fragment (e.g. "<author> A. Cau </author>"),
      a bare word, or a single punctuation character.
    label: Label suffix such as "AUTHOR". It is only used when `element`
      starts with an XML tag; otherwise the fragment is labelled as a whole.

  Returns:
    A list of (token, tag) tuples. For a labelled XML fragment the tags are
    stripped, the text is split into words and punctuation, punctuation gets
    'B-PUNC', the first word gets 'B-<label>' and later words 'I-<label>'.
    For anything else a single pair is returned: 'B-PUNC' when the fragment
    starts with a punctuation character, 'O' otherwise.
  """
  tagPattern = r"<[^>]+>"
  puncPattern = re.compile(r"[\/ ,.:;()\"'?!+@#{\[}\]\\$^&*-]")

  # Not a labelled XML fragment: the whole element is one token.
  if not (label and re.match(tagPattern, element)):
    wholeTag = 'B-PUNC' if puncPattern.match(element) else 'O'
    return [(element, wholeTag)]

  text = re.sub(tagPattern, "", element)  # drop every XML tag, keep the content
  if label == 'PAGE':
    # HOTFIX: pages keep only digits and hyphens ("pp. 12-34," -> "12-34").
    text = re.sub(r"[^\d-]+", "", text)

  resultList = []
  prefix = 'B-'  # switches to 'I-' once the first non-punctuation token is seen
  for item in re.findall(r"\w+|[^\w\s]", text):
    # Match against the compiled character class instead of testing for a
    # substring of the regex source text.
    if puncPattern.fullmatch(item):
      resultList.append((item, 'B-PUNC'))
    else:
      resultList.append((item, prefix + label))
      prefix = 'I-'
  return resultList

# Utility method : Remove ASCII and unicodes
def strip_non_ascii(string):
  """Return `string` keeping only characters whose code point is between 1 and 126."""
  kept = []
  for ch in string:
    code = ord(ch)
    # NUL (0), DEL (127) and everything above are discarded.
    if code > 0 and code < 127:
      kept.append(ch)
  return ''.join(kept)

'''
  Input:
  citation string -> XML-tagged citation string from the GIANT dataset
'''
# Elements with these tags are discarded (content included) before tokenizing.
_DROPPED_TAGS = ("editor", "institute", "institution", "tech", "location", "note", "URL", "DOI")

# (keywords, label) pairs tested in order against the tag name. "booktitle"
# has to come before "title" because it contains it.
_TAG_LABELS = (
  (("author",), "AUTHOR"),
  (("booktitle", "journal"), "CT"),
  (("title",), "TITLE"),
  (("date",), "DATE"),
  (("volume",), "VOL"),
  (("pages",), "PAGE"),
  (("publisher",), "PUBLISHER"),
)

_CITATION_PUNC = r"[\/ ,.:;()\"'?!+@#{\[}\]\\$^&*-]"

# One piece of a citation, tried in this order:
#   1. a complete element  <tag>...</tag>  (shortest match, same tag text),
#   2. stray tag markup with no matching partner (captured so it can be skipped),
#   3. a run of word characters,
#   4. a single punctuation character.
# Characters matching none of these are skipped by finditer.
_CITATION_PIECE = re.compile(
  r"<(?P<tag>[^>]+)>.+?</(?P=tag)>"
  r"|(?P<markup><[^>]*>)"
  r"|\w+"
  r"|" + _CITATION_PUNC
)


def _labelForTag(tagName):
  """Return the label for an XML tag name, or None when the tag has no label."""
  for keywords, label in _TAG_LABELS:
    if any(keyword in tagName for keyword in keywords):
      return label
  return None


def tokenizeCitation(citationString):
  """Turn one XML-tagged citation string into parallel token and tag lists.

  Processing steps:
    1. Replace the mis-encoded dash sequence with '-' and drop every
       character that is not plain ASCII.
    2. Remove whole elements whose tag is listed in _DROPPED_TAGS
       (editor, institute, institution, tech, location, note, URL, DOI).
    3. Walk the remaining text left to right, splitting it into complete
       XML elements, words and punctuation characters.
    4. Hand each piece to getTagLabelPairs. Elements get a label derived
       from their tag name only (author -> AUTHOR, booktitle/journal -> CT,
       title -> TITLE, date -> DATE, volume -> VOL, pages -> PAGE,
       publisher -> PUBLISHER); element content never influences the label.
       Elements with any other tag, and all text outside elements, are
       passed without a label.
    5. Drop space and empty tokens.

  Element matching is non-greedy, so two elements with the same tag are kept
  apart instead of being merged together with everything between them.
  Tag markup without a matching partner is skipped; text next to it is
  tokenized normally.

  Args:
    citationString: The annotated citation, e.g.
      "<author> A. Cau. </author> <title> Some title. </title>".

  Returns:
    A (tokens, tags) tuple of two equally long lists.
  """
  # Fix the GIANT training-data dash artefact, then remove non-ASCII characters.
  citationString = strip_non_ascii(citationString.replace('‚Äì','-').encode("ascii", "ignore").decode())

  for name in _DROPPED_TAGS:
    citationString = re.sub("<" + name + ">.+?</" + name + ">", "", citationString)

  tokens = []
  tags = []
  for piece in _CITATION_PIECE.finditer(citationString):
    if piece.group('markup') is not None:
      continue  # unmatched opening/closing tag: nothing to emit

    tagName = piece.group('tag')  # None for words and punctuation
    label = _labelForTag(tagName) if tagName is not None else None

    for token, tag in getTagLabelPairs(piece.group(), label):
      if token != ' ' and token != '':
        tokens.append(token)
        tags.append(tag)

  return (tokens, tags)

# ----- Tokenizer Ends ----- #    

# Get processed Gold Dataset
def getDataTuples(dataset):
    """Tokenize every annotated citation string in `dataset`.

    Args:
        dataset: Iterable of annotated citation strings. Each one is stripped
            of surrounding whitespace and passed to tokenizeCitation.

    Returns:
        A list with one entry per citation; each entry is a list of
        (token, tag) tuples. Tokens equal to a single space are left out.
        The complete result is also printed.
    """
    datatuple = []
    for data in dataset:
        tokens, tags = tokenizeCitation(data.strip())
        # Plain lists replace the former DataFrame round trip, which relied on
        # DataFrame.append (no longer available in pandas 2.x).
        datatuple.append(
            [(token, tag) for token, tag in zip(tokens, tags) if token != ' ']
        )
    print(datatuple)
    return datatuple

# Read the tagged references once, then draw ten random 50-line samples and
# append each sample, in token<TAB>tag format, to its own file.
with open ("muddi/citationParser/CORA/tagged_references.txt", "r") as myfile:
    data = myfile.read().splitlines()
    print(type(data))
    for index in range(10):
        data_subset = random.sample(data, 50)
        datatuple = getDataTuples(data_subset)

        filename = 'cora_test_sample_'+str(index+1)+'.txt'
        filepath = 'muddi/citationParser/CORA/'+filename
        with open(filepath, 'a+') as fileValid:
            for sentence in datatuple:
                # One line per token, then a blank line to close the citation.
                fileValid.writelines(
                    "{}\t{}\n".format(token, tag) for token, tag in sentence
                )
                fileValid.write("\n")

<class 'list'>
[[('Leroy', 'B-AUTHOR'), (',', 'B-PUNC'), ('X', 'I-AUTHOR'), ('.', 'B-PUNC'), ('Typage', 'B-TITLE'), ('polymorphe', 'I-TITLE'), ('d', 'I-TITLE'), ("'", 'B-PUNC'), ('un', 'I-TITLE'), ('langage', 'I-TITLE'), ('algorithmique', 'I-TITLE'), ('.', 'B-PUNC'), ('7', 'B-DATE'), (',', 'B-PUNC'), ('1992', 'I-DATE'), ('.', 'B-PUNC')], [('Arthur', 'B-AUTHOR'), ('M', 'I-AUTHOR'), ('.', 'B-PUNC'), ('Keller', 'I-AUTHOR'), ('and', 'I-AUTHOR'), ('Julie', 'I-AUTHOR'), ('Basu', 'I-AUTHOR'), ('.', 'B-PUNC'), ('A', 'B-TITLE'), ('predicate', 'I-TITLE'), ('-', 'B-PUNC'), ('based', 'I-TITLE'), ('caching', 'I-TITLE'), ('scheme', 'I-TITLE'), ('for', 'I-TITLE'), ('client', 'I-TITLE'), ('-', 'B-PUNC'), ('server', 'I-TITLE'), ('database', 'I-TITLE'), ('architectures', 'I-TITLE'), ('.', 'B-PUNC'), ('In', 'B-CT'), ('Proceedings', 'I-CT'), ('of', 'I-CT'), ('PDIS', 'I-CT'), ('-', 'B-PUNC'), ('94', 'I-CT'), (',', 'B-PUNC'), ('1994', 'B-DATE'), ('.', 'B-PUNC')], [('Bylander', 'B-AUTHOR'), (',', 'B-PUNC'), 

In [7]:
# Reload the tagged references and show the first line as a sanity check.
with open ("muddi/citationParser/CORA/tagged_references.txt", "r") as myfile:
    raw = myfile.read()
data = raw.splitlines()

print(data[0])

<author> A. Cau, R. Kuiper, and W.-P. de Roever. </author> <title> Formalising Dijkstra's development strategy within Stark's formalism. </title> <editor> In C. B. Jones, R. C. Shaw, and T. Denvir, editors, </editor> <booktitle> Proc. 5th. BCS-FACS Refinement Workshop, </booktitle> <date> 1992. </date>
