# SLING Parser

## IDEA:
SLING is a surprisingly robust frame semantics parser, able to extract frames from complicated sentences with many layers of nested predicates.

However, the internal organization of SLING-related objects remains somewhat unclear, because relatively little documentation is available.

Worse yet, the author has moved on from the "SEMPAR" model which we used for this project, in favor of another model, the "CASPAR" model.

In the implementation of the "CASPAR" model, the author decided to drop support for PropBank framesets, which are critical to our project.

With the code below, much of which we wrote, we were able to adapt the output of SLING to a format suitable for our project.

In [None]:
##### IMPORT STATEMENTS #####

from collections import OrderedDict
from collections import defaultdict
import sling
import re
import json

##### SLING PARSER #####

# We will use the older SEMPAR model.
# This module-level parser is built once from the "sempar.flow" file (a relative
# path) and is the object text_to_frames() calls to parse each caption.
parser = sling.Parser("sempar.flow")


##### HELPER FUNCTIONS #####

# We will also use code found online, that converts a string with appropriate indentations to a nested list
# Reference: https://stackoverflow.com/questions/31551395/tab-formatted-nested-string-to-nested-list-python

#==== START OF REFERENCED MATERIAL ====#

def parse(data):
    """Convert an indentation-formatted string into a nested list.

    Each line's indentation is the number of leading whitespace characters.
    The line text is stripped, and trailing spaces/colons are removed.

    * A line at the same indentation as the previous line is appended to the
      current list.
    * A line indented deeper than the previous line starts a new list nested
      inside the current list.
    * A line indented shallower than the previous line starts a new list
      attached to the closest still-open list with a smaller indentation
      (or to the top-level result when there is none).

    Args:
        data: The text to convert; lines are separated with any line ending
            recognised by ``str.splitlines``.

    Returns:
        A list of nested lists of strings. An empty input yields ``[[]]``.
    """
    root = []
    result = [root]

    # Stack of (indentation, list) pairs for the lists that are still "open".
    # The top of the stack is always the list the previous line was put in.
    open_levels = [(0, root)]
    previous_indent = 0

    for raw_line in data.splitlines():
        indent = len(raw_line) - len(raw_line.lstrip())
        text = raw_line.strip().rstrip(' :')

        if indent == previous_indent:
            open_levels[-1][1].append(text)

        elif indent > previous_indent:
            nested = [text]
            open_levels[-1][1].append(nested)
            open_levels.append((indent, nested))

        else:
            # Dedent: close every list at this indentation or deeper, then
            # start a new group under whatever remains open. This handles
            # every dedent depth, so no line is ever dropped.
            while open_levels and open_levels[-1][0] >= indent:
                open_levels.pop()
            parent = open_levels[-1][1] if open_levels else result
            group = [text]
            parent.append(group)
            open_levels.append((indent, group))

        previous_indent = indent

    return result
  
#==== END OF REFERENCED MATERIAL ====#

# This is another helper function for parsing verbs in a list
# to a nested dictionary structure that we will use

def parseverb(li):
    """Turn one nested predicate list into a nested dictionary.

    The first element of ``li`` is stored under ``"__verb__"``. Every later
    string element must consist of exactly two space-separated parts and is
    stored as ``{first_part: second_part}``. Every later list element is
    converted recursively and stored under ``"other verbs"`` (a later nested
    list replaces an earlier one). Elements of any other type are ignored.

    Args:
        li: A list as produced by ``parse`` for a single predicate.

    Returns:
        The dictionary described above; empty when ``li`` is empty.

    Raises:
        ValueError: If a string element does not split into exactly two
            parts on a single space.
    """
    parsed = {}

    for position, entry in enumerate(li):
        if position == 0:
            parsed["__verb__"] = entry
            continue

        if isinstance(entry, str):
            role, filler = entry.split(" ")
            parsed[role] = filler
        elif isinstance(entry, list):
            parsed["other verbs"] = parseverb(entry)

    return parsed

##### TEXT_TO_FRAMES FUNCTION #####

# This function, using SLING, takes caption text as an input,
# and outputs frame data in a format convenient for our project

def text_to_frames(text):
    """Parse caption text with SLING and return its predicate frames as JSON.

    Steps performed:

    1. ``text`` is parsed with the module-level ``parser`` and the document
       frame is rendered with ``data(pretty=True)``.
    2. The part of that string following the first ``"]"`` is split into its
       top-level ``{...}`` chunks.
    3. Reference numbers of evoked frames are replaced by the caption word
       found at the phrase's ``begin`` index (``text`` is split on single
       spaces to look words up).
    4. Chunks containing ``"/pb/arg"`` are kept, SLING markup is stripped,
       the remaining indentation is converted to tabs, and each chunk goes
       through ``parse`` and ``parseverb``.
    5. Every resulting dictionary is serialised with
       ``json.dumps(..., indent=4, sort_keys=True)``.

    NOTE(review): the regexes and fixed slice offsets below assume the exact
    layout of SLING's pretty-printed output for the SEMPAR model -- confirm
    before using another model or SLING version.

    Args:
        text: The caption to analyse.

    Returns:
        A list of JSON strings, one per predicate list produced by ``parse``.

    Raises:
        ValueError, IndexError: Possible when the SLING output does not match
            the layout the string processing expects (integer conversion and
            list indexing are not guarded).
    """
    doc = parser.parse(text)
    mystr = doc.frame.data(pretty=True)

    # Only the text after the first "]" is processed; the "]" and the
    # character following it are skipped.
    secondhalf = mystr[mystr.find("]") + 2:]

    ### Curly braces matching ###
    # Collect every top-level "{...}" span, tracking nesting depth so that
    # inner braces stay inside their enclosing chunk.
    chunks = []
    depth = 0
    startindex = 0

    for i, c in enumerate(secondhalf):
        if c == "{":
            if depth == 0:
                startindex = i
            depth += 1
        elif c == "}" and depth > 0:
            depth -= 1
            if depth == 0:
                chunks.append(secondhalf[startindex:i + 1])

    text_by_index = text.split(" ")

    # Use regex to format SLING's output
    numbers1 = re.findall("{=#[0-9]* \n    :/s/phrase\n    /s/phrase/begin: [0-9]*", secondhalf)
    numbers2 = re.findall("/s/phrase/evokes: [{]?[=]?#[0-9]*", secondhalf)

    # The last three characters of each phrase match hold the begin index;
    # single-digit indexes are padded so the slice never includes the ":".
    begins = []
    for match in numbers1:
        if match[-3] == ":":
            match = match[:-2] + " " + match[-2:]
        begins.append(match[-3:])

    # Everything after the first "#" is the reference number of the frame.
    refs = [match[match.index("#") + 1:] for match in numbers2]

    # Map each evoked-frame reference number to the caption word at the
    # phrase's begin index.
    mapping = {}
    for i, begin in enumerate(begins):
        mapping[refs[i]] = text_by_index[int(begin)]

    # Substitute reference numbers at line ends with the mapped words.
    for num, word in mapping.items():
        chunks = [
            chunk.replace(num + " \n", word + " \n").replace(num + "\n", word + "\n")
            for chunk in chunks
        ]

    # Lines consisting solely of indentation markers (or nothing) are dropped.
    blank_lines = {"==>" * n for n in range(21)}

    jsonli = []

    for pred in chunks:
        # Only chunks carrying PropBank arguments are of interest.
        if "/pb/arg" not in pred:
            continue

        # More string formatting (the order of these removals matters).
        for marker in ("/s/phrase", "/pb/", "{", "}", ":", "=#", "#"):
            pred = pred.replace(marker, "")

        pred = re.sub("[ ]*[0-9]*[ ]*\n[ ]*\n[ ]*/begin[ ]*[0-9]*\n[ ]*/evokes[ ]*[a-z]*[ ]*", "", pred)

        # Mark each pair of spaces as one indentation step, then drop the
        # first nine characters of every line.
        lines = [line.replace("  ", "==>")[9:] for line in pred.split("\n")]

        lines = [
            line for line in lines
            if line not in blank_lines
            and "/saft/" not in line
            and "/s/" not in line
        ]

        # parse() measures indentation in leading whitespace, so turn the
        # markers into tabs before handing the text over.
        longstr = "\n".join(line.replace("==>", "\t") for line in lines)

        for p in parse(longstr):
            jsonli.append(json.dumps(parseverb(p), indent=4, sort_keys=True))

    return jsonli  # Output is a list of JSON strings, one per predicate list

In [None]:
# This function allows for the conversion of the JSON objects to tuples
# that can be easily used further down in the pipeline
def json_to_tuples(js):
    """Extract ``(word, argument, verb)`` tuples from a frame JSON string.

    The string is scanned line by line rather than decoded as JSON:

    * a line containing ``__verb__`` updates the current verb to the text
      between the last pair of double quotes on that line;
    * any other line containing ``"arg`` yields a tuple made of the text
      between its last pair of double quotes, the text between its first
      pair of double quotes, and the current verb.

    Args:
        js: A JSON string such as those returned by ``text_to_frames``.

    Returns:
        A list of ``(word, argument, verb)`` tuples in line order. The verb
        is the empty string until a ``__verb__`` line has been seen.
    """
    triples = []
    current_verb = ""

    for row in js.split("\n"):
        if '__verb__' in row:
            current_verb = row.split('"')[-2]
            continue

        if '"arg' in row:
            pieces = row.split('"')
            triples.append((pieces[-2], pieces[1], current_verb))

    return triples

In [None]:
# Demonstration of text_to_frames(): parse one sample caption and, for each
# JSON string returned, print the string followed by its tuple form.

jsonli = text_to_frames("a woman closing a text that a laptop")
for js in jsonli:
    print(js, json_to_tuples(js), sep="\n")
    

{
    "__verb__": "close-01", 
    "arg0": "woman", 
    "arg1": "text"
}
[('woman', 'arg0', 'close-01'), ('text', 'arg1', 'close-01')]


In [None]:
# Code for running the defined text_to_frames() function on the output from
# the previous segment of the pipeline, and writing the output of our function
# to a file so that it can be used further down the pipeline

capli = []

# Read the captions; the context manager guarantees the input file is closed
# (previously the handle was left open and its name was later rebound).
with open("capOutput.txt", "r") as f:
    for line in f:
        # Strip punctuation and the end-of-caption marker.
        line = line.replace(",", "")
        line = line.replace("'", "")
        line = line.replace(". <end>\r\n", "")
        line = line.replace(". <end> \r\n", "")
        line = line.replace(". <end>", "")

        # Keep what follows the first ":" plus one character. A line without
        # such a separator raises IndexError here.
        line = re.split(':.', line)
        capli.append(line[1])

# For every caption, one list of tuples per JSON string returned.
framesli = []

for cap in capli:
    jsonli = text_to_frames(cap)
    frames = [json_to_tuples(js) for js in jsonli]
    framesli.append(frames)

# Caption index -> {word: [(argument, verb), ...]}
overalldict = {}

for i, frames in enumerate(framesli):
    idict = defaultdict(list)

    for subframe in frames:
        for word, arg, verb in subframe:
            idict[word].append((arg, verb))

    overalldict[i] = dict(idict)

with open('output.json', 'w') as f:
    json.dump(overalldict, f)
            
    