# Combined Framework for Symlink

In [3]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# enter the foldername in your Drive where you have saved the unzipped
# workshop folder, e.g. 'acmlab/workshops/project'
FOLDERNAME = 'primary_description'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd /content/drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1Ip0tGIOuwJNvhITLBA4pbOcMNg9Lv-mq/primary_description


## Import Libraries

In [4]:
import numpy as np
import os
from sklearn.metrics import f1_score
import pandas as pd
from textblob import TextBlob
import nltk
import re
import difflib
import json

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

from IPython.display import display

# import stanza
# stanza.install_corenlp()
# import spacy
# nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


### Read in Data

In [5]:
data = pd.read_json("Practice/cs.ai-ann0.json")
# text_data = data.loc["text"]
# text_data.to_dict()

Functions for extracting symbols and primary descriptions

In [6]:
def extract_primary_description(entity): #Entity is a dicitionary.
  output = {}
  for key in entity.keys():
    if entity[key]['label'] == "PRIMARY":
      output[key] = entity[key]
  return output


In [7]:
def extract_symbol(entity): #Entity is a dicitionary.
  output = {}
  for key in entity.keys():
    if entity[key]['label'] == "SYMBOL":
      output[key] = entity[key]
  return output


In [8]:
lemmatizer = nltk.WordNetLemmatizer()
def setup(text):

  #word tokenizeing and part-of-speech tagger
  document = text
  tokens = [nltk.word_tokenize(sent) for sent in [document]]
  postag = [nltk.pos_tag(sent) for sent in tokens][0]

  # Rule for NP chunk and VB Chunk
  grammar = r"""
      NBAR:
          {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
          {<RB.?>*<VB.?>*<JJ>*<VB.?>+<VB>?} # Verbs and Verb Phrases
          
      NP:
          {<NBAR>}
          {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
          
  """
  #Chunking
  cp = nltk.RegexpParser(grammar)

  # the result is a tree
  tree = cp.parse(postag)
  return tree

def leaves(tree):
    """Finds NP (nounphrase) leaf nodes of a chunk tree."""
    for subtree in tree.subtrees(filter = lambda t: t.label() =='NP'):
        yield subtree.leaves()
        
def get_word_postag(word):
    if pos_tag([word])[0][1].startswith('J'):
        return wordnet.ADJ
    if pos_tag([word])[0][1].startswith('V'):
        return wordnet.VERB
    if pos_tag([word])[0][1].startswith('N'):
        return wordnet.NOUN
    else:
        return wordnet.NOUN
    
def normalise(word):
    """Normalises words to lowercase and stems and lemmatizes it."""
    # word = word.lower()
    # postag = get_word_postag(word)
    # word = lemmatizer.lemmatize(word, postag)
    return word

def get_terms(tree):    
    for leaf in leaves(tree):
        terms = [normalise(w) for w,t in leaf]
        yield terms

def getNounPhrases(text):
  wordsToRemove = ['be', 'is', 'are', 'was', 'were', 'been', 'being']
  tree = setup(text)

  terms = get_terms(tree)

  features = []
  for term in terms:
      _term = ''
      for word in term:
        _term += ' ' + word

      if not any(x in _term.split() for x in wordsToRemove) and '\\' not in _term:
        features.append(_term.strip())

  res = []
  res += re.findall(r"(?<=\$\sbe\s)(.*?)(?=\.|\,|\;|\:|\!|\?|and)", text)
  res += re.findall(r"(?<=\$\srepresent\s)(.*?)(?=\.|\,|\;|\:|\!|\?|and)", text)
  res += re.findall(r"(?<=\$\srepresents\s)(.*?)(?=\.|\,|\;|\:|\!|\?|and)", text)
  res += re.findall(r"(?<=\$\sdenote\s)(.*?)(?=\.|\,|\;|\:|\!|\?|and)", text)
  res += re.findall(r"(?<=\$\sdenotes\s)(.*?)(?=\.|\,|\;|\:|\!|\?|and)", text)
  res += re.findall(r"(?<=\$\sis\s)(.*?)(?=\.|\,|\;|\:|\!|\?|\\|and)", text)
  res += re.findall(r"(?<=\sof\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\|and)", text)
  result1 = re.findall(r"(?<=\sthe\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\|and)", text)
  for i in range(len(result1)):
    result1[i] = 'the ' + result1[i]
  res += result1

  result2 = re.findall(r"(?<=\sThe\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\)", text)
  for i in range(len(result2)):
    result2[i] = 'The ' + result2[i]
  res += result2

  result3 = re.findall(r"(?<=\sa\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\)", text)
  for i in range(len(result3)):
    result3[i] = 'a ' + result3[i]
  res += result3


  for phrase in res:
    features.append(phrase.strip())
      
  return features

### Kapil's Code for extracting Symbols

In [25]:
def getSymbols(string): # given a string, returns a dictionary of each symbol, its start index, and its end index
  temp = {}
  # substrings surrounded by delimiter pairs are math text
  delimiters = {'\\(': '\\)', '$': '$', '$$': '$$', '\\begin{math}': '\\end{math}', '\\[': '\\]', '\\begin{displaymath}': '\\end{displaymath}', '\\begin{equation}': '\\end{equation}', '\\begin{align}': '\\end{align}', '\\begin{eqnarray}': '\\end{eqnarray}', '\\begin{array}': '\\end{array}', '(': ')'}
  # substrings to the left and right of a splitter are considered separate symbols
  splitters = ['=', '<', '>', '<=', '=>', '\\leq', '\\geq', '\\leftarrow', '\\rightarrow', '\\longleftarrow', '\\longrightarrow', '\\Leftarrow', '\\Rightarrow', '\\Longleftarrow', '\\Longrightarrow', '\\leftrightarrow', '\\Longleftrightarrow', '\\mapsto', '\\longmapsto', '\\neq', '\equiv']

  for start, end in delimiters.items(): # grab each pair of delimiters
    new = string # scan the complete string for each pair of delimiters
    while True:   
      a = new.find(start)
      a_shift = a + len(start) # first delimiter
      b = new.find(end, a_shift) # second delimiter
      if a == -1 or b == -1: # if pair of delimiters are not present, go to the next pair of delimiters
        break
      else: # if pair of delimiters are present, add the symbols and locations to the dict

        if new[a_shift] == '_': # check for subscripts
          a_shift -= 3
          while a_shift != ' ':
            a_shift -= 1
          a_shift += 1

        

        result = new[a_shift:b]
        if (start != '(' or (result.isupper() and len(result.split())) <= 1) and result.strip() != '': # the (parentheses) delimiter pair only surround abbreviations, i.e. single full-caps words
          temp[result] = (a_shift, b)
        new = new[b + len(end):] # search for the same pair of delimiters located AFTER the pair we have just identified
  
  symbols = dict(temp) # make two copies -- one to iterate over and one to store the split symbols

  for symbol, location in temp.items(): # recognize symbols separately in equalities/inequalities/implications
    (start, end) = location
    for splitter in splitters:
      if splitter in symbol:
        split = [sym.strip() for sym in symbol.split(splitter)]
        symbols[split[0]] = (start, start + len(split[0]))
        symbols[split[-1]] = (end - len(split[1]), end)
        try: # delete the unsplit symbol
          del symbols[symbol] 
        except: # sometimes the unsplit symbol will have already been deleted from temp, because it contained more than one splitter
          pass 

  entity = {}
  num = 1
  for symbol in symbols.keys():
    eid = 'T' + str(num)
    num += 1
    entity[eid] = {}
    entity[eid]['eid'] = eid
    entity[eid]['label'] = 'SYMBOL'
    entity[eid]['start'] = symbols[symbol][0]
    entity[eid]['end'] = symbols[symbol][1]
    entity[eid]['text'] = symbol

  return entity

## Calculate recall for descriptors

In [26]:
def calculate_recall_descriptors(noun_list, expected):
  total = 0
  count = 0
  primary_descriptor_list = []
  for key in expected.keys():
    primary_descriptor_list.append(expected[key]["text"])
  total = len(primary_descriptor_list)  
  for descriptor in primary_descriptor_list:
    if descriptor in noun_list:  
      count += 1
      noun_list.remove(descriptor)
  return count / total
  

## Test word rules

In [27]:
text = "Let $x$ be the number of bins. Let $\\alpha + \\beta$ be the sum of two parameters, which are then used for analysis."
# text = "Let $x$ be the number of bins. Let $\alpha + \beta$ be the sum of two parameters."
# res = re.findall(r"(?<=\$\sbe\s)(.*?)(?=\.)", text)
# text = "Let $x$ be the number of bins :"
res = re.findall(r"(?<=\$\sbe\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", text)
print(res)

['the number of bins', 'the sum of two parameters']


In [28]:
text = "Let $x$ represent the number of bins, where $\alpha + \beta$ represents the sum of two parameters, which are then used for analysis."
# text = "Let $x$ be the number of bins. Let $\alpha + \beta$ be the sum of two parameters."
# res = re.findall(r"(?<=\$\sbe\s)(.*?)(?=\.)", text)
# text = "Let $x$ be the number of bins :"
res = re.findall(r"(?<=\$\srepresent\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", text)
print(res)

['the number of bins']


In [29]:
text = data.loc["text"][1]
print(text)
expected = extract_primary_description(data.loc["entity"][1])
print("Noun phrases", getNounPhrases(text))
print("Expected", expected)
calculate_recall_descriptors(getNounPhrases(text), expected)

We fit the parameters according to the procedure described in \cite{EkanadhamKarklin15} .
Estimating the entire trajectory $\thetastraj$ for each student simultaneously with item parameters is very expensive and difficult to do in real - time .
To simplify the approach , we learn parameters in two stages : \begin{enumerate}  \item We learn the            according to a standard 1PO IRT model (see Section~\ref{sec:irtlearning} ) on the training student population and freeze these during validation .
           .           
For the second step , we combine the approximation :            & P ( \{(s', i, r, t') \in D: s'=s, t'\leq t\}|\theta_{s,t} ) \approx \nonumber \\ &\prod_{(s',i,r,t') \in D: s'=s, t'\leq t} P ( ( s' , i , r , t ' ) | \theta_{s,t} ) \end{align} with \eqref{eq:wiener} , integrating out previous proficiencies of the student to get a tractable approximation of the log posterior over the student 's current proficiency given previous responses :
\begin{align}  \log P(\theta

1.0

In [None]:
# print(getNounPhrases("Mostly, there just is no default way of determining the paragraph boundary and people tend to work with sentences. Still, the unit of a paragraph might be of a higher value than that of a sentence. Examples might be: coreference resolutions that overlap multiple sentences. Questions that find their answer throughout a whole paragraph. A reader that understands a paragraph better than an isolated sentence. It’s clear that the signal from a writer is best expressed in a paragraph."))

In [None]:
# df = data
# for column in df:
#     wantedOutput = extract_primary_description(df[column]["entity"])
#     paragraph = df[column]["text"]
#     print("PARAGRAPH:\n", paragraph)
#     print("NOUN PHRASES:\n", getNounPhrases(paragraph))
#     print("EXPECTED:\n", wantedOutput)
#     print("*" * 280)
#     print("")

In [30]:
def findNounsWithLocs(text):
  '''This function takes in a block of text, finds the nouns in it and then returns an array of 1s and 0s representing where those nouns are'''
  originalText = text
  #modify text here however we please in getNounPhrases
  nounList = getNounPhrases(text)
  start = 0
  predicted_array = np.zeros(len(originalText))

  for word in nounList:
    nounStartLoc = originalText.find(word, start)
    nounEndLoc = nounStartLoc + len(word)

    if abs(originalText.find('$', start) - nounStartLoc) < 25:
      predicted_array[nounStartLoc : nounEndLoc] = 1

    start = nounEndLoc

  return predicted_array

In [31]:
print(findNounsWithLocs("The bus is yellow."))

[0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
# for column in data:
#   paragraph = df[column]["text"]
#   print("PARAGRAPH:\n", paragraph)
#   print("NOUN PHRASES:\n", getNounPhrases(paragraph))
#   print("NOUN LOCATIONS:\n", findNounsWithLocs(paragraph))
#   print('*' * 280 + '\n')

In [32]:
def create_arrays(expected, predicted, len_text):
  expected_array = np.zeros(len_text)  
  predicted_array = np.zeros(len_text)
  for key in expected.keys():
    start_index = expected[key]["start"]
    end_index = expected[key]["end"]
    expected_array[start_index : end_index + 1] = 1

  
  # for key in predicted.keys():
  #   start_index = predicted[key]["start"]
  #   end_index = predicted[key]["end"]
  #   predicted_array[start_index : end_index + 1] = 1

  return (expected_array, predicted_array)

In [33]:
def compare(expected, predicted): #Computes the F1 score between expected and predicted
  expected_array = np.array(len())

In [34]:
def framework(df):
  results = []
  recalls = []
  for column in df:
    paragraph = df[column]["text"]
    expected = df[column]["entity"]
    primary_description = extract_primary_description(expected)
    predicted = paragraph
    if len(primary_description) != 0:
      recalls.append(calculate_recall_descriptors(getNounPhrases(predicted), primary_description))
    # expected_array, predicted_array = create_arrays(primary_description, predicted, len(paragraph))
    expected_array, predicted_array = create_arrays(primary_description, predicted, len(paragraph))
    predicted_array = findNounsWithLocs(paragraph)
    result = f1_score(expected_array, predicted_array, average= "binary", zero_division = 1)
    results.append(result)
  print("Average F1 score is:", sum(results) / len(results))
  print("Average recall: ", sum(recalls) / len(recalls))


In [37]:
def framework_recall(filename):

  with open(filename) as json_file: 
    test_json = json.load(json_file)
    
    test_number = 1
    num_right_total = 0
    total = 0
    wrong = []
    for key in test_json.values():
      practice = {}
      for val in key['entity'].keys():
        entity = key['entity'][val]
        if entity['label'] == 'SYMBOL':
          practice[entity['text'].replace('\\', '').replace('$', '').strip()] = (entity['start'], entity['end'])
      if practice == {}:
        continue
      symbols = getSymbols(key['text'])
      symbols_list = []
      for val in symbols.values():
        symbols_list.append(val['text'])
      num_right = len([symbol for symbol in symbols_list if symbol.replace('\\', '').replace('$', '').strip() in practice])
      percent_right = min(1, num_right / len(practice.keys()))
      # wrong += [practice_missed for practice_missed in practice if practice_missed not in [symbol.replace('\\', '').replace('$', '').strip() for symbol in symbols]]
      
      num_right_total += num_right
      total += len(practice.keys())

      # print(test_number)
      # print('Percent right: ' + str(percent_right * 100))
      test_number += 1

    # print('Total symbols tested: ' + str(total))
    # print('Total symbols correct: ' + str(num_right_total))
    print('Average symbol recall: ' + str(num_right_total / total)) 
    # print('\nWhat you did not identify:')
    # print(wrong)

  results = []
  recalls = []
  df = pd.read_json(filename)
  for column in df:
    paragraph = df[column]["text"]
    expected = df[column]["entity"]
    primary_description = extract_primary_description(expected)
    if len(primary_description) != 0:
      recalls.append(calculate_recall_descriptors(getNounPhrases(paragraph), primary_description))
  print("Average descriptor recall: ", sum(recalls) / len(recalls))

In [None]:
#"The current F1 score of 0.355 arises when we leave the predicted_array all full of zeroes and compare with the expected."
# 0.261 is our current best implementing NLP techniques w/ <25
# framework(data)

In [36]:
dataFiles = []
pwd = '/content/drive/My Drive/primary_description/Practice'
for filename in os.listdir(pwd):
    if filename.endswith("json"): 
        dataFiles.append(pwd + '/' + filename)
for filename in dataFiles:
  print(filename.replace('/content/drive/My Drive/primary_description/Practice', ""))
  framework_recall(filename)
  print()

/physics.atom_ph-ann10.json
Average symbol recall: 0.6078291814946619
Average descriptor recall:  0.5662660355175455

/q_bio.qm-ann11.json
Average symbol recall: 0.6092959042797975
Average descriptor recall:  0.5548967998040463

/cs.ai-ann0.json
Average symbol recall: 0.8248275862068966
Average descriptor recall:  0.5866658568189179

/cs.ai-ann3.json
Average symbol recall: 0.5426008968609866
Average descriptor recall:  0.6252281557784404

/econ.th-ann6.json
Average symbol recall: 0.8324324324324325
Average descriptor recall:  0.4732876712328767

/physics.atom_ph-ann8.json
Average symbol recall: 0.49296567248171075
Average descriptor recall:  0.5832093253968254

/econ.th-ann5.json
Average symbol recall: 0.8915254237288136
Average descriptor recall:  0.5506547619047619

/physics.atom_ph-ann9.json
Average symbol recall: 0.562708102108768
Average descriptor recall:  0.5757469862422696

/math.co-ann7.json
Average symbol recall: 0.8506024096385543
Average descriptor recall:  0.53074400123216

This is for F1 score

In [None]:
# dataFiles = []
# pwd = '/content/drive/My Drive/primary_description/Practice'
# for filename in os.listdir(pwd):
#     if filename.endswith("json"): 
#         dataFiles.append(pwd + '/' + filename)
# for filename in dataFiles:
#   print(filename.replace('/content/drive/My Drive/primary_description/Practice/', ""))
#   framework(pd.read_json(filename))
#   print()

In [None]:
def get_descriptions_paragraph(paragraph):
    sentences = nltk.tokenize.sent_tokenize(paragraph)
    answer = {}
    cnt = 0
    for sent in sentences:
      features = getNounPhrases(sent)
      for feature in features:  ##TODO: Consider case where a word occurs multiple times in a sentence##
        if sent.find(feature) > -1:
          start = cnt + sent.find(feature)
          end = start + len(feature)
          answer[feature] = (start, end)
      cnt += len(sent)
    return {k: v for k, v in sorted(answer.items(), key=lambda item: item[1])}


In [None]:
def descriptor_to_output(descriptors):
    entity = {}
    num = 1
    for desc in descriptors.keys():
      eid = 'T' + str(num)
      num += 1
      entity[eid] = {}
      entity[eid]['eid'] = eid
      entity[eid]['label'] = 'PRIMARY'
      entity[eid]['start'] = descriptors[desc][0]
      entity[eid]['end'] = descriptors[desc][1]
      entity[eid]['text'] = desc

      return entity

In [None]:
def merge(dict1, dict2):
    res = list(dict1.values()) + list(dict2.values())
    res = sorted(res, key=lambda d: d['start'])
    #correct still this point
    entityRes = {}
    num = 1
    for d in res:
      eid = 'T' + str(num)
      num += 1
      entityRes[eid] = d
      entityRes[eid]['eid'] = eid
    return entityRes

# def merge(dict1, dict2):
#     res = {}
#     dict1_key_list = list(dict1.keys())
#     dict2_key_list = list(dict2.keys())
#     while len(dict1) > 0 or len(dict2) > 0:
#         if len(dict2) == 0:
#             res[first_descriptor] = dict1[first_descriptor]
#             del dict1[first_descriptor]
#             dict1_key_list.pop(0)
#         elif len(dict1) == 0:
#             res[first_symbol] = dict2[first_descriptor]
#             del dict2[first_symbol]
#             dict2_key_list.pop(0)
#         else:
#           first_descriptor = dict1_key_list[0]
#           first_symbol = dict2_key_list[0]
#           descriptor_start = dict1[first_descriptor]['start']
#           symbol_start = dict2[first_symbol]['start']
#           if descriptor_start < symbol_start:
#               res[first_descriptor] = dict1[first_descriptor]
#               del dict1[first_descriptor]
#               dict1_key_list.pop(0)
#           else:
#               res[first_symbol] = dict2[first_descriptor]
#               del dict2[first_symbol]
#               dict2_key_list.pop(0)
#     print(res)

In [None]:
df = pd.read_json("Practice/physics.atom_ph-ann10.json")
paragraph = df.loc["text"][10]
descriptors = get_descriptions_paragraph(paragraph)
dict1 = descriptor_to_output(descriptors)
dict2 = getSymbols(paragraph)
# print(dict1)
# print(dict2)
merged = merge(dict1, dict2)
print(dict1)
# print('*'*280)
# print(dict2)
# print('*'*280)
print(merged)

{'T1': {'eid': 'T6', 'label': 'PRIMARY', 'start': 6, 'end': 14, 'text': 'evaluate'}}
{'T1': {'eid': 'T1', 'label': 'SYMBOL', 'start': 2, 'end': 6, 'text': '\\tau'}, 'T2': {'eid': 'T2', 'label': 'SYMBOL', 'start': 2, 'end': 4, 'text': 'RT'}, 'T3': {'eid': 'T3', 'label': 'SYMBOL', 'start': 4, 'end': 8, 'text': 'n|RT'}, 'T4': {'eid': 'T4', 'label': 'SYMBOL', 'start': 4, 'end': 7, 'text': ' 0 '}, 'T5': {'eid': 'T5', 'label': 'SYMBOL', 'start': 5, 'end': 9, 'text': 'p(n)'}, 'T6': {'eid': 'T6', 'label': 'PRIMARY', 'start': 6, 'end': 14, 'text': 'evaluate'}, 'T7': {'eid': 'T7', 'label': 'SYMBOL', 'start': 7, 'end': 10, 'text': '-RT'}, 'T8': {'eid': 'T8', 'label': 'SYMBOL', 'start': 9, 'end': 13, 'text': 'p(R)'}, 'T9': {'eid': 'T9', 'label': 'SYMBOL', 'start': 10, 'end': 30, 'text': '\\sum_R p(R)\\,p(n|RT)'}, 'T10': {'eid': 'T10', 'label': 'SYMBOL', 'start': 11, 'end': 18, 'text': 'p(n|RT)'}, 'T11': {'eid': 'T11', 'label': 'SYMBOL', 'start': 16, 'end': 23, 'text': 'g^{(2)}'}, 'T12': {'eid': 'T

In [None]:
# We have dicts of symbols and descriptors - let's connect them via relations

# direct, co-referred to, count



In [None]:
# df = pd.read_json("Practice/econ.th-ann6.json")
# paragraph = df.loc["text"][6]
# descriptors = get_descriptions_paragraph(paragraph)
# dict1 = descriptor_to_output(descriptors)
# dict2 = getSymbols(paragraph)
# merged = merge(dict1, dict2)
# print(dict1)
# print(merged)

In [None]:
import backbone.py