In [None]:
# Mount your Google Drive inside the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Name of the folder in your Drive where you saved the unzipped
# workshop folder, e.g. 'acmlab/workshops/project'.
FOLDERNAME = 'primary_description'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that the Drive is mounted, add the project folder to sys.path so
# the Python interpreter of the Colab VM can load python files from
# within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))

%cd /content/drive/My\ Drive/$FOLDERNAME/

Mounted at /content/drive
/content/drive/My Drive/primary_description


In [None]:
import numpy as np
import os
from sklearn.metrics import f1_score
import pandas as pd
from textblob import TextBlob
import nltk
import re

# Fetch the NLTK resources this notebook relies on (presumably: punkt for
# word_tokenize, the perceptron tagger for pos_tag, WordNet for the
# lemmatizer / wordnet POS constants).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet

from IPython.display import display

# import stanza
# stanza.install_corenlp()
# import spacy
# nltk.download('brown')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Load one annotated practice file. The path is relative, so it relies on the
# `%cd` into the project folder done in the first cell.
# Later cells read it through data.loc["text"] and data.loc["entity"].
data = pd.read_json("Practice/cs.ai-ann0.json")
# text_data = data.loc["text"]
# text_data.to_dict()

In [None]:
def extract_primary_description(entity):
  """Return the annotations of `entity` whose label is "PRIMARY".

  Args:
    entity: dict mapping an annotation id to an annotation dict; every
      annotation dict must have a 'label' key.

  Returns:
    A new dict holding only the PRIMARY annotations, under their original
    ids and in their original order. The annotation dicts are not copied.
  """
  return {
      annotation_id: annotation
      for annotation_id, annotation in entity.items()
      if annotation['label'] == "PRIMARY"
  }


In [None]:
lemmatizer = nltk.WordNetLemmatizer()
def setup(text):
  """Tokenize, POS-tag and chunk `text`; return the resulting chunk tree.

  The whole text is tokenized and tagged as a single sequence, then parsed
  with a two-stage regexp grammar (NBAR, then NP built from NBAR).
  """
  # Word tokenizing and part-of-speech tagging of the full text in one pass.
  tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

  # Rules for the NBAR chunks and for the NP chunks built on top of them.
  grammar = r"""
      NBAR:
          {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
          {<RB.?>*<VB.?>*<JJ>*<VB.?>+<VB>?} # Verbs and Verb Phrases
          
      NP:
          {<NBAR>}
          {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
          
  """
  # Chunking: the parser returns a tree over the tagged tokens.
  chunker = nltk.RegexpParser(grammar)
  return chunker.parse(tagged_tokens)

def leaves(tree):
    """Yield the leaf list of every subtree of `tree` that is labelled 'NP'."""
    def is_noun_phrase(node):
        return node.label() == 'NP'

    for np_chunk in tree.subtrees(filter=is_noun_phrase):
        yield np_chunk.leaves()
        
def get_word_postag(word):
    """Return the WordNet POS constant matching the tag `pos_tag` gives `word`.

    Tags starting with 'J' map to wordnet.ADJ and tags starting with 'V' to
    wordnet.VERB. Every other tag, nouns included, maps to wordnet.NOUN.
    """
    # Tag once and reuse the result; the word was previously re-tagged for
    # every branch that was tested.
    tag = pos_tag([word])[0][1]
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    # Nouns and all remaining tags share the same fallback.
    return wordnet.NOUN
    
def normalise(word):
    """Return `word` unchanged.

    The lowercasing and lemmatizing steps are currently commented out below,
    so this function is an identity pass-through.
    """
    # word = word.lower()
    # postag = get_word_postag(word)
    # word = lemmatizer.lemmatize(word, postag)
    return word

def get_terms(tree):
    """Yield one list of normalised words per NP chunk found in `tree`."""
    for tagged_words in leaves(tree):
        words = []
        for token, _tag in tagged_words:
            words.append(normalise(token))
        yield words

def getNounPhrases(text):
  """Collect candidate description phrases from `text`.

  Two sources are combined, in this order:
    1. NP chunks from the chunk tree of `text`, joined with single spaces.
       A chunk is dropped if one of its words is a form of "to be" or if it
       contains a backslash.
    2. Regex matches taken directly from `text`, one pattern after another
       in the order listed below.

  Returns:
    A list of stripped phrase strings (duplicates are kept).
  """
  wordsToRemove = ['be', 'is', 'are', 'was', 'were', 'been', 'being']

  # (pattern, prefix) pairs. The prefix is the article consumed by the
  # lookbehind, which is put back in front of each match.
  patterns = [
      (r"(?<=\$\sbe\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", ''),
      (r"(?<=\$\srepresent\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", ''),
      (r"(?<=\$\srepresents\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", ''),
      (r"(?<=\$\sdenote\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", ''),
      (r"(?<=\$\sdenotes\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", ''),
      (r"(?<=\$\sis\s)(.*?)(?=\.|\,|\;|\:|\!|\?|\\|and)", ''),
      (r"(?<=\sof\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\)", ''),
      (r"(?<=\sthe\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\)", 'the '),
      (r"(?<=\sThe\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\)", 'The '),
      (r"(?<=\sa\s)(.*?)(?=\$|\.|\,|\;|\:|\!|\?|\\)", 'a '),
  ]

  features = []

  # Source 1: NP chunks from the chunker.
  for term in get_terms(setup(text)):
    joined = ''.join(' ' + word for word in term)
    has_be_form = any(x in joined.split() for x in wordsToRemove)
    if not has_be_form and '\\' not in joined:
      features.append(joined.strip())

  # Source 2: regex matches on the raw text.
  for pattern, prefix in patterns:
    for match in re.findall(pattern, text):
      features.append((prefix + match).strip())

  return features

In [None]:
def calculate_recall(noun_list, expected):
  """Return the fraction of expected descriptions found in `noun_list`.

  Matching is by exact string equality. Each entry of `noun_list` can satisfy
  at most one expected description, so repeated descriptions need repeated
  predictions.

  Args:
    noun_list: list of predicted phrase strings. It is not modified.
    expected: dict mapping an annotation id to a dict with a "text" key.

  Returns:
    matched / total as a float, or 0.0 when `expected` is empty (recall is
    undefined there; this avoids a ZeroDivisionError).
  """
  primary_descriptor_list = [annotation["text"] for annotation in expected.values()]
  total = len(primary_descriptor_list)
  if total == 0:
    return 0.0

  # Work on a copy so consuming matched phrases does not alter the caller's list.
  remaining = list(noun_list)
  count = 0
  for descriptor in primary_descriptor_list:
    if descriptor in remaining:
      count += 1
      remaining.remove(descriptor)
  return count / total
  

  

In [None]:
# Sanity check of the "$...$ be <description>" pattern on a toy sentence.
# The literal is a raw string: in a normal string "\a" and "\b" (as in
# \alpha and \beta) are escape sequences for the BEL and backspace
# characters, which would silently corrupt the LaTeX in the sample text.
text = r"Let $x$ be the number of bins. Let $\alpha + \beta$ be the sum of two parameters, which are then used for analysis."
res = re.findall(r"(?<=\$\sbe\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", text)
print(res)

['the number of bins', 'the sum of two parameters']


In [None]:
# Sanity check of the "$...$ represent <description>" pattern on a toy
# sentence. Only the singular "represent" pattern is exercised here, so the
# "represents" clause is not expected in the result.
# The literal is a raw string: in a normal string "\a" and "\b" (as in
# \alpha and \beta) are escape sequences for the BEL and backspace
# characters, which would silently corrupt the LaTeX in the sample text.
text = r"Let $x$ represent the number of bins, where $\alpha + \beta$ represents the sum of two parameters, which are then used for analysis."
res = re.findall(r"(?<=\$\srepresent\s)(.*?)(?=\.|\,|\;|\:|\!|\?)", text)
print(res)

['the number of bins']


In [None]:
# Spot-check a single paragraph (entry 1 of the loaded file): print the text,
# the phrases extracted from it and the annotated PRIMARY descriptions.
text = data.loc["text"][1]
print(text)
expected = extract_primary_description(data.loc["entity"][1])
print("Noun phrases", getNounPhrases(text))
print("Expected", expected)
# Last expression of the cell, so the recall value is shown as the cell output.
calculate_recall(getNounPhrases(text), expected)

We fit the parameters according to the procedure described in \cite{EkanadhamKarklin15} .
Estimating the entire trajectory $\thetastraj$ for each student simultaneously with item parameters is very expensive and difficult to do in real - time .
To simplify the approach , we learn parameters in two stages : \begin{enumerate}  \item We learn the            according to a standard 1PO IRT model (see Section~\ref{sec:irtlearning} ) on the training student population and freeze these during validation .
           .           
For the second step , we combine the approximation :            & P ( \{(s', i, r, t') \in D: s'=s, t'\leq t\}|\theta_{s,t} ) \approx \nonumber \\ &\prod_{(s',i,r,t') \in D: s'=s, t'\leq t} P ( ( s' , i , r , t ' ) | \theta_{s,t} ) \end{align} with \eqref{eq:wiener} , integrating out previous proficiencies of the student to get a tractable approximation of the log posterior over the student 's current proficiency given previous responses :
\begin{align}  \log P(\theta

0.0

In [None]:
# print(getNounPhrases("Mostly, there just is no default way of determining the paragraph boundary and people tend to work with sentences. Still, the unit of a paragraph might be of a higher value than that of a sentence. Examples might be: coreference resolutions that overlap multiple sentences. Questions that find their answer throughout a whole paragraph. A reader that understands a paragraph better than an isolated sentence. It’s clear that the signal from a writer is best expressed in a paragraph."))

In [None]:
# df = data
# for column in df:
#     wantedOutput = extract_primary_description(df[column]["entity"])
#     paragraph = df[column]["text"]
#     print("PARAGRAPH:\n", paragraph)
#     print("NOUN PHRASES:\n", getNounPhrases(paragraph))
#     print("EXPECTED:\n", wantedOutput)
#     print("*" * 280)
#     print("")

In [None]:
def findNounsWithLocs(text, max_dollar_distance=25):
  '''Return a per-character 0/1 array marking extracted phrases that lie near a '$'.

  Each phrase from getNounPhrases(text) is located in `text`. Its characters
  are set to 1 when the first '$' found from the search position is fewer
  than `max_dollar_distance` characters away from the start of the phrase.

  Args:
    text: the paragraph to analyse.
    max_dollar_distance: exclusive upper bound on the distance, in
      characters, between the phrase start and that '$'.

  Returns:
    A numpy float array of length len(text) holding 1s and 0s.
  '''
  predicted_array = np.zeros(len(text))
  start = 0

  for phrase in getNounPhrases(text):
    if not phrase:
      # Empty regex matches carry no characters to mark.
      continue

    search_from = start
    phrase_start = text.find(phrase, search_from)
    if phrase_start == -1:
      # The phrase list is not in document order (regex matches are appended
      # after the chunker phrases), so look again from the beginning.
      search_from = 0
      phrase_start = text.find(phrase)
      if phrase_start == -1:
        # Not present verbatim (e.g. tokenization changed the spacing):
        # skip it and keep the cursor where it is. str.find's -1 must never
        # be used as an index.
        continue
    phrase_end = phrase_start + len(phrase)

    dollar_loc = text.find('$', search_from)
    # Without any '$' there is nothing to be "near"; the -1 sentinel must
    # not be treated as a position.
    if dollar_loc != -1 and abs(dollar_loc - phrase_start) < max_dollar_distance:
      predicted_array[phrase_start:phrase_end] = 1

    start = phrase_end

  return predicted_array

In [None]:
# Quick check on a short sentence that contains no '$' character.
print(findNounsWithLocs("The bus is yellow."))

[0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
# for column in data:
#   paragraph = df[column]["text"]
#   print("PARAGRAPH:\n", paragraph)
#   print("NOUN PHRASES:\n", getNounPhrases(paragraph))
#   print("NOUN LOCATIONS:\n", findNounsWithLocs(paragraph))
#   print('*' * 280 + '\n')

In [None]:
def create_arrays(expected, predicted, len_text):
  """Build per-character 0/1 arrays of length `len_text`.

  Args:
    expected: dict mapping an annotation id to a dict with integer "start"
      and "end" keys.
    predicted: currently unused; the predicted array is returned all zeros.
    len_text: length of the arrays to create.

  Returns:
    (expected_array, predicted_array): expected_array is 1 on every annotated
    span and 0 elsewhere; predicted_array is all zeros.
  """
  expected_array, predicted_array = np.zeros(len_text), np.zeros(len_text)

  for annotation in expected.values():
    first = annotation["start"]
    last = annotation["end"]
    # NOTE(review): "end" is treated as inclusive here (hence the +1);
    # confirm against the annotation format.
    expected_array[first : last + 1] = 1

  # Filling predicted_array from `predicted` spans is not implemented; callers
  # overwrite it with their own prediction.
  return (expected_array, predicted_array)

In [None]:
def compare(expected, predicted):
  """Compute the binary F1 score between `expected` and `predicted`.

  Args:
    expected: array-like of 0/1 (or boolean) ground-truth labels.
    predicted: array-like of 0/1 (or boolean) predicted labels, same shape.

  Returns:
    F1 = 2*TP / (2*TP + FP + FN) as a float. When neither input contains a
    positive label the score is 1.0, which is intended to mirror the
    zero_division=1 setting used with f1_score elsewhere in this notebook.

  Raises:
    ValueError: if the two inputs do not have the same shape.
  """
  expected_array = np.asarray(expected).astype(bool)
  predicted_array = np.asarray(predicted).astype(bool)
  if expected_array.shape != predicted_array.shape:
    raise ValueError(
        "expected and predicted must have the same shape, got {} and {}".format(
            expected_array.shape, predicted_array.shape))

  true_pos = int(np.count_nonzero(expected_array & predicted_array))
  false_pos = int(np.count_nonzero(~expected_array & predicted_array))
  false_neg = int(np.count_nonzero(expected_array & ~predicted_array))

  denominator = 2 * true_pos + false_pos + false_neg
  if denominator == 0:
    # No positives anywhere: both arrays agree completely.
    return 1.0
  return 2 * true_pos / denominator

In [None]:
def framework(df):
  """Print the average per-paragraph F1 score and recall for `df`.

  Every column of `df` is one paragraph, with a "text" entry and an "entity"
  entry. F1 is computed per character between the annotated PRIMARY spans
  and the output of findNounsWithLocs. Recall is computed only for
  paragraphs that have at least one PRIMARY description.

  An average over zero paragraphs is printed as nan rather than raising
  ZeroDivisionError.
  """
  results = []
  recalls = []
  for column in df:
    paragraph = df[column]["text"]
    primary_description = extract_primary_description(df[column]["entity"])

    if len(primary_description) != 0:
      recalls.append(calculate_recall(getNounPhrases(paragraph), primary_description))

    # Only the expected array is needed from create_arrays; the prediction
    # comes from findNounsWithLocs.
    expected_array, _ = create_arrays(primary_description, paragraph, len(paragraph))
    predicted_array = findNounsWithLocs(paragraph)
    results.append(
        f1_score(expected_array, predicted_array, average="binary", zero_division=1))

  average_f1 = sum(results) / len(results) if results else float("nan")
  average_recall = sum(recalls) / len(recalls) if recalls else float("nan")
  print("Average F1 score is:", average_f1)
  print("Average recall: ", average_recall)


In [None]:
def framework_recall(df):
  """Print the average recall over the paragraphs of `df`.

  Every column of `df` is one paragraph, with a "text" entry and an "entity"
  entry. Paragraphs without any PRIMARY description are skipped. If none
  remain, the average is printed as nan rather than raising
  ZeroDivisionError.
  """
  recalls = []
  for column in df:
    paragraph = df[column]["text"]
    primary_description = extract_primary_description(df[column]["entity"])
    if len(primary_description) != 0:
      recalls.append(calculate_recall(getNounPhrases(paragraph), primary_description))

  average_recall = sum(recalls) / len(recalls) if recalls else float("nan")
  print("Average recall: ", average_recall)


In [None]:
#"The current F1 score of 0.355 arises when we leave the predicted_array all full of zeroes and compare with the expected."
# 0.261 is our current best implementing NLP techniques w/ <25
# framework(data)

In [None]:
# Report the average recall for every annotated file in the Practice folder.
# The folder is derived from FOLDERNAME (set in the first cell) instead of
# repeating the absolute Drive path.
pwd = os.path.join('/content/drive/My Drive', FOLDERNAME, 'Practice')
dataFiles = [
    os.path.join(pwd, filename)
    for filename in sorted(os.listdir(pwd))  # listdir order is arbitrary; sort for reproducible output
    if filename.endswith(".json")
]
for filename in dataFiles:
  print(os.path.basename(filename))
  framework_recall(pd.read_json(filename))
  print()

physics.atom_ph-ann10.json
Average recall:  0.5660383488144489

q_bio.qm-ann11.json
Average recall:  0.5545261387190994

cs.ai-ann0.json
Average recall:  0.5938299724651765

cs.ai-ann3.json
Average recall:  0.6252281557784404

econ.th-ann6.json
Average recall:  0.4732876712328767

physics.atom_ph-ann8.json
Average recall:  0.5832093253968254

econ.th-ann5.json
Average recall:  0.5371130952380953

physics.atom_ph-ann9.json
Average recall:  0.5757469862422696

math.co-ann7.json
Average recall:  0.5293902268014692

cs.ai-ann2.json
Average recall:  0.5056761221234907

econ.th-ann4.json
Average recall:  0.5415768107231523



In [None]:
# Report the average F1 score and recall for every annotated file in the
# Practice folder. The folder is derived from FOLDERNAME (set in the first
# cell) instead of repeating the absolute Drive path.
pwd = os.path.join('/content/drive/My Drive', FOLDERNAME, 'Practice')
dataFiles = [
    os.path.join(pwd, filename)
    for filename in sorted(os.listdir(pwd))  # listdir order is arbitrary; sort for reproducible output
    if filename.endswith(".json")
]
for filename in dataFiles:
  print(os.path.basename(filename))
  framework(pd.read_json(filename))
  print()

physics.atom_ph-ann10.json
Average F1 score is: 0.24100718166495289
Average recall:  0.5606080209455965

q_bio.qm-ann11.json
Average F1 score is: 0.24836546363431322
Average recall:  0.5369418510261161

cs.ai-ann0.json
Average F1 score is: 0.2619597439213551
Average recall:  0.33751223974438266

cs.ai-ann3.json
Average F1 score is: 0.292149992910222
Average recall:  0.58191018342821

econ.th-ann6.json
Average F1 score is: 0.18309014547466393
Average recall:  0.4664383561643835

physics.atom_ph-ann8.json
Average F1 score is: 0.10158271561332784
Average recall:  0.5832093253968254

econ.th-ann5.json
Average F1 score is: 0.10843422731427918
Average recall:  0.3567708333333333

physics.atom_ph-ann9.json
Average F1 score is: 0.2622680957747418
Average recall:  0.5737728422869935

math.co-ann7.json
Average F1 score is: 0.27245963591418115
Average recall:  0.2672441053358214

cs.ai-ann2.json
Average F1 score is: 0.2836446512781167
Average recall:  0.4970934675704413

econ.th-ann4.json
Average