# Estimating surprisal from language models
Take sentences from CommitmentBank, MegaAttitudes, and stimuli from experiment, mask the attitude predicate, and get predicted probability of occurrence for the target verb. Then, calculate from that the surprisal of the verb.

In [1]:
from transformers import pipeline
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

In [2]:
# This makes the display show more info
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

## Read in the three datasets

In [3]:
# CommitmentBank
cb = pd.read_csv("../data/CommitmentBank-ALL.csv")[["uID","Verb","Target"]].drop_duplicates()
cb = cb.rename(columns={"Target": "Sentence","uID":"ID"})

In [4]:
# MegaVeridicality
mv = pd.read_csv("../data/mega-veridicality-v2.csv")[["verb","frame","voice","sentence"]].drop_duplicates()
mv = mv.rename(columns={"verb": "Verb", "sentence":"Sentence"})
mv["ID"] = mv[['frame', 'voice']].apply(lambda x: '_'.join(x), axis=1)
mv = mv.drop(columns=["frame","voice"])

In [5]:
# Arousal/Valence Study
vs = pd.read_csv("../data/1_sliderprojection/exp1_test-trials.csv")[["Word","utterance","exp"]]
vs = vs[vs["exp"]=="stim"].drop_duplicates().drop(columns={"exp"})
vs = vs.rename(columns={"Word": "Verb","utterance":"Sentence"})
vs["ID"] = ""

In [6]:
# Combine them together into one df
df = pd.concat([cb,mv,vs])

## to deal with the inflected verb token 
1. by creating a new verb token column
2. regex with literal string interpolation to match work in which the Verb occurrs

In [7]:
# df["VerbToken"] = df['Sentence'].str.extract(fr'({df["Verb"]}\w*)')

# Find a match in the Sentence column for the verb from the Verb column using a regex
# re.search() returns a match object, so you have to call .group() to get the string
# that is matched. In cases where there is no match, a NoneType object is returned and 
# you can't call .group() on that. 
df["VerbToken"] = df.apply(lambda x: re.search(fr'({x["Verb"]}\w*)',x['Sentence']), axis=1)

# 
df["VerbToken"] = df["VerbToken"].apply(lambda x: x.group() if x is not None else x)



In [8]:
empty = df[df["VerbToken"].isnull()]

In [10]:
len(empty)/len(df)*100

9.060509554140127

## GET THOSE EMPTIES
lemmatize the whole sentence and look for the matches to lemma

In [9]:
# from: https://gaurav5430.medium.com/using-nltk-for-lemmatizing-sentences-c1bfff963258
# initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [10]:
def lemmatize_verb_from_sentence(sentence,verb):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lw = []
#     for i in range(0,len(empty)-1):
#     for v in empty["Verb"].values:
#         verb_from_empty = empty["Verb"].values[i]
    for word, tag in wordnet_tagged:
        if tag is None:
            continue
        elif tag != 'v':
            continue
        else:
            lemma = lemmatizer.lemmatize(word, tag)
            if lemma != verb:
                # Go to the next word/tag pair to find the relevant verb
                break
            elif lemma == verb:
                print("{verb}: {word} {lemma}".format(verb=verb,word=word,lemma=lemma))
                lw.append(word)
                lw.append(lemma)
#     print(lw)
    return ' '.join(lw)

This seems to be working to break it up more

In [11]:
def get_verb(sentence):
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    verbs = []
    for i in nltk_tagged:
        if 'VB' in i[1]:
            verbs.append(i)
    return verbs

In [384]:
li = get_verb("She could also have told this was Tina's mother before Mrs Darne went off down the passage that led to the Headmaster's Flat.")

In [12]:
empty["VerbList"] = empty["Sentence"].apply(lambda x: get_verb(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty["VerbList"] = empty["Sentence"].apply(lambda x: get_verb(x))


In [13]:
empty["VerbTagged"] = empty["Verb"].apply(lambda x: nltk.pos_tag([x]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty["VerbTagged"] = empty["Verb"].apply(lambda x: nltk.pos_tag([x]))


In [14]:
def lemmatize_from_nltk_tagged_list(nltk_tagged):
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return lemmatized_sentence

In [15]:
empty["VerbListLemmatized"] = empty["VerbList"].apply(lambda x: lemmatize_from_nltk_tagged_list(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty["VerbListLemmatized"] = empty["VerbList"].apply(lambda x: lemmatize_from_nltk_tagged_list(x))


In [16]:
empty.head()

Unnamed: 0,ID,Verb,Sentence,VerbToken,VerbList,VerbTagged,VerbListLemmatized
9,BNC-1002,say,Indeed it could be said that they had prospered.,,"[(be, VB), (said, VBD), (had, VBD), (prospered, VBN)]","[(say, VB)]","[be, say, have, prosper]"
17,BNC-1003,say,He might have said to her that some time in the middle of the nineteenth century a cult had grown up around the idea of the home.,,"[(have, VB), (said, VBD), (had, VBD), (grown, VBN)]","[(say, VB)]","[have, say, have, grow]"
575,BNC-1145,tell,She could also have told this was Tina's mother before Mrs Darne went off down the passage that led to the Headmaster's Flat.,,"[(have, VB), (told, VBN), (was, VBD), (went, VBD), (led, VBD)]","[(tell, NN)]","[have, tell, be, go, lead]"
716,BNC-1187,think,They may have thought they were putting it out of its misery - a lifetime beautifying the lorry-route to the A1.,,"[(have, VB), (thought, VBN), (were, VBD), (putting, VBG), (beautifying, VBG)]","[(think, NN)]","[have, think, be, put, beautify]"
733,BNC-1194,think,Perhaps he thought that her own wishes would hardly be considered in the matter.,,"[(thought, VBD), (be, VB), (considered, VBN)]","[(think, NN)]","[think, be, consider]"


In [17]:
empty["VerbListLemmatizedTagged"] = empty["VerbListLemmatized"].apply(lambda x: nltk.pos_tag(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty["VerbListLemmatizedTagged"] = empty["VerbListLemmatized"].apply(lambda x: nltk.pos_tag(x))


In [408]:
empty.head()

Unnamed: 0,ID,Verb,Sentence,VerbToken,VerbList,VerbTagged,VerbListLemmatized,VerbListLemmatizedTagged
9,BNC-1002,say,Indeed it could be said that they had prospered.,,"[(be, VB), (said, VBD), (had, VBD), (prospered, VBN)]","[(say, VB)]","[be, say, have, prosper]","[(be, VB), (say, VBN), (have, VBP), (prosper, NN)]"
17,BNC-1003,say,He might have said to her that some time in the middle of the nineteenth century a cult had grown up around the idea of the home.,,"[(have, VB), (said, VBD), (had, VBD), (grown, VBN)]","[(say, VB)]","[have, say, have, grow]","[(have, VBP), (say, VBN), (have, VBP), (grow, NNS)]"
575,BNC-1145,tell,She could also have told this was Tina's mother before Mrs Darne went off down the passage that led to the Headmaster's Flat.,,"[(have, VB), (told, VBN), (was, VBD), (went, VBD), (led, VBD)]","[(tell, NN)]","[have, tell, be, go, lead]","[(have, VB), (tell, NN), (be, VB), (go, VBN), (lead, JJ)]"
716,BNC-1187,think,They may have thought they were putting it out of its misery - a lifetime beautifying the lorry-route to the A1.,,"[(have, VB), (thought, VBN), (were, VBD), (putting, VBG), (beautifying, VBG)]","[(think, NN)]","[have, think, be, put, beautify]","[(have, VB), (think, NN), (be, VB), (put, VBN), (beautify, VB)]"
733,BNC-1194,think,Perhaps he thought that her own wishes would hardly be considered in the matter.,,"[(thought, VBD), (be, VB), (considered, VBN)]","[(think, NN)]","[think, be, consider]","[(think, NN), (be, VB), (consider, JJR)]"


In [28]:
l = [[x,y] for x,y in zip(list(empty["VerbList"]),list(empty["VerbListLemmatizedTagged"]))]

In [31]:
l[0][1]

[('be', 'VB'), ('say', 'VBN'), ('have', 'VBP'), ('prosper', 'NN')]

In [32]:
l[0][0]

[('be', 'VB'), ('said', 'VBD'), ('had', 'VBD'), ('prospered', 'VBN')]

In [424]:
empty["Grouped"].values[2][1][i][1]
empty["VerbListLemmatizedTagged"].values[2][1][i][1]

'NN'

In [None]:
empty["Grouped2"] = empty.Grouped[]

In [426]:
len(empty)

569

In [None]:

for x, y in zip(xs, ys):
    print x, y


In [None]:

def search_two_cols(col1,col2)
    for x,y in zip(col1,col2):
        l = []

        if col1 is in col2.tolist():
            l.append(col1)
            

In [397]:
empty.head()

Unnamed: 0,ID,Verb,Sentence,VerbToken,VerbList,VerbTagged
9,BNC-1002,say,Indeed it could be said that they had prospered.,,"[(be, VB), (said, VBD), (had, VBD), (prospered, VBN)]","[(say, VB)]"
17,BNC-1003,say,He might have said to her that some time in the middle of the nineteenth century a cult had grown up around the idea of the home.,,"[(have, VB), (said, VBD), (had, VBD), (grown, VBN)]","[(say, VB)]"
575,BNC-1145,tell,She could also have told this was Tina's mother before Mrs Darne went off down the passage that led to the Headmaster's Flat.,,"[(have, VB), (told, VBN), (was, VBD), (went, VBD), (led, VBD)]","[(tell, NN)]"
716,BNC-1187,think,They may have thought they were putting it out of its misery - a lifetime beautifying the lorry-route to the A1.,,"[(have, VB), (thought, VBN), (were, VBD), (putting, VBG), (beautifying, VBG)]","[(think, NN)]"
733,BNC-1194,think,Perhaps he thought that her own wishes would hardly be considered in the matter.,,"[(thought, VBD), (be, VB), (considered, VBN)]","[(think, NN)]"


In [None]:
def lemmatize_verb_from_list(sentence,verb):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lw = []
#     for i in range(0,len(empty)-1):
#     for v in empty["Verb"].values:
#         verb_from_empty = empty["Verb"].values[i]
    for word, tag in wordnet_tagged:
        if tag is None:
            continue
        elif tag != 'v':
            continue
        else:
            lemma = lemmatizer.lemmatize(word, tag)
            if lemma != verb:
                # Go to the next word/tag pair to find the relevant verb
                break
            elif lemma == verb:
                print("{verb}: {word} {lemma}".format(verb=verb,word=word,lemma=lemma))
                lw.append(word)
                lw.append(lemma)
#     print(lw)
    return ' '.join(lw)

In [346]:
lemma = empty["Sentence"].apply(lambda x: lemmatize_verb_from_sentence(x))

say: said say
say: said say
say: said say
think: thought think
think: thought think
think: thought think
think: thought think
think: thought think
say: say say
say: said say
tell: told tell
think: thought think
think: thought think
say: said say
say: said say
say: said say
say: said say
think: think think
say: said say
say: said say
say: said say
tell: tell tell
say: said say
say: said say
say: said say
say: said say
say: said say
tell: told tell
think: thought think
think: thought think
say: said say
say: said say
say: said say
say: said say
say: said say
say: said say
say: said say
say: said say
say: said say
say: said say
say: said say


In [None]:
df['value'] = [x if x > 0 else y if y>0 for x,y in zip(df['a'],df['b'])]

# Estimating surprisal

https://twitter.com/bruno_nicenboim/status/1379168059311656963

In [357]:
df["Masked"] = df.apply(lambda x: x['Sentence'].replace(x["VerbToken"],"[MASK]"),axis=1)

In [101]:
df["Verb"][9].to_string()

'9       say\n9       sob\n9    murmur'

### Go through the CommitmentBank and change the masks by hand

In [47]:
empty.groupby(["Verb"])["Verb"].count()

Verb
feel        5
find        2
foresee     1
forget      9
hope        1
know        9
realize     3
say        18
see         1
tell        9
think      29
Name: Verb, dtype: int64

In [None]:
empty.

# Masked Language Modeling
using BERT large uncased on a masked task

In [5]:
unmasker = pipeline('fill-mask', model='bert-large-uncased')

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
unmasker("Dana was [MASK] that Mars has no water.",targets="surprised")[0]['score']

0.0471670962870121

In [36]:
def mlm_over_df(input_df):
    for row in input_df.itterows():
        sentence = f"{s}".format(s=input_df["sentence"])
        verb = f"{v}".format(v=input_df["verb"])
        mask_fill = unmasker(sentence, targets=verb)
        input_df["mlm_score"] = mask_fill[0]['score']
    return input_df