In [1]:
from datasets import load_from_disk
from transformers import AutoTokenizer
import torch 
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Original 
train_data = load_from_disk("hf_dataset/sentence_train")
eval_data = load_from_disk("hf_dataset/sentence_eval")
top50_idx = torch.load("results_sentences/sentential_negation_npi_licensor_present/top_indices.pt")
bottom50_idx = torch.load("results_sentences/sentential_negation_npi_licensor_present/bottom_indices.pt")

# Augmented
# train_data_aug = load_from_disk("hf_dataset/sentence_train_augmented")
# eval_data_aug = load_from_disk("hf_dataset/sentence_eval_augmented")
# top50_idx_aug = torch.load("results_sentences/sentential_negation_npi_licensor_present_augmented/top_indices.pt")
# bottom50_idx_aug = torch.load("results_sentences/sentential_negation_npi_licensor_present_augmented/bottom_indices.pt")

In [1]:
import pandas as pd
import numpy as np

# decode texts and put into df
def decode_df(indices, train_data):
    idx_arr = indices.numpy()
    flat_idx_arr = idx_arr.flatten()

    decoded_texts = [
        tokenizer.decode(train_data[int(idx)]["input_ids"], skip_special_tokens=True)
        for idx in flat_idx_arr
    ]

    decoded_array = np.array(decoded_texts).reshape(idx_arr.shape)

    decoded_df = pd.DataFrame(decoded_array)
    
    return decoded_df

top_df = decode_df(top50_idx, train_data)
bottom_df = decode_df(bottom50_idx, train_data)
# top_df_aug = decode_df(top50_idx_aug, train_data_aug)
# bottom_df_aug = decode_df(bottom50_idx_aug, train_data_aug)


In [None]:
idx = 249
blimp1_good = tokenizer.decode(eval_data["input_ids_good"][idx], skip_special_tokens=True)
blimp1_bad = tokenizer.decode(eval_data["input_ids_bad"][idx], skip_special_tokens=True)
blimp1_good, blimp1_bad

('The school had not ever intended to conspire.',
 'The school had probably ever intended to conspire.')

In [None]:
import re
# Define the phrase to search for, using regex
# phrase = re.compile(r"\bever\b")
# phrase = re.compile(r"\bnot ever\b")

# phrase = re.compile(r"(?:not|n't)\b.*\bever\b", flags=re.IGNORECASE)
# not or n't followed by ever, but does not include any occurrences of "happily ever after" or superlatives before the word 'ever'
phrase = re.compile(r"\b(?:not|n't)\b(?![^.]{0,50}happily ever after)(?![^.]{0,50}(?:est|most|worst|favorite|least)).*?\bever\b", flags=re.IGNORECASE)

# phrase = re.compile(r"\b(?:no one|nobody|nothing)\b(?![^.]{0,50}happily ever after).*?\bever\b", flags=re.IGNORECASE)
# phrase = re.compile(r"(than).*\bever\b", flags=re.IGNORECASE)
# phrase = re.compile(r"\bif\b.*\bever\b", flags=re.IGNORECASE)
# phrase = re.compile(r"\bever\b.*\?")
# phrase = re.compile(r"(?:est|most|worst|favorite|least).*\bever\b", flags=re.IGNORECASE)
# phrase = re.compile(r"(?:est|most|worst|favorite|least).{1,8}?\bever\b", flags=re.IGNORECASE)
# phrase = re.compile(r"\bhappily ever after\b", flags=re.IGNORECASE)

# phrase = re.compile(r"\b(?:any|yet)\b", flags=re.IGNORECASE)
# phrase = re.compile(r".*\b(?:not|never|n't)\b.*\b(?:any|yet)\b.*", flags=re.IGNORECASE)

# phrase = re.compile(r"\bonly\b", flags=re.IGNORECASE)
# phrase = re.compile(r"\bonly\b.*\b(?:any|yet|ever)\b.*", flags=re.IGNORECASE)
# phrase = re.compile(r"\bonly\b.*\byet\b.*", flags=re.IGNORECASE)

phrase_count_df = pd.DataFrame({
    "Top 50": top_df.apply(lambda row: sum(bool(phrase.search(text)) for text in row), axis=1),
    "Bottom 50": bottom_df.apply(lambda row: sum(bool(phrase.search(text)) for text in row), axis=1),
    "Top 50 Augmented": top_df_aug.apply(lambda row: sum(bool(phrase.search(text)) for text in row), axis=1),
    "Bottom 50 Augmented": bottom_df_aug.apply(lambda row: sum(bool(phrase.search(text)) for text in row), axis=1)
})
# sentences with the phrase
sentences_df = pd.DataFrame({
    "Top 50": top_df.apply(lambda row: [text for text in row if phrase.search(text)], axis=1),
    "Bottom 50": bottom_df.apply(lambda row: [text for text in row if phrase.search(text)], axis=1),
    "Top 50 Augmented": top_df_aug.apply(lambda row: [text for text in row if phrase.search(text)], axis=1),
    "Bottom 50 Augmented": bottom_df_aug.apply(lambda row: [text for text in row if phrase.search(text)], axis=1)
})
phrase_count_df

Unnamed: 0,Top 50,Bottom 50,Top 50 Augmented,Bottom 50 Augmented
0,0,2,0,2
1,0,0,0,0
2,0,0,0,0
3,0,0,1,0
4,0,1,0,1
...,...,...,...,...
245,0,0,0,0
246,0,0,0,0
247,0,0,0,0
248,0,3,2,4


In [None]:
print("Top 50 Phrase Avg:", phrase_count_df["Top 50"].mean())
print("Top 50 Phrase Total:", phrase_count_df["Top 50"].sum())
print("Bottom 50 Phrase Avg:", phrase_count_df["Bottom 50"].mean())
print("Bottom 50 Phrase Total:", phrase_count_df["Bottom 50"].sum())
print("Top 50 Augmented Phrase Avg:", phrase_count_df["Top 50 Augmented"].mean())
print("Top 50 Augmented Phrase Total:", phrase_count_df["Top 50 Augmented"].sum())
print("Bottom 50 Augmented Phrase Avg:", phrase_count_df["Bottom 50 Augmented"].mean())
print("Bottom 50 Augmented Phrase Total:", phrase_count_df["Bottom 50 Augmented"].sum())

Top 50 Phrase Avg: 0.104
Top 50 Phrase Total: 26
Bottom 50 Phrase Avg: 0.344
Bottom 50 Phrase Total: 86
Top 50 Augmented Phrase Avg: 0.192
Top 50 Augmented Phrase Total: 48
Bottom 50 Augmented Phrase Avg: 0.352
Bottom 50 Augmented Phrase Total: 88


In [18]:
neg_sents = []
for sentence in sentences_df["Top 50 Augmented"]:
    for text in sentence:
        # print unique occurrences of phrase
        if phrase.search(text):
            # if text not in neg_sents:
            neg_sents.append(text)
neg_sents   
            

['“I will not ever try to touch something so big again!”',
 'The girl did not ever want to drop ice again.',
 'Mia did not ever want to be in this place again.',
 'Mia did not ever want to be in this place again.',
 'Mia did not ever want to be in this place again.',
 'Max had not seen it ever before!',
 'Max had not seen it ever before!',
 "But don't ever touch that button again.",
 'Mia did not ever want to be in this place again.',
 'Max had not seen it ever before!',
 'Max had not seen it ever before!',
 'Max had not seen it ever before!',
 "But don't ever touch that button again.",
 'Max had not seen it ever before!',
 'Annie waved goodbye and she was sure she would not ever forget this amazing creature',
 'Annie waved goodbye and she was sure she would not ever forget this amazing creature',
 'Mia did not ever want to be in this place again.',
 'I didn\'t think I\'d ever get it to the top by myself."',
 '“I will not ever try to touch something so big again!”',
 'He decided that h

In [19]:
for sentence in sentences_df["Bottom 50 Augmented"]:
    for text in sentence:
        if phrase.search(text):
            print(text)
    

She did not want to see Max ever again.
They did not want to see the judge ever again.
Lucy was not anxious anymore, and they lived happily ever after.
He said they could not play with the microscope ever again.
He said they could not play with the microscope ever again.
They are not the best toys ever.
She smiled and promised not to be naughty ever again.
He helped the bear and was not scared ever again.
They are not the best toys ever.
They are not the best toys ever.
They are not the best toys ever.
They are not the best toys ever.
They have not been played with ever."
He said they could not play with the microscope ever again.
I do not want to fight ever again!"
She did not want to see Max ever again.
She smiled and promised not to be naughty ever again.
He said they could not play with the microscope ever again.
I do not want to fight ever again!"
The bear couldn't sing ever again.
He helped the bear and was not scared ever again.
He learned his lesson and never tried to take some

In [20]:
for sentence in sentences_df["Bottom 50"]:
    for text in sentence:
        if phrase.search(text):
            print(text)

She did not want to see Max ever again.
They did not want to see the judge ever again.
Lucy was not anxious anymore, and they lived happily ever after.
He said they could not play with the microscope ever again.
He said they could not play with the microscope ever again.
They are not the best toys ever.
She smiled and promised not to be naughty ever again.
He helped the bear and was not scared ever again.
They are not the best toys ever.
They are not the best toys ever.
They are not the best toys ever.
They are not the best toys ever.
They have not been played with ever."
He said they could not play with the microscope ever again.
I do not want to fight ever again!"
She did not want to see Max ever again.
She smiled and promised not to be naughty ever again.
He said they could not play with the microscope ever again.
I do not want to fight ever again!"
The bear couldn't sing ever again.
He helped the bear and was not scared ever again.
He learned his lesson and never tried to take some

In [41]:
# types of NPI with ever: sentential negation, superlative, if/whether, question, subject negated