In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
import nltk
from collections import Counter
from tqdm.notebook import tqdm
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset
from transformers import pipeline, BertTokenizer

In [2]:
# BLiMP paradigm names; each one is passed as the `name=` config when loading
# the dataset. The list is kept in alphabetical order.
paradigms = [
    "adjunct_island",
    "anaphor_gender_agreement",
    "anaphor_number_agreement",
    "animate_subject_passive",
    "animate_subject_trans",
    "causative",
    "complex_NP_island",
    "coordinate_structure_constraint_complex_left_branch",
    "coordinate_structure_constraint_object_extraction",
    "determiner_noun_agreement_1",
    "determiner_noun_agreement_2",
    "determiner_noun_agreement_irregular_1",
    "determiner_noun_agreement_irregular_2",
    "determiner_noun_agreement_with_adj_2",
    "determiner_noun_agreement_with_adj_irregular_1",
    "determiner_noun_agreement_with_adj_irregular_2",
    "determiner_noun_agreement_with_adjective_1",
    "distractor_agreement_relational_noun",
    "distractor_agreement_relative_clause",
    "drop_argument",
    "ellipsis_n_bar_1",
    "ellipsis_n_bar_2",
    "existential_there_object_raising",
    "existential_there_quantifiers_1",
    "existential_there_quantifiers_2",
    "existential_there_subject_raising",
    "expletive_it_object_raising",
    "inchoative",
    "intransitive",
    "irregular_past_participle_adjectives",
    "irregular_past_participle_verbs",
    "irregular_plural_subject_verb_agreement_1",
    "irregular_plural_subject_verb_agreement_2",
    "left_branch_island_echo_question",
    "left_branch_island_simple_question",
    # "matrix_question_npi_licensor_present",  (deliberately left out of the run)
    "npi_present_1",
    "npi_present_2",
    "only_npi_licensor_present",
    "only_npi_scope",
    "passive_1",
    "passive_2",
    "principle_A_c_command",
    "principle_A_case_1",
    "principle_A_case_2",
    "principle_A_domain_1",
    "principle_A_domain_2",
    "principle_A_domain_3",
    "principle_A_reconstruction",
    "regular_plural_subject_verb_agreement_1",
    "regular_plural_subject_verb_agreement_2",
    "sentential_negation_npi_licensor_present",
    "sentential_negation_npi_scope",
    "sentential_subject_island",
    "superlative_quantifiers_1",
    "superlative_quantifiers_2",
    "tough_vs_raising_1",
    "tough_vs_raising_2",
    "transitive",
    "wh_island",
    "wh_questions_object_gap",
    "wh_questions_subject_gap",
    "wh_questions_subject_gap_long_distance",
    "wh_vs_that_no_gap",
    "wh_vs_that_no_gap_long_distance",
    "wh_vs_that_with_gap",
    "wh_vs_that_with_gap_long_distance",
]

# Abbreviations of the linguistic phenomenon names, used for more compact display.
# Keys are matched against the dataset's `linguistics_term` values.
phenomena = dict(
    anaphor_agreement="ANA AGR",
    argument_structure="ARG STR",
    binding="BINDING",
    control_raising="CTRL RAIS",
    determiner_noun_agreement="D-N AGR",
    ellipsis="ELLIPSIS",
    filler_gap_dependency="FILLER. GAP",
    irregular_forms="IRREGULAR",
    island_effects="ISLAND",
    npi_licensing="NPI",
    quantifiers="QUANTIFIERS",
    subject_verb_agreement="S-V AGR",
)

In [None]:
# Load every BLiMP paradigm and stack the pieces into one dataframe.
# The per-paradigm frames are collected in a list and concatenated once after
# the loop: calling pd.concat inside the loop re-copies all previously loaded
# rows on every iteration.
paradigm_frames = []

for position, paradigm in enumerate(paradigms, start=1):
    # Load the dataset for the current paradigm (its 'train' split is read below)
    subset = load_dataset('nyu-mll/BLiMP', name=paradigm)

    # Convert to a dataframe; the dataset's UID column becomes the 'paradigm' column
    subset_df = pd.DataFrame(subset['train']).rename(columns={'UID': 'paradigm'})

    # Abbreviated phenomenon label; terms missing from `phenomena` are marked 'EMPTY'
    subset_df['phenomenon'] = subset_df['linguistics_term'].map(phenomena).fillna('EMPTY')
    paradigm_frames.append(subset_df)

    # Single-line progress report (\r rewinds, the padding erases a longer previous name)
    print(f"\rLoading {paradigm}: {(position / len(paradigms)) * 100:.2f}%{' ' * 30}", end='')

# One concat for everything; fall back to an empty frame if nothing was loaded
blimp_df = pd.concat(paradigm_frames, ignore_index=True) if paradigm_frames else pd.DataFrame()

# Display a few random rows of the combined dataframe
blimp_df.sample(5)

Loading coordinate_structure_constraint_complex_left_branch: 12.12%                              

In [None]:
# Flag minimal pairs whose "good" and "bad" sentences are the very same string;
# such pairs carry no contrast to evaluate. The mask is computed once and reused.
is_identical = blimp_df['sentence_good'] == blimp_df['sentence_bad']
identical_sentences = blimp_df[is_identical]

# Print a report
print(f"Number of identical sentences: {len(identical_sentences)}")
if not identical_sentences.empty:
    print("Details of identical sentences:")
    print(identical_sentences[['sentence_good', 'paradigm']])
else:
    print("No identical sentences found.")

# Drop identical sentences from the dataframe.
# .copy() detaches the result from the unfiltered frame: new columns are
# assigned to blimp_df further down, and writing into a boolean-filtered slice
# triggers pandas' SettingWithCopyWarning.
blimp_df = blimp_df[~is_identical].copy()

In [None]:
# Tokenizer of the multilingual cased BERT checkpoint; it splits the differing
# part of each sentence pair into filler tokens.
# (Use "bert-base-cased" here for the monolingual English model instead.)
MBERT_CHECKPOINT = "bert-base-multilingual-cased"
mbert_tokenizer = BertTokenizer.from_pretrained(MBERT_CHECKPOINT)

def process_tokens(row, tokenizer=None):
    """Turn a minimal pair into a masked template plus the two competing fillers.

    The good and bad sentences are split on whitespace, their shared leading
    and trailing words are kept as context, and the differing middle part is
    replaced by a single ``[MASK]``. The differing words of each sentence are
    then tokenized into the filler token lists.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the string
            fields ``'sentence_good'`` and ``'sentence_bad'``.
        tokenizer: Object with a ``tokenize(str) -> list[str]`` method. When
            omitted, the notebook-level ``mbert_tokenizer`` is used.

    Returns:
        Tuple ``(sentence_masked, good_fillers, bad_fillers)``: the template
        string containing one ``[MASK]`` and the two filler token lists.
    """
    if tokenizer is None:
        tokenizer = mbert_tokenizer

    good_words = row['sentence_good'].split()
    bad_words = row['sentence_bad'].split()
    shortest = min(len(good_words), len(bad_words))

    # Length of the shared prefix
    prefix_len = 0
    while prefix_len < shortest and good_words[prefix_len] == bad_words[prefix_len]:
        prefix_len += 1

    # Length of the shared suffix; it must not overlap the prefix in either sentence
    suffix_len = 0
    while (suffix_len < shortest - prefix_len
           and good_words[-1 - suffix_len] == bad_words[-1 - suffix_len]):
        suffix_len += 1

    # Explicit end indices: a plain [-suffix_len:] would return everything for 0
    common_start = good_words[:prefix_len]
    common_end = good_words[len(good_words) - suffix_len:]
    good_diff = good_words[prefix_len:len(good_words) - suffix_len]
    bad_diff = bad_words[prefix_len:len(bad_words) - suffix_len]

    # Pure insertion/deletion leaves one side with nothing to fill in. Borrow a
    # neighbouring shared word for both sides so each filler is non-empty:
    # preferably the word right after the gap, otherwise the one right before
    # it (the gap is sentence-final, so there is no following word).
    if not good_diff or not bad_diff:
        if common_end:
            borrowed = common_end.pop(0)
            good_diff.append(borrowed)
            bad_diff.append(borrowed)
        elif common_start:
            borrowed = common_start.pop()
            good_diff.insert(0, borrowed)
            bad_diff.insert(0, borrowed)

    sentence_masked = " ".join(common_start + ["[MASK]"] + common_end)

    good_fillers = tokenizer.tokenize(" ".join(good_diff))
    bad_fillers = tokenizer.tokenize(" ".join(bad_diff))

    # Shared trailing token (typically sentence-final punctuation glued to the
    # last word): move it out of the fillers and back into the template. This
    # is only valid when the mask is the last element of the template, because
    # the token is appended to the end of the sentence, and only for a
    # stand-alone token: a '##' continuation piece cannot be written as text.
    if (not common_end
            and len(good_fillers) > 1 and len(bad_fillers) > 1
            and good_fillers[-1] == bad_fillers[-1]
            and not good_fillers[-1].startswith("##")):
        sentence_masked += good_fillers.pop()
        bad_fillers.pop()

    return sentence_masked, good_fillers, bad_fillers

In [None]:
# Split every minimal pair into a masked template plus its good / bad fillers.
# result_type='expand' makes pandas spread the returned 3-tuple over three
# columns directly, instead of building a throw-away pd.Series for every row.
blimp_df[['sentence_masked', 'good_fillers', 'bad_fillers']] = blimp_df.apply(
    process_tokens, axis=1, result_type='expand'
)

# Display a sample of the updated DataFrame
blimp_df[['sentence_masked', 'good_fillers', 'bad_fillers']].sample(10)

In [None]:
# Sanity check: rows for which either filler list came out empty
good_is_empty = blimp_df['good_fillers'].map(len).eq(0)
bad_is_empty = blimp_df['bad_fillers'].map(len).eq(0)
empty_fillers = blimp_df.loc[good_is_empty | bad_is_empty]

# Show the affected rows
display(empty_fillers)

In [None]:
def get_filler_probability(model, sentence, filler):
    """Score a multi-token filler for the ``[MASK]`` slot of ``sentence``.

    The filler is scored one token at a time, left to right: each token is
    scored at the current mask position, then written into the sentence with
    the mask moved directly behind it.

    Args:
        model: Callable invoked as ``model(sentence, targets=[token])``; the
            ``'score'`` of the first returned entry is used as the token's
            probability (the notebook passes a fill-mask pipeline).
        sentence: Template text containing ``[MASK]``.
        filler: Sequence of token strings; a leading ``##`` marks a piece that
            continues the previous token without a space.

    Returns:
        The geometric mean of the token scores, or ``0`` for an empty filler.
    """
    piece_count = len(filler)
    if piece_count == 0:
        return 0

    product = 1
    for position, piece in enumerate(filler):
        product *= model(sentence, targets=[piece])[0]['score']

        # Nothing is scored after the final piece, so the sentence needs no update
        if position + 1 == piece_count:
            break

        # Text form of the piece (continuation marker stripped) ...
        surface = piece[2:] if piece.startswith("##") else piece
        # ... attached directly to the mask when the next piece continues the word
        glue = "" if filler[position + 1].startswith("##") else " "
        sentence = sentence.replace("[MASK]", surface + glue + "[MASK]")

    return product ** (1 / piece_count)

In [None]:
# Fill-mask pipeline used to score the fillers
#bert_cased = pipeline("fill-mask", model="bert-base-cased")
mbert_cased = pipeline("fill-mask", model="bert-base-multilingual-cased")

# different tokenizer needs different handling
#bibert_cased = pipeline("fill-mask", model="jhu-clsp/bibert-ende", tokenizer=BertTokenizer.from_pretrained("jhu-clsp/bibert-ende"))

# Evaluation subset. A fixed seed makes the drawn rows (and therefore the
# saved predictions) reproducible across runs; the size is capped so that a
# dataframe with fewer rows than requested does not make sample() raise.
N_DEPLOY_SAMPLES = 100  # set to None to score the full dataset
DEPLOY_SEED = 42

if N_DEPLOY_SAMPLES is None:
    deploy_df = blimp_df.copy()
else:
    deploy_df = blimp_df.sample(min(N_DEPLOY_SAMPLES, len(blimp_df)), random_state=DEPLOY_SEED)

# Calculate the probabilities for the good and bad fillers
for label in ('good', 'bad'):
    deploy_df[f'bert_{label}_prob'] = [
        get_filler_probability(mbert_cased, masked, fillers)
        for masked, fillers in zip(deploy_df['sentence_masked'], deploy_df[f'{label}_fillers'])
    ]

# Prediction is 1 when the good filler is strictly more probable, otherwise 0
deploy_df['bert_prediction'] = (deploy_df['bert_good_prob'] > deploy_df['bert_bad_prob']).astype(int)

display(deploy_df[['sentence_masked', 'good_fillers', 'bad_fillers', 'bert_good_prob', 'bert_bad_prob', 'bert_prediction']])
# Explicit display: a bare expression in the middle of a cell is never shown
display(deploy_df['bert_prediction'].value_counts())

# Save the dataframe to a CSV file
deploy_df.to_csv("mbert_base_cased_prediction.csv", index=False)