## Imports.

In [None]:
## Imports.
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
import torch

sns.set() #set Seaborn theme on plots.

from datetime import date
from transformers.models.bert import BertForMaskedLM
from transformers.models.roberta import RobertaForMaskedLM, RobertaTokenizerFast
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizer
from transformers import FillMaskPipeline

## Constants & Directories.

In [None]:
## Hyperparameters.
# Request GPU execution; the Torch setup cell falls back to the CPU when CUDA is unavailable.
USE_GPU = True

## Models.
# Model identifiers handed to `from_pretrained()` by the helper functions.

# BabyBERTa.
# The authors made 3 of the 10 random initializations of the model publicly available.
MODELS = ['phueb/BabyBERTa-1', 
          'phueb/BabyBERTa-2', 
          'phueb/BabyBERTa-3', 
          'bert-base-cased', 
          'bert-large-cased', 
          'roberta-base',
          'roberta-large',
          "nyu-mll/roberta-base-10M-1",
          "nyu-mll/roberta-base-10M-2",
          "nyu-mll/roberta-base-10M-3",
        ] 
# Relative paths under saved_models/ (presumably locally trained BabyBERTa checkpoints -- confirm).
SAVED_MODELS = ['saved_models/BabyBERTa_AO-CHILDES',
                'saved_models/BabyBERTa_AO-CHILDES_standard_masking',
                'saved_models/BabyBERTa_AO-CHILDES+AO-Newselsa+Wikipedia-1',
                'saved_models/BabyBERTa_AO-Newsela',
                'saved_models/BabyBERTa_Wikipedia-1',
               ]

## Data.

# CHILDES (a single text file, read line by line).
CHILDES = './data/CHILDES/aochildes.txt'

# BLiMP.
BLIMP = './data/BLiMP/' # data across many files.

# Zorro (directories of .txt files).
ZORRO = './data/Zorro/'
ZORRO_SCRAMBLED = './data/Zorro_scrambled/'

## Output files (CSV paths, despite the "directory" wording used elsewhere).
BABYBERTA_CHILDES_OUT = './output/CHILDES/babyberta_childes_mlm.csv'
BABYBERTA_BLIMP_OUT = './output/BLiMP/babyberta_blimp_mlm.csv'
BABYBERTA_SCRAMBLED_ZORRO_OUT = './output/Zorro_scrambled/babyberta_zorro_scrambled_mlm.csv'

ALL_MODELS_BLIMP_OUT = './output/BLiMP/all_models_blimp_mlm.csv'
ALL_MODELS_ZORRO_OUT = './output/Zorro/all_models_zorro_mlm.csv'

## Torch Setup

In [None]:
## Choose the Torch device: CUDA when it was requested and is present, otherwise the CPU.
if not (USE_GPU and torch.cuda.is_available()):
    # Either the GPU was not requested or CUDA cannot be used on this machine.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('Will use the GPU:', torch.cuda.get_device_name(0))

## Helper Functions

In [None]:
## Helper function to apply Masked Language Modeling to an entire sentence (pseudo log probability).
def pseudo_log_prob(sentence, model, tokenizer, include_unigram=True, device="cpu"):
    '''
    Score a sentence with a masked language model via its pseudo-log-likelihood (PLL).

    Each token between the first and last position of the tokenized sentence is replaced,
    one at a time, by the tokenizer's mask token.  The model is queried for the
    log-probability of the original token at that position, and the log-probabilities are
    summed over the sentence.  See Salazar et al. 2019 for the method.

    Parameters:
        sentence: the text to score.
        model: a callable masked-LM; it is called as model(**encoding, return_dict=True) and
            its `.logits` are read as batch_size x sequence_length x vocab_size.
        tokenizer: a callable returning an encoding with an "input_ids" entry and a `.to()`
            method; it must expose `mask_token_id` and `convert_ids_to_tokens()`.
        include_unigram: when True, also return the per-token probabilities.
        device: device on which the encoding and the accumulator are placed.

    Returns:
        dict with keys 'sentence' and 'pseudoLogProb' and, if include_unigram is True,
        'mlmUnigramProb' (a list of single-entry {token: probability tensor} dicts).
    '''
    logp_sentence = torch.tensor([0], dtype=torch.float64, device=device)
    unigram_prob = []
    # Tokenize once; the same encoding is masked and restored position by position.
    sent_tokenized = tokenizer(sentence, return_tensors="pt").to(device)
    for i in range(1, len(sent_tokenized["input_ids"][0])-1): # Skip the first and last (<start>/<end>) tokens (vary by model).
        # Grab the original token before masking.
        input_id = sent_tokenized["input_ids"][0, i].clone() # Need to copy the token_id tensor.

        # Apply the <mask> token at the current position.
        sent_tokenized["input_ids"][0, i] = tokenizer.mask_token_id

        # Work in log space directly: log_softmax stays finite where log(softmax(x))
        # underflows to -inf for very unlikely tokens.
        # Note: outputs have shape batch_size x sequence_length x vocab_size.
        with torch.no_grad():
            log_probs = model(**sent_tokenized, return_dict=True).logits.log_softmax(dim=2)
        token_logp = log_probs[0, i, input_id]

        # Remove the <mask> token from the input for the next iteration.
        sent_tokenized["input_ids"][0, i] = input_id

        # Aggregate.
        logp_sentence += token_logp

        if not include_unigram:
            continue
        token = tokenizer.convert_ids_to_tokens([input_id])[0]
        # Stored as a probability (not a log-probability), as before.
        unigram_prob.append({token: token_logp.exp()})

    # Unpack the pyTorch tensor into a plain NumPy scalar.
    logp_sentence = logp_sentence.to("cpu").detach().numpy()[0]
    output = {'sentence':sentence,
            'pseudoLogProb':logp_sentence}
    if include_unigram:
        output['mlmUnigramProb'] = unigram_prob

    return output

In [None]:
## Helper function to apply pseudo_log_prob to an entire DataFrame.
def batch_pll(model_name, sentence_df, sent_col, include_unigram=False, device="cpu"):
    '''
    Helper function to apply pseudo_log_prob to an entire DataFrame.

    Loads the model and tokenizer matching model_name, scores every entry of
    sentence_df[sent_col] with pseudo_log_prob(), and releases the model afterwards.

    Parameters:
        model_name: identifier or path passed to from_pretrained(); the loader classes are
            chosen by substring ("BabyBERTa", "roberta-base-10M-1", "roberta", "bert").
        sentence_df: DataFrame holding the sentences.
        sent_col: name of the column of sentence_df to score.
        include_unigram: forwarded to pseudo_log_prob().
        device: device the model and the inputs are moved to.

    Returns:
        A DataFrame with one row per sentence (in input order, fresh 0..n-1 index) built
        from the dicts returned by pseudo_log_prob(), or None when model_name matches no
        known model family.
    '''
    # Instantiate model with tokenizer.
    # The order of the checks matters: "bert" is a substring of "roberta".
    if "BabyBERTa" in model_name:
        model = RobertaForMaskedLM.from_pretrained(model_name)
        tokenizer = RobertaTokenizerFast.from_pretrained(model_name,
                                          add_prefix_space=True,  # this must be added to produce intended behavior
                                          )
    elif "roberta-base-10M-1" in model_name:
        model = AutoModelForMaskedLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    elif "roberta" in model_name:
        model = RobertaForMaskedLM.from_pretrained(model_name)
        tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
    elif "bert" in model_name:
        model = BertForMaskedLM.from_pretrained(model_name)
        tokenizer = BertTokenizer.from_pretrained(model_name)
    else:
        print("No valid model found!  Terminating execution...")
        return

    # Send the model to the GPU if chosen.
    model.to(device)

    ## Get MLM probability for each sentence in the sentences DataFrame.
    # A plain loop replaces np.vectorize, which (without `otypes`) scores the first
    # sentence an extra time to infer the output type and raises on empty input.
    records = [
        pseudo_log_prob(sentence, model, tokenizer, include_unigram=include_unigram, device=device)
        for sentence in sentence_df[sent_col]
    ]
    results = pd.DataFrame(records)

    # Cleanup.
    del model
    del tokenizer
    torch.cuda.empty_cache()

    return results

In [None]:
# Helper function to run all the models through a test set in minimal pair (BLiMP) format.
def test_models(model_names, test_set, 
                good_sent_col="sentence_good",
                bad_sent_col="sentence_bad", 
                output_dir=None,
                device="cpu"):
    '''
    Run every model over a test set in minimal-pair (BLiMP) format.

    For each model, both sentence columns are scored with batch_pll() and the scores are
    attached to a copy of test_set as "pseudoLogProb_good" / "pseudoLogProb_bad", together
    with the model name, the original row index ("orig_min_pair_idx") and today's date.

    Parameters:
        model_names: iterable of model identifiers or paths understood by batch_pll().
        test_set: DataFrame with one minimal pair per row.
        good_sent_col / bad_sent_col: names of the grammatical / ungrammatical columns.
        output_dir: path of a CSV file (despite the name).  If the file already exists, its
            rows are loaded first and new results are appended; the file is rewritten
            after every model so partial results survive a crash.  None disables export.
        device: device passed on to batch_pll().

    Returns:
        DataFrame with the stacked results of all models (plus any previously saved rows).
    '''
    if output_dir is not None and os.path.isfile(output_dir):
        all_models = pd.read_csv(output_dir)
    else:
        all_models = pd.DataFrame(columns=(list(test_set.columns) + ["model", "orig_min_pair_idx", "date"]))

    for model in model_names:
        print(f"Now testing {model}...")

        # batch_pll() returns None when it does not recognise the model name; skip such
        # models instead of crashing on the merge below.
        model_good = batch_pll(model, test_set, good_sent_col, device=device)
        if model_good is None:
            print(f"ERROR! Skipping... {model}")
            continue
        model_bad = batch_pll(model, test_set, bad_sent_col, device=device)
        if model_bad is None:
            print(f"ERROR! Skipping... {model}")
            continue

        # Stitch together the two dataframes.
        model_test = test_set.merge(model_good["pseudoLogProb"], 
                                    left_index=True, 
                                    right_index=True, 
                                    copy=False, 
                                    validate="one_to_one")
        model_test = model_test.merge(model_bad["pseudoLogProb"],
                                      suffixes=("_good", "_bad"),    # Should produce something like "pseudoLogProb_good", "pseudoLogProb_bad"...
                                      left_index=True, 
                                      right_index=True, 
                                      copy=False, 
                                      validate="one_to_one")

        # Get model name.
        sep = model.rfind("/") # some model names may be paths.
        model_name = model[sep + 1:] # if there is no "/", rfind() returns -1 so the slice is just [0:]...

        # Tag which model produced this data.
        model_test["model"] = model_name

        # Preserve the original index as a column.  rename_axis() returns a new object,
        # so an index object possibly shared with test_set is never renamed in place.
        model_test = model_test.rename_axis("orig_min_pair_idx").reset_index()

        # Timestamp.
        model_test["date"] = date.today()

        # Stack the dataframes.
        all_models = pd.concat([all_models, model_test], ignore_index=True)

        # Export in case something goes wrong...
        if output_dir is not None:
            out_parent = os.path.dirname(output_dir)
            if out_parent:
                # Do not lose a finished model run to a missing output folder.
                os.makedirs(out_parent, exist_ok=True)
            all_models.to_csv(output_dir, index=False)

    return all_models

In [None]:
def results_by_phenomenon(output_df, model_col="model", phenomenon_col="linguistics_term", correct_col="correct"):
    '''
    Tabulate mean correctness per model and per phenomenon.

    Parameters:
        output_df: DataFrame with one row per evaluated item.
        model_col: column identifying the model.
        phenomenon_col: column identifying the phenomenon.
        correct_col: column whose mean is reported.

    Returns:
        DataFrame indexed by model with one column per phenomenon, both in order of
        first appearance in output_df.  Combinations with no rows are NaN.
    '''
    models = output_df[model_col].unique()
    phenomena = output_df[phenomenon_col].unique()

    if output_df.empty:
        return pd.DataFrame(index=models, columns=phenomena)

    # Aggregate with groupby instead of assigning into rows yielded by iterrows():
    # those rows may be copies, in which case the assignments never reach the frame.
    means = output_df.groupby([model_col, phenomenon_col])[correct_col].mean()
    results = means.unstack(level=-1).reindex(index=models, columns=phenomena)

    # Drop the axis names added by groupby/unstack so the frame is labelled as before.
    results.index.name = None
    results.columns.name = None

    return results

## Preprocessing & Data Loading.
I want each of the datasets in its own DataFrame.  This may be especially tricky for the BLiMP dataset because it is split across so many files.

In [None]:
# BLiMP.
# glob returns paths in arbitrary order; sort them so the row order (and therefore the
# index later saved as "orig_min_pair_idx") is the same on every run and machine.
all_blimp_files = sorted(glob.glob(os.path.join(BLIMP, "*.jsonl")))
blimp = pd.concat((pd.read_json(f, lines=True) for f in all_blimp_files), ignore_index=True)

In [None]:
blimp

In [None]:
# CHILDES: one row per line of the text file, with trailing whitespace removed.
with open(CHILDES) as f:
    lines = []
    for raw_line in f:
        lines.append(raw_line.rstrip())
childes = pd.DataFrame({'sentence':lines})

In [None]:
# Zorro.
# Every file is a list of sentences arranged in minimal pairs: the bad sentence comes
# first and the good one second.  The file name encodes "<phenomenon>-<paradigm>.txt".
all_zorro_files = sorted(glob.glob(os.path.join(ZORRO, "*.txt")))  # sorted: glob order is arbitrary.
zorro_columns = ["sentence_good", "sentence_bad", "phenomenon", "paradigm"]
zorro_subsets = []
for f in all_zorro_files:
    # Read the file as plain lines rather than as CSV, so commas or quotes inside a
    # sentence cannot split or merge rows.  Blank lines are ignored.
    with open(f, encoding="utf-8") as zorro_file:
        sentences = [line.rstrip("\r\n") for line in zorro_file]
    sentences = [s for s in sentences if s.strip()]

    # Split into good and bad sentences.
    bad = sentences[::2]   # every other line, starting from the first one.
    good = sentences[1::2] # every other line, starting from the 2nd one.
    n_pairs = min(len(good), len(bad))  # an unpaired trailing line is dropped.
    if n_pairs == 0:
        continue

    # Strip the directory and the extension in an OS-independent way.
    annotation = os.path.splitext(os.path.basename(f))[0]

    # The last hyphen always separates phenomenon from paradigm.
    name_parts = annotation.rpartition('-')
    phenomenon = name_parts[0]
    paradigm = name_parts[2]

    # Build the annotated subset so that it mirrors BLiMP.
    zorro_subsets.append(pd.DataFrame({"sentence_good": good[:n_pairs],
                                       "sentence_bad": bad[:n_pairs],
                                       "phenomenon": phenomenon,
                                       "paradigm": paradigm}))

# Concatenate once at the end instead of growing the DataFrame inside the loop.
if zorro_subsets:
    zorro = pd.concat(zorro_subsets, ignore_index=True)
else:
    zorro = pd.DataFrame(columns=zorro_columns)

In [None]:
zorro

## Load the model.

In [None]:
# Load the first BabyBERTa checkpoint together with its tokenizer for the single-sentence tests.
model = RobertaForMaskedLM.from_pretrained(MODELS[0])
tokenizer = RobertaTokenizerFast.from_pretrained(
    MODELS[0],
    add_prefix_space=True,  # this must be added to produce intended behavior
)

In [None]:
## Send the model to the GPU if available.
model.to(device)  # This shouldn't do anything if no GPU was available (device="cpu").

## Single sentence tests
<b>Note:</b> The RoBERTa tokenizer by design uses the _Ġ_ character to mark a preceding whitespace.  Notice that only whole-word tokens and word-initial subword pieces have the _Ġ_ marker prepended.  The default BERT tokenizer instead marks word-internal (continuation) pieces with a ## prefix.  This is supposedly an engineering trick to improve performance.

Sources:
- https://stackoverflow.com/questions/61134275/difficulty-in-understanding-the-tokenizer-used-in-roberta-model
- https://github.com/openai/gpt-2/issues/80

In [None]:
pseudo_log_prob("The even ones are the grammatical ones.", model, tokenizer)

In [None]:
pseudo_log_prob("The even ones is the grammatical ones.", model, tokenizer)

## Run inference on all the data.
I will start by deleting the model and tokenizer from earlier.  RAM and VRAM are limited, and the helper functions should abstract the loading and unloading of all the components already.

In [None]:
## Cleanup from the earlier inference...
# NOTE(review): `del` only removes these two names; the memory is released once no other
# reference to the objects remains (a notebook output cache may still hold one -- confirm).
del tokenizer
del model

# I want to get these off memory.  
# The helper function should take care of loading and unloading models.

In [None]:
babyberta_childes = batch_pll(MODELS[0], childes, "sentence", device=device)

In [None]:
babyberta_blimp_good = batch_pll(MODELS[0], blimp, "sentence_good", device=device)

In [None]:
babyberta_blimp_bad = batch_pll(MODELS[0], blimp, "sentence_bad", device=device)

## Export.

In [None]:
# Merge the BLiMP good and bad sentences for exporting.
# "mlmUnigramProb" only exists when batch_pll() was run with include_unigram=True, so
# keep only the score columns that both result frames actually contain.
new_cols = [col for col in ["pseudoLogProb", "mlmUnigramProb"]
            if col in babyberta_blimp_good.columns and col in babyberta_blimp_bad.columns]
babyberta_blimp = blimp.merge(babyberta_blimp_good[new_cols], 
                              left_index=True, 
                              right_index=True, 
                              copy=False, 
                              validate="one_to_one")
babyberta_blimp = babyberta_blimp.merge(babyberta_blimp_bad[new_cols],
                                        suffixes=("_good", "_bad"),    # Should produce something like "pseudoLogProb_good", "pseudoLogProb_bad"...
                                        left_index=True, 
                                        right_index=True, 
                                        copy=False, 
                                        validate="one_to_one")

In [None]:
# Export.
# Create the target folders first: to_csv() does not create missing directories.
for out_path in (BABYBERTA_CHILDES_OUT, BABYBERTA_BLIMP_OUT):
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
babyberta_childes.to_csv(BABYBERTA_CHILDES_OUT, index=False)
babyberta_blimp.to_csv(BABYBERTA_BLIMP_OUT, index=False)

## Preliminary Analyses.

In [None]:
## How did BabyBERTa do on BLiMP?
# A pair is correct (1) when the good sentence gets the strictly higher pseudo-log-likelihood.
babyberta_blimp_correct = (babyberta_blimp.pseudoLogProb_good > babyberta_blimp.pseudoLogProb_bad).astype(int)
# The mean of the 0/1 column is the accuracy.  (The original referenced an undefined
# name, `blimp_correct`, here and in the plot call below.)
babyberta_blimp_avg_perf = babyberta_blimp_correct.mean()
print(f"Overall {MODELS[0]} performance on BLiMP:{babyberta_blimp_correct.sum()}/{babyberta_blimp_correct.size} ({babyberta_blimp_avg_perf}) \n")

# Plot.
fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(babyberta_blimp, x="linguistics_term", y=babyberta_blimp_correct, errorbar=None, ax=ax)
ax.bar_label(ax.containers[-1], fmt='Mean:\n%.2f', label_type='edge') # Label the bars.
# Include overall average performance.
ax.axhline(y=babyberta_blimp_avg_perf, label='BabyBERTA-1 mean BLiMP perf.', linestyle='--');

# Styling and labeling.
ax.set_ylim([0.0, 1.0])
plt.xticks(rotation=45, ha='right');
ax.legend();
ax.set_title("BabyBERTa per-phenomenon performance on BLiMP")

plt.show()


## Custom distractors.
I will take sentences from the `distractor_agreement_relation_noun.json` and `distractor_agreement_relative_clause.json` files and modify them to make them not quite as easy as they are.

In [None]:
## But first...
# UIDs of the two BLiMP distractor-agreement paradigms examined below.
sv_agreement_phenomena = ["distractor_agreement_relation_noun", "distractor_agreement_relative_clause"]
# NOTE: I am querying for the FILES and not what the authors listed as a SUBJECT VERB AGREEMENT PHENOMENON.
# .copy() makes the subset an independent frame, so adding columns later does not touch babyberta_blimp.
babyberta_sv_agreement = babyberta_blimp.loc[(babyberta_blimp.UID.isin(sv_agreement_phenomena))].copy()

In [None]:
# Let me filter down the data to make it manageable.
# Keep the paradigm id, both sentences with their pseudo-log-likelihoods, and two BLiMP metadata columns.
relevant_sv_agree_columns = ["UID", "sentence_good", "pseudoLogProb_good", "sentence_bad", "pseudoLogProb_bad", "linguistics_term", "lexically_identical",]
babyberta_sv_agreement = babyberta_sv_agreement[relevant_sv_agree_columns]

In [None]:
# Add a correctness column.
# 1 when the good sentence has the strictly higher pseudo-log-likelihood, else 0 (ties count as wrong).
babyberta_sv_agreement["correct"] = (babyberta_sv_agreement.pseudoLogProb_good > babyberta_sv_agreement.pseudoLogProb_bad).astype(int)
print("How well did BabyBERTa do on these off the bat?")
# Number of pairs per correctness value.
babyberta_sv_agreement.correct.value_counts()

In [None]:
print("I would like to see a few examples it got right:")
sv_correct_rows = babyberta_sv_agreement.loc[babyberta_sv_agreement.correct == 1]
# sample() raises when asked for more rows than exist, so cap the request at the subset size.
for idx, row in sv_correct_rows.sample(min(10, len(sv_correct_rows))).iterrows():
    print(f"|------------Row #{idx}--------------")
    print(f"| {idx}a. {row.sentence_good} | good sentence PLL: {row.pseudoLogProb_good}") 
    print(f"| *{idx}b. {row.sentence_bad} | bad sentence PLL: {row.pseudoLogProb_bad}") 
    print("############################################################")

In [None]:
print("... and some it got wrong:")
sv_wrong_rows = babyberta_sv_agreement.loc[babyberta_sv_agreement.correct == 0]
# sample() raises when asked for more rows than exist, so cap the request at the subset size.
for idx, row in sv_wrong_rows.sample(min(10, len(sv_wrong_rows))).iterrows():
    print(f"|------------Row #{idx}--------------")
    print(f"| {idx}a. {row.sentence_good} | good sentence PLL: {row.pseudoLogProb_good}") 
    print(f"| *{idx}b. {row.sentence_bad} | bad sentence PLL: {row.pseudoLogProb_bad}") 
    print("############################################################")

## Large scale simulation.

In [None]:
all_models_blimp = test_models(SAVED_MODELS + MODELS, blimp, output_dir=ALL_MODELS_BLIMP_OUT, device=device)

In [None]:
# Stored as `all_models_zorro` because that is the name the Zorro analysis cell reads;
# the run covers every model, not only BabyBERTa.
all_models_zorro = test_models(SAVED_MODELS + MODELS, zorro, output_dir=ALL_MODELS_ZORRO_OUT, device=device)

## Analyze the results.

In [None]:
# Fixed display orders for models and phenomena.
# NOTE(review): none of these lists is referenced by the visible cells -- presumably intended for ordering plots; confirm.
ordered_models = ["phueb/BabyBERTa-1", "phueb/BabyBERTa-2", "phueb/BabyBERTa-3", "bert-base-cased", "bert-large-cased", "roberta-base", "roberta-large"]
ordered_blimp_phenomena = ["anaphor_agreement", "argument_structure", "binding", "control_raising", "determiner_noun_agreement", "ellipsis", "filler_gap_dependency", "irregular_forms", "island_effects", "npi_licensing", "quantifiers", "subject_verb_agreement"]
ordered_zorro_phenomena = ["anaphor_agreement", "argument_structure", "binding", "case", "determiner_noun_agreement", "ellipsis", "filler_gap_dependency", "irregular_forms", "island_effects", "local_attractor", "npi_licensing", "quantifiers", "subject_verb_agreement"]

In [None]:
## How did the BERT models do on BLiMP?
# A pair is correct (1) when the good sentence gets the strictly higher pseudo-log-likelihood.
all_models_blimp["correct"] = (all_models_blimp.pseudoLogProb_good > all_models_blimp.pseudoLogProb_bad).astype(int)
for model in all_models_blimp.model.unique():
    subset = all_models_blimp.loc[all_models_blimp.model == model]
    # len(subset) is the number of pairs; DataFrame.size would be rows x columns.
    print(f"Overall {model} performance on BLiMP:" +\
          f"{subset.correct.sum()}/{len(subset)} ({np.mean(subset.correct)}) \n")

# Plot.
fig, ax = plt.subplots(figsize=(30, 10))
sns.barplot(all_models_blimp, x="linguistics_term", y="correct", hue="model", errorbar=None, ax=ax)
#ax.bar_label(ax.containers[-1], fmt='Mean:\n%.2f', label_type='edge') # Label the bars.
# Include overall average performance.
#ax.axhline(y=babyberta_blimp_avg_perf, label='BabyBERTA-1 mean BLiMP perf.', linestyle='--');

# Styling and labeling.
ax.set_ylim([0.0, 1.0])
plt.xticks(rotation=45, ha='right');
ax.legend();
ax.set_title("BERT-derived models' per-phenomenon performance on BLiMP")

plt.show()

In [None]:
## How did the BERT models do on Zorro?
# A pair is correct (1) when the good sentence gets the strictly higher pseudo-log-likelihood.
all_models_zorro["correct"] = (all_models_zorro.pseudoLogProb_good > all_models_zorro.pseudoLogProb_bad).astype(int)
for model in all_models_zorro.model.unique():
    subset = all_models_zorro.loc[all_models_zorro.model == model]
    # len(subset) is the number of pairs; DataFrame.size would be rows x columns.
    print(f"Overall {model} performance on Zorro:" +\
          f"{subset.correct.sum()}/{len(subset)} ({np.mean(subset.correct)}) \n")

# Plot.
fig, ax = plt.subplots(figsize=(30, 10))
sns.barplot(all_models_zorro, x="phenomenon", y="correct", hue="model", errorbar=None, ax=ax)
#ax.bar_label(ax.containers[-1], fmt='Mean:\n%.2f', label_type='edge') # Label the bars.
# Include overall average performance.
#ax.axhline(y=babyberta_blimp_avg_perf, label='BabyBERTA-1 mean BLiMP perf.', linestyle='--');

# Styling and labeling.
ax.set_ylim([0.0, 1.0])
plt.xticks(rotation=45, ha='right');
ax.legend();
ax.set_title("BERT-derived models' per-phenomenon performance on Zorro")

plt.show()

## Scrambling Experiment

In [None]:
# Helper function to assemble Zorro.
def assemble_zorro(zorro_directory):
    '''
    Assemble all Zorro-style files of a directory into one BLiMP-like DataFrame.

    Every "*.txt" file in zorro_directory is read as a list of sentences arranged in
    minimal pairs (bad sentence first, good sentence second).  The file name, without
    directory and extension, is split at its last hyphen into phenomenon and paradigm.

    Parameters:
        zorro_directory: directory containing the "<phenomenon>-<paradigm>.txt" files.

    Returns:
        DataFrame with the columns "sentence_good", "sentence_bad", "phenomenon" and
        "paradigm", one row per minimal pair, files taken in sorted order.  The frame is
        empty (same columns) when no file is found.
    '''
    columns = ["sentence_good", "sentence_bad", "phenomenon", "paradigm"]
    subsets = []
    # Sort the paths: glob returns them in arbitrary order.
    for path in sorted(glob.glob(os.path.join(zorro_directory, "*.txt"))):
        # Read plain lines rather than CSV, so commas or quotes inside a sentence cannot
        # split or merge rows.  Blank lines are ignored.
        with open(path, encoding="utf-8") as handle:
            sentences = [line.rstrip("\r\n") for line in handle]
        sentences = [s for s in sentences if s.strip()]

        bad = sentences[::2]    # every other line, starting from the first one.
        good = sentences[1::2]  # every other line, starting from the 2nd one.
        n_pairs = min(len(good), len(bad))  # an unpaired trailing line is dropped.
        if n_pairs == 0:
            continue

        # Strip the directory and the extension in an OS-independent way.
        annotation = os.path.splitext(os.path.basename(path))[0]

        # The last hyphen always separates phenomenon from paradigm.
        phenomenon, _, paradigm = annotation.rpartition('-')

        subsets.append(pd.DataFrame({"sentence_good": good[:n_pairs],
                                     "sentence_bad": bad[:n_pairs],
                                     "phenomenon": phenomenon,
                                     "paradigm": paradigm}))

    if not subsets:
        return pd.DataFrame(columns=columns)
    # Concatenate once instead of growing the DataFrame inside the loop.
    return pd.concat(subsets, ignore_index=True)

In [None]:
# Run BabyBERTa models on scrambled Zorro data.
models_to_test = ['saved_models/BabyBERTa_AO-CHILDES',
                  'saved_models/BabyBERTa_AO-CHILDES_standard_masking',
                 ]

seed_count = 0 # For keeping track of the different scrambled Zorro runs.
# os.listdir() returns entries in arbitrary order; sorting makes the seed numbering reproducible.
for seed in sorted(os.listdir(ZORRO_SCRAMBLED)):
    zorro_dir = os.path.join(ZORRO_SCRAMBLED, seed)

    # Skip hidden entries and anything that is not a folder of scrambled files.
    if seed.startswith('.') or not os.path.isdir(zorro_dir):
        continue

    # load Scrambled Zorro.
    zorro_scrambled = assemble_zorro(zorro_dir)

    # Tag this particular scrambled dataset.
    zorro_scrambled["scrambling_seed"] = seed_count
    seed_count += 1 # update the seed counter.

    # Run the simulation.  Results accumulate in the same CSV across seeds.
    babyberta_zorro = test_models(models_to_test, 
                                  zorro_scrambled, 
                                  output_dir=BABYBERTA_SCRAMBLED_ZORRO_OUT, 
                                  device=device)

In [None]:
os.listdir(ZORRO_SCRAMBLED)

In [None]:
# `isdir` lives in os.path; the os module itself has no such function.
os.path.isdir(os.path.join(ZORRO_SCRAMBLED, "0"))

In [None]:
os.path.join(os.path.join(ZORRO_SCRAMBLED, "0"), "*.txt")

In [None]:
# `zorro_directory` only exists as a parameter inside assemble_zorro(); at top level,
# inspect the pattern for the last folder visited by the scrambling loop instead.
os.path.join(zorro_dir, "*.txt")