In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import torch
from bert_training import training
from copy import deepcopy
from keras.preprocessing.sequence import pad_sequences
from pathlib import Path
from sklearn.metrics import matthews_corrcoef
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import FillMaskPipeline, TextClassificationPipeline, pipeline
sns.set()

Using TensorFlow backend.


In [2]:
## BERT models.
BERT_BASE = 'bert-base-uncased'
BERT_LARGE = 'bert-large-uncased'
BERT_LARGE_CASED = 'bert-large-cased'
MODELS = [BERT_BASE, BERT_LARGE, BERT_LARGE_CASED]
## Data of interest.
DATA_FILE = Path('./data/li-adger_sentences.csv')
OUT_DIR = Path('./output/bert_mlm_results_li-adger_sentences.csv')

## Random seeds used to train BERT.
RANDOM_SEEDS = [18, 11, 97, 7, 39, 40, 67, 5, 84, 72]

In [3]:
## Load the .csv file with LI & Adger sentences into memory.
sentences = pd.read_csv(DATA_FILE)
print(sentences.head())

  dataset notes typo?                   id  \
0      LI   NaN   NaN  32.1.martin.1a.g.01   
1      LI   NaN   NaN  32.1.martin.1a.g.02   
2      LI   NaN   NaN  32.1.martin.1a.g.03   
3      LI   NaN   NaN  32.1.martin.1a.g.04   
4      LI   NaN   NaN  32.1.martin.1a.g.05   

                                        sentence  sent_length  
0              Kerry attempted to study physics.            7  
1             Jimmy attempted to weave a basket.            8  
2     Brittany attempted to touch the porcupine.            8  
3  Frank attempted to eat a triple fudge sundae.           10  
4                Kat attempted to keep her mail.            8  


In [4]:
## Helper function to apply Masked Language Modeling pipeline to a Pandas DataFrame.
def sentence_mlm_prob(sentence, sent_id, model, tokenizer):
    logp_sentence = 0
    unigram_prob = []
    sent_tokenized = tokenizer.tokenize(sentence)
    for i in range(1, len(sent_tokenized) + 1): # Need to offset because the tokenizer will
        # Mask the tokens in order.               automatically pad the sequence with [CLS] and [SEP].
        token = sent_tokenized[i - 1]
        input_id = tokenizer.convert_tokens_to_ids([token])
        
        # Copy the list of tokens and replace one with <mask>.
        masked_sent = sent_tokenized[:]
        masked_sent[i - 1] = tokenizer.mask_token
        
        # Convert the list of tokens back into a coherent sentence.
        inputs = tokenizer.convert_tokens_to_string(masked_sent)
        #print(inputs)
        
        # Get the full sentence embedding for MLM prediction.
        inputs = tokenizer(inputs, return_tensors="pt") # return pytorch tensor
        #print(inputs.input_ids)
        
        # Get model outputs with softmax applied.  
        # Note: outputs will have shape batch_size x sequence_length x vocab_size.
        outputs = model(**inputs, return_dict=True).logits.softmax(dim=2)
        token_prob = outputs[0, i, input_id].detach().numpy()
        
        #print(f"Probability of token {token}: {token_prob}")
        
        # Aggregate.
        logp_sentence += np.log(token_prob)[0]
        unigram_prob.append({token:token_prob})
    
    # This return will create 2 new columns in the original dataframe.
    #return pd.Series([logp_sentence, unigram_prob], index=["mlm_logprob", "unigram_mlm_logprob"])
    return {'id':sent_id,
            'sentence':sentence, 
            'pseudoLogProb':logp_sentence,
            'mlmUnigramProb':unigram_prob}

In [5]:
print("Testing the MLM pipeline...")
## Instantiate the Masked Language Modelling (MLM) BERT.
tokenizer = AutoTokenizer.from_pretrained(BERT_LARGE_CASED)
model = AutoModelForMaskedLM.from_pretrained(BERT_LARGE_CASED)

# Disable gradient calculations for inference (no backpropagation).
torch.no_grad()

## Get MLM probability for the first few sentences in the sentences DataFrame.
results = np.vectorize(sentence_mlm_prob)(sentences.sentence.head(), sentences.id.head(), model, tokenizer)
results = pd.DataFrame(results.tolist())

## Parse the results and calculate z-scores.
new_data = sentences.merge(results[["id", "pseudoLogProb"]], how="inner", on="id", validate="one_to_one")
new_data.rename({"pseudoLogProb":BERT_LARGE_CASED + "_mlm"}, axis="columns", inplace=True)

mlm_data = new_data[BERT_LARGE_CASED + "_mlm"]
new_data[BERT_LARGE_CASED + "_mlm_zscores"] = (mlm_data - mlm_data.mean())/mlm_data.std(ddof=0)

new_data.head()

Testing the MLM pipeline...


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,dataset,notes,typo?,id,sentence,sent_length,bert-large-cased_mlm,bert-large-cased_mlm_zscores
0,LI,,,32.1.martin.1a.g.01,Kerry attempted to study physics.,7,-26.039757,1.06565
1,LI,,,32.1.martin.1a.g.02,Jimmy attempted to weave a basket.,8,-30.894372,0.641492
2,LI,,,32.1.martin.1a.g.03,Brittany attempted to touch the porcupine.,8,-59.184504,-1.830278
3,LI,,,32.1.martin.1a.g.04,Frank attempted to eat a triple fudge sundae.,10,-35.032345,0.279948
4,LI,,,32.1.martin.1a.g.05,Kat attempted to keep her mail.,8,-40.031179,-0.156811


In [7]:
## Apply the pipeline to all three BERT models.
for bert in MODELS:
    print("Now loading {}...".format(bert))
    tokenizer = AutoTokenizer.from_pretrained(bert)
    model = AutoModelForMaskedLM.from_pretrained(bert)
    
    # Disable gradient calculations for inference (no backpropagation).
    torch.no_grad()

    ## Get MLM probability for each sentence in the sentences DataFrame.
    results = np.vectorize(sentence_mlm_prob)(sentences.sentence, sentences.id, model, tokenizer)
    results = pd.DataFrame(results.tolist())

    ## Parse the results and calculate z-scores.
    sentences = sentences.merge(results[["id", "pseudoLogProb"]], how="left", on="id", validate="one_to_one")
    sentences.rename({"pseudoLogProb":bert + "_mlm"}, axis="columns", inplace=True)

    new_data = sentences[bert + "_mlm"] # just to keep the following line short...
    sentences[bert + "_mlm_zscores"] = (new_data - new_data.mean())/new_data.std(ddof=0)

sentences.head()

Now loading bert-base-uncased...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now loading bert-large-uncased...


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Now loading bert-large-cased...


Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-large-cased and are newly initialized: ['cls.predictions.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,dataset,notes,typo?,id,sentence,sent_length,bert-base-uncased_mlm,bert-base-uncased_mlm_zscores,bert-large-uncased_mlm,bert-large-uncased_mlm_zscores,bert-large-cased_mlm,bert-large-cased_mlm_zscores
0,LI,,,32.1.martin.1a.g.01,Kerry attempted to study physics.,7,-27.880027,0.325898,-23.415677,0.655122,-26.039757,0.371827
1,LI,,,32.1.martin.1a.g.02,Jimmy attempted to weave a basket.,8,-31.754557,0.047632,-32.266427,0.034538,-30.894372,0.020289
2,LI,,,32.1.martin.1a.g.03,Brittany attempted to touch the porcupine.,8,-51.947159,-1.402585,-46.151353,-0.939024,-59.184504,-2.028288
3,LI,,,32.1.martin.1a.g.04,Frank attempted to eat a triple fudge sundae.,10,-51.125628,-1.343583,-44.627996,-0.832212,-35.032345,-0.279355
4,LI,,,32.1.martin.1a.g.05,Kat attempted to keep her mail.,8,-43.211186,-0.775174,-34.55576,-0.125982,-40.031179,-0.641336


In [11]:
sentences.to_csv(OUT_DIR)

In [10]:
sentences

Unnamed: 0,dataset,notes,typo?,id,sentence,sent_length,bert-base-uncased_mlm,bert-base-uncased_mlm_zscores,bert-large-uncased_mlm,bert-large-uncased_mlm_zscores,bert-large-cased_mlm,bert-large-cased_mlm_zscores
0,LI,,,32.1.martin.1a.g.01,Kerry attempted to study physics.,7,-27.880027,0.325898,-23.415677,0.655122,-26.039757,0.371827
1,LI,,,32.1.martin.1a.g.02,Jimmy attempted to weave a basket.,8,-31.754557,0.047632,-32.266427,0.034538,-30.894372,0.020289
2,LI,,,32.1.martin.1a.g.03,Brittany attempted to touch the porcupine.,8,-51.947159,-1.402585,-46.151353,-0.939024,-59.184504,-2.028288
3,LI,,,32.1.martin.1a.g.04,Frank attempted to eat a triple fudge sundae.,10,-51.125628,-1.343583,-44.627996,-0.832212,-35.032345,-0.279355
4,LI,,,32.1.martin.1a.g.05,Kat attempted to keep her mail.,8,-43.211186,-0.775174,-34.555760,-0.125982,-40.031179,-0.641336
...,...,...,...,...,...,...,...,...,...,...,...,...
4173,Adger,,,ch9.84-85.g.04,I wondered how Lewis survived.,7,-15.926197,1.184412,-19.627203,0.920756,-19.455108,0.848642
4174,Adger,,,ch9.84-85.g.05,I wondered where Sophie lived.,7,-14.682488,1.273734,-22.011065,0.753608,-19.165803,0.869591
4175,Adger,,,ch9.84-85.g.06,I wondered what Dan cooked for breakfast.,9,-23.778206,0.620487,-25.143076,0.534002,-19.656588,0.834052
4176,Adger,,,ch9.84-85.g.07,I wondered who Lisa married.,7,-25.661793,0.485209,-27.157065,0.392788,-23.216581,0.576262


Unnamed: 0,dataset,notes,typo?,id,sentence,sent_length
0,LI,,,32.1.martin.1a.g.01,Kerry attempted to study physics.,7
1,LI,,,32.1.martin.1a.g.02,Jimmy attempted to weave a basket.,8
2,LI,,,32.1.martin.1a.g.03,Brittany attempted to touch the porcupine.,8
3,LI,,,32.1.martin.1a.g.04,Frank attempted to eat a triple fudge sundae.,10
4,LI,,,32.1.martin.1a.g.05,Kat attempted to keep her mail.,8
...,...,...,...,...,...,...
4173,Adger,,,ch9.84-85.g.04,I wondered how Lewis survived.,7
4174,Adger,,,ch9.84-85.g.05,I wondered where Sophie lived.,7
4175,Adger,,,ch9.84-85.g.06,I wondered what Dan cooked for breakfast.,9
4176,Adger,,,ch9.84-85.g.07,I wondered who Lisa married.,7
