In [1]:
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import AutoTokenizer, AutoModelWithLMHead

In [9]:
import torch
from sklearn import preprocessing
import itertools

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [51]:
data= load_dataset("glue","ax", split = "test", streaming= True)

In [58]:
mods= ["bert-base-uncased", "t5-small"]

datas= [["oscar", "unshuffled_deduplicated_en", "train", True,5],
        ["imdb","plain_text", "test", False ,5],
        ["poem_sentiment","plain_text", "test", True ,5],        
        ["c4", "en", "train", True, 5]      
       ] 

In [None]:
perplex_model_data(m_name ="bert-base-uncased", d_name ="oscar", d_option="unshuffled_deduplicated_en", d_split = "train", d_streaming = True, d_size=5 )   

In [59]:
def perplex_model_data(**kwargs):
    modlist=[]
    maskedLM= ["bert-base-uncased"]
    maskedHead= ["t5-small"]
    tok = AutoTokenizer.from_pretrained(kwargs['m_name'])
    #TODO: fix the redundant loading of models!
    if kwargs['m_name'] in modlist: 
        next 
    elif kwargs['m_name'] in maskedHead:
        model = AutoModelWithLMHead.from_pretrained(kwargs['m_name'])
    else: 
        model = AutoModelForMaskedLM.from_pretrained(kwargs['m_name'])
        print(str(kwargs['m_name']) + " model loaded!")
        modlist.append(kwargs['m_name'])
    data= load_dataset(kwargs['d_name'], kwargs['d_option'], split= kwargs['d_split'], streaming = kwargs['d_streaming'])
    print(str(kwargs['d_name']) + " dataset loaded!")
    if kwargs['d_streaming'] == True: 
        data_head = data.take(kwargs['d_size'])
        try: 
            text= [l['text'] for l in list(data_head)]
        except:
            feature = [d for d in data.features if 'text' in d]
            feature=feature[0]
            text= [l[feature] for l in list(data_head)]
        
    else:
        feature = next(iter(data.features))
        text= data[feature][:kwargs['d_size']]
    
    encodings = tok('\n\n'.join(text), return_tensors='pt')
    try:
        max_length = model.config.max_position_embeddings
    except AttributeError as error:
        max_length = model.config.n_positions
    except:
        max_length=512
        
    #From https://huggingface.co/transformers/perplexity.html
    stride = 512
    lls = []
    for i in range(0, encodings.input_ids.size(1), stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
        trg_len = end_loc - i    # may be different from stride on last loop
        input_ids = encodings.input_ids[:,begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:,:-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            log_likelihood = outputs[0] * trg_len

        lls.append(log_likelihood)

    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
    print("The perplexity of the " + str(kwargs['m_name']) + " model with the " + str(kwargs['d_name']) + " dataset is " + str(ppl.item()))

In [43]:
def get_combo_pp(models, datasets):
    combos= list(itertools.product(models, datasets))
    print("There are " + str(len(list(combos))) + " combinations of models and datasets.")
    for m, d in combos:
        print('Analyzing the ' + m + ' model and the ' + d[0] + ' dataset.')
        perplex_model_data(m_name= m, d_name= d[0], d_option= d[1], d_split= d[2], d_streaming= d[3], d_size= d[4])

In [60]:
get_combo_pp(mods,datas)

There are 8 combinations of models and datasets.
Analyzing the bert-base-uncased model and the oscar dataset.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased model loaded!
oscar dataset loaded!


Token indices sequence length is longer than the specified maximum sequence length for this model (1918 > 512). Running this sequence through the model will result in indexing errors


The perplexity of the bert-base-uncased model with the oscar dataset is 1.7199418544769287
Analyzing the bert-base-uncased model and the imdb dataset.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased model loaded!


Reusing dataset imdb (/home/sasha/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Token indices sequence length is longer than the specified maximum sequence length for this model (1342 > 512). Running this sequence through the model will result in indexing errors


imdb dataset loaded!
The perplexity of the bert-base-uncased model with the imdb dataset is 1.2695562839508057
Analyzing the bert-base-uncased model and the poem_sentiment dataset.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased model loaded!


Using custom data configuration plain_text


poem_sentiment dataset loaded!
The perplexity of the bert-base-uncased model with the poem_sentiment dataset is 4.009187698364258
Analyzing the bert-base-uncased model and the c4 dataset.


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


bert-base-uncased model loaded!
c4 dataset loaded!


Token indices sequence length is longer than the specified maximum sequence length for this model (1051 > 512). Running this sequence through the model will result in indexing errors


The perplexity of the bert-base-uncased model with the c4 dataset is 1.4461493492126465
Analyzing the t5-small model and the oscar dataset.
oscar dataset loaded!


Token indices sequence length is longer than the specified maximum sequence length for this model (2075 > 512). Running this sequence through the model will result in indexing errors


The perplexity of the t5-small model with the oscar dataset is 541160.0625
Analyzing the t5-small model and the imdb dataset.


Reusing dataset imdb (/home/sasha/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)
Token indices sequence length is longer than the specified maximum sequence length for this model (1511 > 512). Running this sequence through the model will result in indexing errors


imdb dataset loaded!
The perplexity of the t5-small model with the imdb dataset is 463074.34375
Analyzing the t5-small model and the poem_sentiment dataset.


Using custom data configuration plain_text


poem_sentiment dataset loaded!
The perplexity of the t5-small model with the poem_sentiment dataset is 1.4519227743148804
Analyzing the t5-small model and the c4 dataset.
c4 dataset loaded!


Token indices sequence length is longer than the specified maximum sequence length for this model (1105 > 512). Running this sequence through the model will result in indexing errors


The perplexity of the t5-small model with the c4 dataset is 186506.234375


In [None]:
from datasets import load_dataset
base_url = "https://storage.googleapis.com/huggingface-nlp/cache/datasets/wikipedia/20200501.en/1.0.0/"
data_files = {"train": base_url + "wikipedia-train.parquet"}
wiki = load_dataset("parquet", data_files=data_files, split="train", streaming=True)
print(next(iter(wiki)))
# {'title': 'Yangliuqing', 'text': 'Yangliuqing () is a market town in Xiqing District...'}