In [1]:
# Transformers installation
! pip install transformers datasets sentencepiece

# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 15.4 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 41.0 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 58.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 62.7 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 70.8 MB/s 
Collecting responses<0.19
  Downloading respo

In [None]:
# sentences
correct = [
      
    "Humans have many basic needs and one of them is to have an environment that can sustain their lives"
    "Our current population is 6 billion people  and it is still growing exponentially."	,
    "This will, if not already, caused  problems as there are very limited spaces for us."	,
    "From large scale power generators to the basic cooking at  our homes, fuel is essential for all of these to happen and work.",
    "In brief, innovators have to face many challenges when they want to develop the  products."	,
    "The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that  inhospitable  such as desserts  and swamps.",
    "As the number of people grows, the need of  habitable environment is unquestionably essential"

]
incorrect = [
    
    "Humans have many basic needs, and one of them is to have an environment that can sustain their lives.",
    "Our current population is 6 billion people, and it is still growing exponentially.",
    "This will, if not already, cause problems as there are very limited spaces for us.",
    "From large scale power generators to the basic cooking in our homes, fuel is essential for all of these to happen and work.",
    "In brief, innovators have to face many challenges when they want to develop products.",
    "The solution can be obtained by using technology to achieve a better usage of space that we have and resolve the problems in lands that are inhospitable, such as deserts and swamps.",
    "As the number of people grows, the need for a habitable environment is unquestionably essential.",
        
]

Let's demonstrate this process with GPT-2.

In [2]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

device = "cuda"
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
import numpy as np

model_name = 'cointegrated/rubert-tiny'
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def score(model, tokenizer, sentence):
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    repeat_input = tensor_input.repeat(tensor_input.size(-1)-2, 1)
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    labels = repeat_input.masked_fill( masked_input != tokenizer.mask_token_id, -100)
    with torch.inference_mode():
        loss = model(masked_input, labels=labels).loss
    return np.exp(loss.item())

print(score(sentence='London is the capital of Great Britain.', model=model, tokenizer=tokenizer)) 
# 4.541251105675365
print(score(sentence='London is the capital of South America.', model=model, tokenizer=tokenizer)) 
# 6.162017238332462

Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/241k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/468k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

4.54125218839413
6.162018707472031


In [4]:
print(score(sentence='I finally managed to disentangle myself from perplexity', model=model, tokenizer=tokenizer)) 

173.3513819946354


In [5]:
print(score(sentence='Yesterday I went to the market', model=model, tokenizer=tokenizer)) 

12.805445255215101


# Comparison of `BERT` and `GPT-2` for sentence preplexity score:  [Source](https://www.scribendi.ai/comparing-bert-and-gpt-2-as-language-models-to-score-the-grammatical-correctness-of-a-sentence/)


In [6]:
import sys
import numpy as np
 
import torch
from transformers import BertTokenizer,BertForMaskedLM
# Load pre-trained model (weights)
with torch.no_grad():
    model = BertForMaskedLM.from_pretrained('bert-large-cased')
    model.eval()
    
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-large-cased')

def score(sentence):
    tokenize_input = tokenizer.tokenize(sentence)
    tokenize_input = ["[CLS]"]+tokenize_input+["[SEP]"]
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    with torch.no_grad():
        loss=model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())
 
print(f"Incorrect sentence: ")
print(score(sentence='The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that  inhospitable  such as desserts  and swamps.'))

print(f"Correct sentence: ")
print(score(sentence='The solution can be obtained by using technology to achieve a better usage of space that we have and resolve the problems in lands that are inhospitable, such as deserts and swamps.'))

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Incorrect sentence: 
3.6149743
Correct sentence: 
4.5391293


In [7]:
import torch
import sys
import numpy as np
 
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# Load pre-trained model (weights)
with torch.no_grad():
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        model.eval()
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
 
def score(sentence):
    tokenize_input = tokenizer.encode(sentence)
    tensor_input = torch.tensor([tokenize_input])
    loss=model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())
 
print(f"Incorrect sentence: ")
print(score(sentence='The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that  inhospitable  such as desserts  and swamps.'))

print(f"Correct sentence: ")
print(score(sentence='The solution can be obtained by using technology to achieve a better usage of space that we have and resolve the problems in lands that are inhospitable, such as deserts and swamps.'))

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Incorrect sentence: 
162.65617
Correct sentence: 
42.37668


# BART

In [8]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
from datasets import load_dataset

model_checkpoint = 'a1noack/bart-large-gigaword'
tokenizer = BartTokenizerFast.from_pretrained("a1noack/bart-large-gigaword")
model = BartForConditionalGeneration.from_pretrained(model_checkpoint, return_dict=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

test = load_dataset("gigaword", split='test[:20]')
encodings =  tokenizer(test['document'], return_tensors='pt', padding=True, truncation=True, max_length=1024).to(device)

model = model.to(device)
model.eval()
number_beams = 8
result = model.generate(encodings['input_ids'],  num_beams=number_beams, return_dict_in_generate=True, max_length=model.config.max_length, output_scores=True, output_attentions=True)

log_sent = []

for batch_num in range(0, result.scores[0].shape[0], number_beams):
    max_score = torch.tensor(-1*1e6, dtype=torch.float).to(device)
    for beam_num in range(number_beams):
        max_score = torch.max(torch.stack([torch.max(result.scores[-1][batch_num+beam_num]), max_score]))
    log_sent.append(max_score)
    
print("Perplexity:", torch.exp((-1*(torch.stack(log_sent).sum()))/result.sequences.shape[1]))

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.40k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.70k [00:00<?, ?B/s]

Downloading and preparing dataset gigaword/default (download: 551.61 MiB, generated: 916.93 MiB, post-processed: Unknown size, total: 1.43 GiB) to /root/.cache/huggingface/datasets/gigaword/default/1.2.0/ea83a8b819190acac5f2dae011fad51dccf269a0604ec5dd24795b64efb424b6...


Downloading data:   0%|          | 0.00/578M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3803957 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/189651 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1951 [00:00<?, ? examples/s]

Dataset gigaword downloaded and prepared to /root/.cache/huggingface/datasets/gigaword/default/1.2.0/ea83a8b819190acac5f2dae011fad51dccf269a0604ec5dd24795b64efb424b6. Subsequent calls will reuse this data.
Perplexity: tensor(1., device='cuda:0')


In [9]:
from transformers import BartTokenizerFast, BartForConditionalGeneration
from transformers import AutoTokenizer
from transformers import AutoModelForMaskedLM

with torch.no_grad():
  model_checkpoint = 'bart'
  model_name ='facebook/bart-base'
  model = AutoModelForMaskedLM.from_pretrained(model_name)
  # model = BartForConditionalGeneration.from_pretrained(model_checkpoint, return_dict=True)
  model.eval()

# Load pre-trained model tokenizer (vocabulary)
# tokenizer = BartTokenizerFast.from_pretrained("bart")
tokenizer  = AutoTokenizer.from_pretrained('facebook/bart-base') # bart-large is the sam

def score(sentence):
    tokenize_input = tokenizer.encode(sentence)
    tensor_input = torch.tensor([tokenize_input])
    loss=model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())
 
if __name__=='__main__':
    for line in sys.stdin:
        if line.strip() !='':
            print(line.strip()+'\t'+ str(score(line.strip())))
        else:
            break

print(f"Incorrect sentence: ")
print(score(sentence='The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that inhospitable such as desserts and swamps.'))

print(f"Correct sentence: ")
print(score(sentence='The solution can be obtained by using technology to achieve a better usage of space that we have and resolve the problems in lands that are inhospitable, such as deserts and swamps.'))

Downloading:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Incorrect sentence: 
1.0240532
Correct sentence: 
1.0000085


# T5

In [10]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

with torch.no_grad():
  model = T5ForConditionalGeneration.from_pretrained("t5-small")
  model.eval()

  # Load pre-trained model tokenizer (vocabulary)
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def score(sentence):
    tokenize_input = tokenizer.encode(sentence)
    print(tokenize_input)
    tensor_input = torch.tensor([tokenize_input])
    loss=model(tensor_input, labels=tensor_input)[0]
    return np.exp(loss.detach().numpy())
 
print(f"Incorrect sentence: ")
print(score(sentence='The solution can be obtain by using technology to achieve a better usage of space that we have and resolve the problems in lands that inhospitable such as desserts and swamps.'))

print(f"Correct sentence: ")
print(score(sentence='The solution can be obtained by using technology to achieve a better usage of space that we have and resolve the problems in lands that are inhospitable, such as deserts and swamps.'))

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Incorrect sentence: 
[37, 1127, 54, 36, 3442, 57, 338, 748, 12, 1984, 3, 9, 394, 4742, 13, 628, 24, 62, 43, 11, 7785, 8, 982, 16, 3, 6347, 24, 16, 11982, 5230, 179, 224, 38, 7737, 7, 11, 28945, 7, 5, 1]
1.4678051
Correct sentence: 
[37, 1127, 54, 36, 5105, 57, 338, 748, 12, 1984, 3, 9, 394, 4742, 13, 628, 24, 62, 43, 11, 7785, 8, 982, 16, 3, 6347, 24, 33, 16, 11982, 5230, 179, 6, 224, 38, 9980, 7, 11, 28945, 7, 5, 1]
1.2870257
