<a href="https://colab.research.google.com/github/kyungjejo/exprgram/blob/master/sentence-replacement/research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-2 Language Model for Sentence Probability Scoring

In [5]:
!pip install pytorch_pretrained_bert
!pip install transformers

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 24.4MB/s eta 0:00:01[K     |█████▎                          | 20kB 2.2MB/s eta 0:00:01[K     |████████                        | 30kB 3.2MB/s eta 0:00:01[K     |██████████▋                     | 40kB 2.1MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 2.6MB/s eta 0:00:01[K     |███████████████▉                | 61kB 3.1MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 3.6MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 4.1MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 4.6MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 3.5MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 3.5MB/s eta 0:00:01[K     |██████████████████████

In [0]:
import math
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [0]:
# Pretrained GPT-2 weights; .eval() hands back the same module, already
# switched to evaluation mode.
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
# Matching tokenizer. Later cells rebind the names `tokenizer` and `model`,
# so re-run this cell before calling the GPT-2 scoring functions again.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [0]:
def sentence_score_loss(sentence):
  """Return the GPT-2 log-probability of `sentence`, derived from the LM loss.

  The sentence is prefixed with the tokenizer's beginning-of-text token so that
  the first real token is predicted as well. The loss returned by the model is
  averaged over the predicted tokens, so scaling it by the token count and
  negating it gives the summed log-probability.

  Args:
    sentence: text to score. Uses the module-level `tokenizer` and `model`.

  Returns:
    float: summed log-probability (higher means more likely). A sentence that
    tokenizes to nothing scores 0.0.
  """
  token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
  if not token_ids:
    # No token to predict, so there is no loss to average; log(1) = 0.
    return 0.0

  # <|endoftext|> (id 50256 in the "gpt2" vocabulary) serves as the left context.
  tensor_input = torch.tensor([[tokenizer.bos_token_id] + token_ids])

  with torch.no_grad():
    # Only the loss (first output) is needed here.
    loss = model(tensor_input, labels=tensor_input)[0]

  return -loss.item() * len(token_ids)

In [0]:
def sentence_score_cal(sentence):
  """Cross-check the loss-derived GPT-2 score against an explicit computation.

  The per-token log-probabilities are read from the logits and summed; that sum
  is printed so it can be compared with the returned loss-derived value.

  Args:
    sentence: text to score. Uses the module-level `tokenizer` and `model`.

  Returns:
    float: mean LM loss scaled by the token count and negated (same quantity
    as `sentence_score_loss`). A sentence with no tokens scores 0.0.
  """
  token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
  if not token_ids:
    # No token to predict: both computations degenerate to log(1) = 0.
    print(0.0)
    return 0.0

  # <|endoftext|> (id 50256 in the "gpt2" vocabulary) serves as the left context.
  tensor_input = torch.tensor([[tokenizer.bos_token_id] + token_ids])

  with torch.no_grad():
    outputs = model(tensor_input, labels=tensor_input)
    loss, logits = outputs[:2]
    # log_softmax avoids taking the log of very small softmax outputs.
    log_probs = torch.log_softmax(logits.squeeze(0), dim=1)

  # Row i of the logits scores the token that follows position i, i.e.
  # token_ids[i]; the final row has no target and is dropped.
  targets = torch.tensor(token_ids).unsqueeze(1)
  lp = log_probs[:-1].gather(1, targets).sum()
  print(lp)
  return -loss.item() * len(token_ids)

In [135]:
# One reference sentence plus variants with a split word or a swapped word.
sents = [
    "this book is on the desk",
    "this bo ok is on the desk",
    "this airplane is on the desk",
    "this book is in the desk",
    "this book is of the desk",
]

# Rank by the loss-derived score, highest first.
sent_logprob = sorted(
    ((sent, sentence_score_loss(sent)) for sent in sents),
    key=lambda pair: pair[1],
    reverse=True,
)
print(sent_logprob)

# Same ranking via the explicit per-token computation (which also prints
# each sentence's summed log-probability as it goes).
sent_logprob = sorted(
    ((sent, sentence_score_cal(sent)) for sent in sents),
    key=lambda pair: pair[1],
    reverse=True,
)
print(sent_logprob)

[('this book is on the desk', -31.22244930267334), ('this book is in the desk', -33.49128341674805), ('this book is of the desk', -35.620848655700684), ('this airplane is on the desk', -37.91392421722412), ('this bo ok is on the desk', -49.58194589614868)]
tensor(-31.2224)
tensor(-49.5819)
tensor(-37.9139)
tensor(-33.4913)
tensor(-35.6208)
[('this book is on the desk', -31.22244930267334), ('this book is in the desk', -33.49128341674805), ('this book is of the desk', -35.620848655700684), ('this airplane is on the desk', -37.91392421722412), ('this bo ok is on the desk', -49.58194589614868)]


# BERT Masked Language Model for Sentence Scoring

In [7]:
!pip install pytorch_pretrained_bert
!pip install transformers



In [0]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import math
import numpy as np

In [9]:
# Pretrained BERT masked-LM head; .eval() returns the module itself, so it is
# bound already in evaluation mode.
bertMaskedLm = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()
# NOTE: this rebinds `tokenizer`, which the GPT-2 scoring functions above
# also read at call time.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




In [0]:
def sentence_score(sentence):
  """Print BERT's loss, logits and a length-scaled score for `sentence`.

  The sentence is encoded with special tokens and fed to `bertMaskedLm`
  unchanged. The loss is the mean cross-entropy of every position's logits
  against that position's own input id, computed here from the logits so the
  function does not depend on the name of the model's labels keyword.

  NOTE(review): no token is masked, so this measures how well BERT reproduces
  tokens it can already see - confirm that is the intended sentence score.

  Args:
    sentence: text to score. Uses the module-level `tokenizer` and
      `bertMaskedLm`.

  Returns:
    None. Prints the loss, the logits, and `-loss * number_of_input_ids`.
  """
  input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    logits = bertMaskedLm(input_ids)[0]

  # Flatten (batch, position) so each position is one classification example.
  loss = torch.nn.functional.cross_entropy(
      logits.view(-1, logits.size(-1)), input_ids.view(-1))

  print(loss)
  print(logits)
  print(-loss * input_ids.size(1))

In [11]:
# Score three variants of one sentence; sentence_score prints its own results.
for candidate in (
    "This book is on the table",
    "This book is in the table",
    "This airplane is of the table",
):
  sentence_score(candidate)

tensor(4.8480, grad_fn=<NllLossBackward>)
tensor([[[ -6.7902,  -6.7427,  -6.7477,  ...,  -6.1828,  -6.0003,  -4.2203],
         [-13.7448, -13.6137, -13.3702,  ..., -13.9954, -12.5992, -10.1035],
         [-12.8292, -13.0563, -12.8318,  ..., -10.6491, -10.3101,  -9.8543],
         ...,
         [-18.4380, -18.4090, -18.0203,  ..., -16.6678, -14.6983, -15.0451],
         [-12.2875, -12.3563, -12.1337,  ..., -11.9327, -10.7706,  -8.7098],
         [-10.7685, -10.6716, -10.6607,  ...,  -8.5996,  -9.3426,  -8.8667]]],
       grad_fn=<AddBackward0>)
tensor(-38.7843, grad_fn=<MulBackward0>)
tensor(5.1330, grad_fn=<NllLossBackward>)
tensor([[[ -6.5220,  -6.4830,  -6.5045,  ...,  -5.9453,  -5.6975,  -4.0519],
         [-14.0537, -14.0063, -13.9371,  ..., -14.7972, -11.5710, -10.7611],
         [-12.4478, -12.6976, -12.5044,  ..., -11.4573, -10.6053,  -9.7046],
         ...,
         [-16.6369, -16.8116, -16.4827,  ..., -15.4665, -12.4070, -12.4446],
         [-10.8566, -11.1585, -11.0800,  ...

# BERT for Next Sentence Prediction

In [0]:
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
import numpy as np
import math

In [13]:
# NOTE: both names below were bound to GPT-2 objects in the first section;
# from here on `tokenizer` and `model` refer to BERT.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# .eval() returns the module itself, so the model is bound in evaluation mode.
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased').eval()
# Trailing expression so the notebook still displays the module summary.
model

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [0]:
def next_sentence(sentence1, sentence2):
  """Return the model's probability for class 0 of the next-sentence head.

  The pair is laid out as `[CLS] sentence1 [SEP] sentence2 [SEP]`, with segment
  id 0 for the first part (through its [SEP]) and 1 for the second.

  Args:
    sentence1: first sentence.
    sentence2: candidate follow-up sentence.

  Returns:
    float: softmax probability of output class 0. In the examples run in this
    notebook that value is close to 1 for a coherent follow-up and close to 0
    for an unrelated one.
  """
  sent1_toks = ["[CLS]"] + tokenizer.tokenize(sentence1) + ["[SEP]"]
  sent2_toks = tokenizer.tokenize(sentence2) + ["[SEP]"]

  indexed_tokens = tokenizer.convert_tokens_to_ids(sent1_toks + sent2_toks)
  segments_ids = [0] * len(sent1_toks) + [1] * len(sent2_toks)

  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    # First output holds the two-class logits.
    logits = model(tokens_tensor, token_type_ids=segments_tensors)[0]

  return torch.softmax(logits, dim=1)[0, 0].item()

In [15]:
# Same question paired with an unrelated reply, then with a fitting one.
for reply in ("The Eiffel Tower is in Paris", "I am seven years old"):
  print(next_sentence("How old are you?", reply))

0.0004167333827354014
0.9999886751174927


# BERT Masked Language Model for Token Prediction

In [24]:
!pip install pytorch_pretrained_bert



In [0]:
from transformers import BertTokenizer, BertForMaskedLM
import torch
import math
import numpy as np

In [0]:
# String forms of the special tokens used when assembling BERT inputs.
PAD, MASK, CLS, SEP = '[PAD]', '[MASK]', '[CLS]', '[SEP]'


In [0]:
def to_bert_input(tokens, bert_tokenizer):
    """Convert a list of token strings into batched BERT input tensors.

    Args:
        tokens: list of token strings, normally ``[CLS] ... [SEP]``.
        bert_tokenizer: object providing ``convert_tokens_to_ids``. Its
            ``pad_token_id`` is used for the mask when it defines one;
            otherwise id 0 is treated as padding.

    Returns:
        Tuple ``(token_idx, segment_idx, mask)``, each with a leading batch
        dimension of 1: the vocabulary ids, the segment ids (0 up to and
        including the first ``[SEP]``, 1 afterwards; all 0 when there is no
        ``[SEP]``), and a boolean mask that is False at padding positions.
    """
    token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens), dtype=torch.long)

    segment_idx = torch.zeros_like(token_idx)
    # Without a separator the whole input is a single segment.
    if '[SEP]' in tokens:
        segment_idx[tokens.index('[SEP]') + 1:] = 1

    pad_id = getattr(bert_tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    mask = token_idx != pad_id

    return token_idx.unsqueeze(0), segment_idx.unsqueeze(0), mask.unsqueeze(0)


In [0]:
# Tokenizer and masked-LM weights for the fill-in-the-blank demo below.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# .eval() returns the module itself, so the model is bound in evaluation mode.
bert_model = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()
# Trailing expression keeps the module summary as the cell's displayed value.
bert_model

In [0]:
def masked_prediction(sent):
  """Print BERT's five most probable fillers for every [MASK] token in `sent`.

  The sentence is tokenized, wrapped in [CLS] ... [SEP] when those are missing,
  and run through `bert_model`. Intermediate values (padding mask, tensor
  shapes, raw top-k tensors) are printed as well.

  Args:
    sent: sentence containing one or more literal `[MASK]` tokens. Uses the
      module-level `bert_tokenizer`, `bert_model` and `to_bert_input`.

  Returns:
    None. Results are printed as `token probability` lines.
  """
  tokens = bert_tokenizer.tokenize(sent.strip())
  # `not tokens` covers an empty sentence, where tokens[0] would raise.
  if not tokens or tokens[0] != CLS:
    tokens = [CLS] + tokens
  if tokens[-1] != SEP:
    tokens.append(SEP)
  token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
  print(mask)
  with torch.no_grad():
    # No labels are passed: only the logits (first output) are needed.
    logits = bert_model(token_idx)[0].squeeze(0)
  probs = torch.softmax(logits, dim=1)
  print(logits.shape)
  print(probs.shape)

  for position, token in enumerate(tokens):
    if token != MASK:
      continue
    topk_prob, topk_indices = torch.topk(probs[position, :], 5)
    print(topk_prob)
    print(topk_indices)
    topk_tokens = bert_tokenizer.convert_ids_to_tokens(topk_indices.tolist())
    for prob, tok in zip(topk_prob, topk_tokens):
      print('{} {}'.format(tok, prob))

In [122]:
# Ask BERT for the five most probable fillers of the [MASK] slot.
masked_prediction("The book is [MASK] the desk")

tensor([[True, True, True, True, True, True, True, True]])
torch.Size([8, 30522])
torch.Size([8, 30522])
tensor([0.9268, 0.0263, 0.0147, 0.0043, 0.0042])
tensor([2006, 2104, 2369, 3875, 2125])
on 0.9267967343330383
under 0.026349736377596855
behind 0.014741572551429272
beside 0.004327051341533661
off 0.004184301942586899


# Fixing a Sentence with GPT-2 (Probability Score) & BERT (Masked Prediction)

In [0]:
!pip install transformers

In [0]:
import math
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, BertTokenizer, BertForMaskedLM

In [0]:
# GPT-2 scores sentences; .eval() returns the module itself, so each model is
# bound already in evaluation mode.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# BERT proposes replacements for masked tokens.
bert_model = BertForMaskedLM.from_pretrained("bert-base-uncased").eval()
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [0]:
# String forms of the special tokens used when assembling BERT inputs.
PAD, MASK, CLS, SEP = '[PAD]', '[MASK]', '[CLS]', '[SEP]'

def to_bert_input(tokens, bert_tokenizer):
    """Convert a list of token strings into batched BERT input tensors.

    Args:
        tokens: list of token strings, normally ``[CLS] ... [SEP]``.
        bert_tokenizer: object providing ``convert_tokens_to_ids``. Its
            ``pad_token_id`` is used for the mask when it defines one;
            otherwise id 0 is treated as padding.

    Returns:
        Tuple ``(token_idx, segment_idx, mask)``, each with a leading batch
        dimension of 1: the vocabulary ids, the segment ids (0 up to and
        including the first ``[SEP]``, 1 afterwards; all 0 when there is no
        ``[SEP]``), and a boolean mask that is False at padding positions.
    """
    token_idx = torch.tensor(bert_tokenizer.convert_tokens_to_ids(tokens), dtype=torch.long)

    segment_idx = torch.zeros_like(token_idx)
    # Without a separator the whole input is a single segment.
    if '[SEP]' in tokens:
        segment_idx[tokens.index('[SEP]') + 1:] = 1

    pad_id = getattr(bert_tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0
    mask = token_idx != pad_id

    return token_idx.unsqueeze(0), segment_idx.unsqueeze(0), mask.unsqueeze(0)

In [0]:
def sentence_score_cal(sentence):
  """Score `sentence` with GPT-2 and report the probability of each token.

  Uses `gpt2_tokenizer` / `gpt2_model` loaded in this section. The generic
  names `tokenizer` / `model` were last bound to BERT objects earlier in the
  notebook, so relying on them only works with leftover kernel state.

  Args:
    sentence: text to score.

  Returns:
    Tuple `(log_prob, prob_list)`: the summed log-probability as a float, and a
    list of `(token_position, probability)` pairs in sentence order. A sentence
    with no tokens yields `(0.0, [])`.
  """
  token_ids = gpt2_tokenizer.convert_tokens_to_ids(gpt2_tokenizer.tokenize(sentence))
  if not token_ids:
    # No token to predict: log(1) = 0 and there are no per-token entries.
    return 0.0, []

  # <|endoftext|> (id 50256 in the "gpt2" vocabulary) serves as the left context.
  tensor_input = torch.tensor([[gpt2_tokenizer.bos_token_id] + token_ids])

  with torch.no_grad():
    # Only the logits are used, so no labels are passed.
    logits = gpt2_model(tensor_input)[0].squeeze(0)
    # log_softmax avoids taking the log of very small softmax outputs.
    log_probs = torch.log_softmax(logits, dim=1)

  # Row i of the logits scores the token that follows position i, i.e.
  # token_ids[i]; the final row has no target and is dropped.
  targets = torch.tensor(token_ids).unsqueeze(1)
  token_log_probs = log_probs[:-1].gather(1, targets).squeeze(1)

  prob_list = list(enumerate(token_log_probs.exp().tolist()))
  return token_log_probs.sum().item(), prob_list

In [0]:
def masked_prediction(sent, idx):
  """Mask one token of `sent` and print BERT's five most probable fillers.

  Args:
    sent: sentence to repair. Uses the module-level `bert_tokenizer`,
      `bert_model` and `to_bert_input`.
    idx: position of the token to mask, counted in BERT word-piece tokens of
      `sent` before [CLS] is prepended.

  Returns:
    None. Prints the masked token sequence followed by `token probability`
    lines for each [MASK] position.

  Raises:
    IndexError: if `idx` is outside the tokenized sentence.
  """
  tokens = bert_tokenizer.tokenize(sent.strip())
  tokens[idx] = MASK
  if tokens[0] != CLS:
    tokens = [CLS] + tokens
  if tokens[-1] != SEP:
    tokens.append(SEP)
  token_idx, _, _ = to_bert_input(tokens, bert_tokenizer)
  with torch.no_grad():
    # No labels are passed: only the logits (first output) are needed.
    logits = bert_model(token_idx)[0].squeeze(0)
  probs = torch.softmax(logits, dim=1)

  # A separate loop variable keeps the `idx` argument intact.
  for position, token in enumerate(tokens):
    if token != MASK:
      continue
    topk_prob, topk_indices = torch.topk(probs[position, :], 5)
    topk_tokens = bert_tokenizer.convert_ids_to_tokens(topk_indices.tolist())
    print(' '.join(tokens))
    for prob, tok in zip(topk_prob, topk_tokens):
      print('{} {}'.format(tok, prob))

In [175]:
# One reference sentence plus variants with a split word or a swapped word.
sents = [
    "this book is on the desk",
    "this bo ok is on the desk",
    "this airplane is on the desk",
    "this book is in the desk",
    "this book is of the desk",
]

# Each entry is (sentence, (log_prob, per_token_probs)); rank by log_prob,
# highest first.
sent_logprob = sorted(
    ((sent, sentence_score_cal(sent)) for sent in sents),
    key=lambda entry: entry[1][0],
    reverse=True,
)
print(sent_logprob)

[('this book is on the desk', (-31.222448348999023, [(0, 0.00018200573686044663), (1, 0.012033280916512012), (2, 0.4904617667198181), (3, 0.0023123419377952814), (4, 0.2065800577402115), (5, 5.370907820179127e-05)])), ('this book is in the desk', (-33.49128341674805, [(0, 0.00018200573686044663), (1, 0.012033280916512012), (2, 0.4904617667198181), (3, 0.007219349965453148), (4, 0.10646791756153107), (5, 3.4524689453974133e-06)])), ('this book is of the desk', (-35.620849609375, [(0, 0.00018200573686044663), (1, 0.012033280916512012), (2, 0.4904617667198181), (3, 0.0025211551692336798), (4, 0.09160611778497696), (5, 1.366039100503258e-06)])), ('this airplane is on the desk', (-37.91392517089844, [(0, 0.00018200573686044663), (1, 1.4136198842606973e-05), (2, 0.1977468729019165), (3, 0.008266404271125793), (4, 0.1509680151939392), (5, 5.388425779528916e-05)])), ('this bo ok is on the desk', (-49.581947326660156, [(0, 0.00018200573686044663), (1, 3.663938332465477e-05), (2, 4.5645767386304

In [177]:
# Take the best-scoring sentence and look for its least probable tokens.
sent = sent_logprob[0]
# sorted() builds a new list, so the per-token list stored inside
# sent_logprob keeps its positional order and re-running this cell is safe.
sent_probs = sorted(sent[1][1], key=lambda element: element[1])

print(sent_probs)
for sent_prob in sent_probs:
  # Entries are in ascending probability; stop at the first one above 0.01.
  if sent_prob[1] > 0.01:
    break
  token_idx = sent_prob[0]
  # The first and last token positions are never sent for replacement.
  if (token_idx != 0) and (token_idx != len(sent_probs)-1):
    # NOTE(review): token_idx is a GPT-2 token position, while
    # masked_prediction indexes BERT word pieces; the two tokenizations can
    # differ in length - confirm they line up for the sentences used here.
    masked_prediction(sent[0], token_idx)

[(5, 5.370907820179127e-05), (0, 0.00018200573686044663), (3, 0.0023123419377952814), (1, 0.012033280916512012), (4, 0.2065800577402115), (2, 0.4904617667198181)]
[CLS] this book is [MASK] the desk [SEP]
on 0.9265573024749756
under 0.027666058391332626
behind 0.012123161926865578
off 0.005696693900972605
in 0.005355173721909523
