In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer
# Load the pretrained "t5-base" checkpoint twice over: once as a tokenizer and
# once as a conditional-generation model. Both names are notebook-level globals
# that the later cells read (tokenizer.encode / model.generate / tokenizer.decode).
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
# Encode a "paraphrase: "-prefixed prompt as PyTorch tensors (return_tensors='pt').
input_ids = tokenizer.encode("paraphrase: I need login help", return_tensors='pt')
# NOTE(review): despite the variable name, num_beams=7 requests beam search, not
# greedy decoding. no_repeat_ngram_size=2 forbids repeating any 2-gram, and the
# generated length is constrained to between 25 and 75 tokens.
greedy_output = model.generate(input_ids, num_beams=7, no_repeat_ngram_size=2, min_length=25, max_length=75)
# Decode the first returned sequence, dropping special tokens. `message` is a
# notebook global that the following cell feeds back into the model.
message = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
print(message)

False : ''I need to log in to my account to see if it's logged in.


In [9]:
# Re-encode the previous cell's decoded `message` under a "question: " prefix.
# This cell only works if the cell defining `message` has been run first.
input_ids = tokenizer.encode(f'question: {message}', return_tensors='pt')
# num_beams=1 with no sampling flags passed -- presumably plain greedy decoding
# under the library defaults (TODO confirm). no_repeat_ngram_size=1 forbids any
# token from appearing twice; generated length is constrained to 50-100 tokens.
greedy_output = model.generate(input_ids, num_beams=1, no_repeat_ngram_size=1, min_length=50, max_length=100)
# Print a header followed by a 100-character dashed rule.
print("Output:\n" + 100 * '-')

# Decode the first returned sequence, dropping special tokens, and show it.
message_ = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
print(message_)

Output:
----------------------------------------------------------------------------------------------------
a storm in the middle of an hour. I am mad! The tent was broken during this period and it is now breaking up after about two hours, not even one day before my next job starts again? (#false)


In [27]:
import torch

class Adequacy():
    """Scores (input phrase, paraphrase) pairs with a sequence-classification model.

    The adequacy score of a pair is the softmax probability the classifier
    assigns to class index 1 when the two phrases are tokenized together as a
    sentence pair (truncated to 128 tokens).
    """

    def __init__(self, model_tag='prithivida/parrot_adequacy_model'):
        """Load the classifier and its tokenizer from ``model_tag``."""
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        self.adequacy_model = AutoModelForSequenceClassification.from_pretrained(model_tag)
        self.tokenizer = AutoTokenizer.from_pretrained(model_tag)

    def _adequacy_score(self, input_phrase, para_phrase, device):
        """Return the class-1 probability for one (input, paraphrase) pair.

        The caller is responsible for having moved ``self.adequacy_model`` to
        ``device`` beforehand.
        """
        import torch

        encoded = self.tokenizer(input_phrase, para_phrase, return_tensors='pt', max_length=128, truncation=True)
        encoded = encoded.to(device)
        # Pure inference: skip building an autograd graph.
        with torch.no_grad():
            logits = self.adequacy_model(**encoded).logits
        # Batch size is always 1 here, so .item() on the single class-1 entry is safe.
        return logits.softmax(dim=1)[:, 1].item()

    def filter(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
        """Return the paraphrases whose adequacy score is >= ``adequacy_threshold``.

        Args:
            input_phrase: the reference phrase every paraphrase is compared to.
            para_phrases: iterable of candidate paraphrase strings.
            adequacy_threshold: minimum class-1 probability to keep a candidate.
            device: torch device string the model and inputs are moved to.

        Returns:
            A list of the accepted paraphrases, in iteration order.
        """
        # Move the model once per call instead of once per candidate.
        self.adequacy_model = self.adequacy_model.to(device)
        return [
            para_phrase
            for para_phrase in para_phrases
            if self._adequacy_score(input_phrase, para_phrase, device) >= adequacy_threshold
        ]

    def score(self, input_phrase, para_phrases, adequacy_threshold, device="cpu"):
        """Return ``{paraphrase: adequacy_score}`` for candidates at or above the threshold.

        Takes the same arguments as :meth:`filter`; candidates scoring below
        ``adequacy_threshold`` are omitted from the returned dict.
        """
        self.adequacy_model = self.adequacy_model.to(device)
        adequacy_scores = {}
        for para_phrase in para_phrases:
            adequacy_score = self._adequacy_score(input_phrase, para_phrase, device)
            if adequacy_score >= adequacy_threshold:
                adequacy_scores[para_phrase] = adequacy_score
        return adequacy_scores

class Fluency():
    """Scores single phrases for fluency with a two-label sequence classifier.

    Each phrase is presented to the model as ``"Sentence: " + phrase``; the
    fluency score is the softmax probability of label index 1
    (LABEL_0 = Bad Fluency, LABEL_1 = Good Fluency).
    """

    def __init__(self, model_tag='prithivida/parrot_fluency_model'):
        """Load the two-label classifier and its tokenizer from ``model_tag``."""
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
        self.fluency_model = AutoModelForSequenceClassification.from_pretrained(model_tag, num_labels=2)
        self.fluency_tokenizer = AutoTokenizer.from_pretrained(model_tag)

    def _fluency_score(self, para_phrase, device):
        """Return the LABEL_1 (good fluency) probability of one phrase as a Python float.

        The caller is responsible for having moved ``self.fluency_model`` to
        ``device`` beforehand.
        """
        import torch

        encoded = self.fluency_tokenizer("Sentence: " + para_phrase, return_tensors='pt', truncation=True)
        encoded = encoded.to(device)
        # Pure inference: skip building an autograd graph.
        with torch.no_grad():
            prediction = self.fluency_model(**encoded)
        # prediction[0] is the model's first output (the class scores); [0] picks
        # the single batch row, leaving a 2-element vector to normalise.
        return prediction[0][0].softmax(dim=0)[1].item()

    def filter(self, para_phrases, fluency_threshold, device="cpu"):
        """Return the phrases whose fluency score is >= ``fluency_threshold``.

        Args:
            para_phrases: iterable of candidate phrase strings.
            fluency_threshold: minimum LABEL_1 probability to keep a phrase.
            device: torch device string the model and inputs are moved to.

        Returns:
            A list of the accepted phrases, in iteration order.
        """
        self.fluency_model = self.fluency_model.to(device)
        return [
            para_phrase
            for para_phrase in para_phrases
            if self._fluency_score(para_phrase, device) >= fluency_threshold
        ]

    def score(self, para_phrases, fluency_threshold, device="cpu"):
        """Return ``{phrase: fluency_score}`` for phrases at or above the threshold.

        Takes the same arguments as :meth:`filter`; phrases scoring below
        ``fluency_threshold`` are omitted from the returned dict.
        """
        self.fluency_model = self.fluency_model.to(device)
        fluency_scores = {}
        for para_phrase in para_phrases:
            fluency_score = self._fluency_score(para_phrase, device)
            if fluency_score >= fluency_threshold:
                fluency_scores[para_phrase] = fluency_score
        return fluency_scores
      


class Diversity():
    """Scores how different each paraphrase is from the input phrase.

    Three interchangeable rankers are provided (edit distance, embedding
    distance, word-level diff); each returns ``{paraphrase: score}`` where a
    larger score means a more different paraphrase.
    """

    def __init__(self, model_tag='paraphrase-distilroberta-base-v2'):
        """Load the sentence-embedding model used by :meth:`euclidean_ranker`."""
        from sentence_transformers import SentenceTransformer
        self.diversity_model = SentenceTransformer(model_tag)

    def rank(self, input_phrase, para_phrases, diversity_ranker='levenshtein'):
        """Dispatch to the ranker named by ``diversity_ranker``.

        Args:
            input_phrase: the reference phrase.
            para_phrases: iterable of candidate paraphrase strings.
            diversity_ranker: one of ``"levenshtein"``, ``"euclidean"``, ``"diff"``.

        Returns:
            Dict mapping each paraphrase to its diversity score.

        Raises:
            ValueError: if ``diversity_ranker`` is not a known ranker name
                (previously this silently returned ``None``).
        """
        rankers = {
            "levenshtein": self.levenshtein_ranker,
            "euclidean": self.euclidean_ranker,
            "diff": self.diff_ranker,
        }
        ranker = rankers.get(diversity_ranker)
        if ranker is None:
            raise ValueError(
                "Unknown diversity_ranker %r; expected one of %s"
                % (diversity_ranker, sorted(rankers))
            )
        return ranker(input_phrase, para_phrases)

    def euclidean_ranker(self, input_phrase, para_phrases):
        """Score by Euclidean distance between sentence embeddings, min-max scaled to [0, 1].

        Both sides are lowercased before encoding. When every distance is equal
        (including the single-candidate case) all scores are 0.0. An empty
        candidate collection yields an empty dict.
        """
        from scipy import spatial

        para_phrases = list(para_phrases)
        if not para_phrases:
            return {}

        input_enc = self.diversity_model.encode(input_phrase.lower())
        distances = [
            float(spatial.distance.euclidean(input_enc, self.diversity_model.encode(para_phrase.lower())))
            for para_phrase in para_phrases
        ]
        # Min-max scaling; a zero range is treated as 1 so constant inputs map to 0.0.
        lowest = min(distances)
        span = (max(distances) - lowest) or 1.0
        return {
            para_phrase: (distance - lowest) / span
            for para_phrase, distance in zip(para_phrases, distances)
        }

    def levenshtein_ranker(self, input_phrase, para_phrases):
        """Score by case-insensitive Levenshtein edit distance to the input phrase.

        Both strings are lowercased for the comparison (the input phrase alone
        used to be lowercased, which counted case differences in the paraphrase
        as edits); the dict keys are the paraphrases as given.
        """
        import Levenshtein

        reference = input_phrase.lower()
        return {
            para_phrase: Levenshtein.distance(reference, para_phrase.lower())
            for para_phrase in para_phrases
        }

    def diff_ranker(self, input_phrase, para_phrases):
        """Score by the number of whitespace-separated words added or removed.

        The comparison is case-sensitive. Only diff lines carrying the ``"+ "``
        or ``"- "`` prefix are counted, so unchanged words that merely contain a
        hyphen or plus sign, and difflib's ``"? "`` guide lines, do not inflate
        the score.
        """
        import difflib

        differ = difflib.Differ()
        reference_words = input_phrase.split()
        diversity_scores = {}
        for para_phrase in para_phrases:
            diff = differ.compare(reference_words, para_phrase.split())
            diversity_scores[para_phrase] = sum(
                1 for line in diff if line.startswith(("+ ", "- "))
            )
        return diversity_scores


def reprhase(input_phrase, tokenizer, model, use_gpu=False, diversity_ranker="levenshtein",
             do_diverse=False, style=1, max_length=32, adequacy_threshold=0.90,
             fluency_threshold=0.90, adequacy=None, fluency=None, diversity=None):
    """Generate paraphrases of ``input_phrase`` and return the most diverse acceptable one.

    Pipeline: sanitise the input, sample 10 candidates from ``model`` under a
    "paraphrase: " prompt, de-duplicate them, keep those passing the adequacy
    and fluency thresholds, then pick the candidate with the highest diversity
    score. (The function name is misspelled; it is kept for compatibility.)

    Args:
        input_phrase: text to paraphrase.
        tokenizer: object providing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.
        model: object providing ``to(device)`` and ``generate(...)``.
        use_gpu: run on ``"cuda:0"`` when true, otherwise on ``"cpu"``.
        diversity_ranker: ranker name forwarded to ``diversity.rank``.
        do_diverse: unused; accepted for interface compatibility.
        style: unused; accepted for interface compatibility.
        max_length: generation length limit; increased by 32 when the input has
            at least this many characters.
        adequacy_threshold: minimum adequacy score for a candidate.
        fluency_threshold: minimum fluency score for a candidate.
        adequacy: optional pre-built adequacy scorer (``.filter``); a fresh
            ``Adequacy()`` is created when omitted.
        fluency: optional pre-built fluency scorer (``.filter``); a fresh
            ``Fluency()`` is created when needed and omitted.
        diversity: optional pre-built diversity scorer (``.rank``); a fresh
            ``Diversity()`` is created when needed and omitted.

    Returns:
        ``(paraphrase, diversity_score)`` for the best candidate. When no
        candidate survives the adequacy or fluency filter, the fallback
        ``[(input_phrase, 0)]`` with the original, unsanitised input.
        NOTE(review): the fallback is a one-element list while the success
        value is a bare tuple; both shapes are preserved for existing callers.
    """
    import re

    device = "cuda:0" if use_gpu else "cpu"
    save_phrase = input_phrase
    fallback = [(save_phrase, 0)]

    # Note: this compares a character count against a token budget.
    if len(input_phrase) >= max_length:
        max_length += 32

    # Raw-string patterns avoid invalid-escape warnings; the character classes
    # accept exactly the same characters as before.
    input_phrase = re.sub(r"[^a-zA-Z0-9 ?'\-/:.]", '', input_phrase)
    input_phrase = "paraphrase: " + input_phrase
    print(input_phrase)

    # Keep the model and its inputs on the same device.
    model = model.to(device)
    input_ids = tokenizer.encode(input_phrase, return_tensors='pt')
    input_ids = input_ids.to(device)
    max_return_phrases = 10

    preds = model.generate(
        input_ids,
        do_sample=True,
        max_length=max_length,
        top_k=50,
        top_p=0.95,
        early_stopping=True,
        num_return_sequences=max_return_phrases,
    )

    # De-duplicate while keeping generation order so ties are broken deterministically.
    paraphrases = list(dict.fromkeys(
        re.sub(r"[^a-zA-Z0-9 ?'\-]", '', tokenizer.decode(pred, skip_special_tokens=True).lower())
        for pred in preds
    ))

    # The prefixed prompt (not the bare phrase) is what gets compared to the
    # candidates, exactly as before.
    if adequacy is None:
        adequacy = Adequacy()
    adequacy_filtered_phrases = adequacy.filter(input_phrase, paraphrases, adequacy_threshold, device)
    if len(adequacy_filtered_phrases) == 0:
        # Previously this path fell off the end of the function and returned None.
        return fallback

    if fluency is None:
        fluency = Fluency()
    fluency_filtered_phrases = fluency.filter(adequacy_filtered_phrases, fluency_threshold, device)
    if len(fluency_filtered_phrases) == 0:
        return fallback

    if diversity is None:
        diversity = Diversity()
    diversity_scored_phrases = diversity.rank(input_phrase, fluency_filtered_phrases, diversity_ranker)
    # max() returns the first of equally scored items, matching a stable descending sort.
    return max(diversity_scored_phrases.items(), key=lambda item: item[1])



In [29]:

blah = reprhase(input_phrase="Can you help me login to my account?", tokenizer=tokenizer, model = model)

paraphrase: Can you help me login to my account?
true
true
true
true
false
true
true
true
true
true


In [11]:
blah

{'contradiction',
 'falsch',
 'falses',
 'negative',
 'neutral',
 'positive',
 'true'}