<a href="https://colab.research.google.com/github/kaledai069/Answer-Validity-Checker-with-Word-Vectorizer-Neural-Nets/blob/master/Alternate_Solution_Ranker_Comparing_scoring_capability_between_ByT5_(BCS)_and_T5_Small_(One_Full_Epoch_Trained).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
!pip install -q pyspellchecker

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import torch
import pandas as pd
import random
import os
import time
import string

from tqdm import tqdm
from spellchecker import SpellChecker
from transformers import T5ForConditionalGeneration, AutoTokenizer, BartForConditionalGeneration, BartTokenizer

In [5]:
DATASET_PATH = "/content/gdrive/MyDrive/Clue-Answer Dataset/CLUE_ANSWER_DATA_VERSION_1.csv"
T5_RANKER_PATH = "/content/gdrive/MyDrive/Second Pass Model/t5_word_segmented_pair_2_epochs/"
BYT5_RANKER_PATH = "/content/gdrive/MyDrive/Second Pass Model/byt5_reranker/"
BART_RANKER_PATH = "/content/gdrive/MyDrive/Second Pass Model/bart-base-model-3M-one_epoch/"

In [6]:
# loading the dataset in a df and sampling random chunk for observation
clue_answer_df = pd.read_csv(DATASET_PATH)

In [7]:
sample_size = 2000
sample_df = clue_answer_df.sample(n = sample_size, random_state = 69)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# ByT5-small as the reranker model from Huggingface (BCS)
MODEL_BYT5 = 'google/byt5-small'
tokenizer_byt5 = AutoTokenizer.from_pretrained(MODEL_BYT5)
model_byt5 = T5ForConditionalGeneration.from_pretrained(BYT5_RANKER_PATH)

# T5-small as the reranker model trained with own data (3 Million Clue-Answer Pair)
MODEL_T5 = 't5-small'
tokenizer_t5 = AutoTokenizer.from_pretrained(MODEL_T5)
model_t5 = T5ForConditionalGeneration.from_pretrained(T5_RANKER_PATH)

# Bart-base as the reranker model trained for a single epoch on 3M clue-answer pair
MODEL_BART = 'facebook/bart-base'
tokenizer_bart = BartTokenizer.from_pretrained(MODEL_BART)
model_bart = BartForConditionalGeneration.from_pretrained(BART_RANKER_PATH)

tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [11]:
def inference_model(model_, tokenizer_, clue, answer, model_type):
  with torch.inference_mode():
    if model_type == 'byt5':
      inputs = tokenizer_(['Q: ' + clue], return_tensors='pt')['input_ids']

    else: # i.e. T5-small
      # cause i didn't use above like format for clue in the T5-small training
      inputs = tokenizer_(['Q: ' + clue], return_tensors='pt')['input_ids']

    labels = tokenizer_([answer], return_tensors='pt')['input_ids']
    loss = model_(inputs, labels = labels)
    answer_length = labels.shape[1]
    logprob = -loss[0].item() * answer_length
    return logprob

In [16]:
# comparing side-by-side results between ByT5 and Bart-base
result_df_list = []
num_samples = 10
for _ in range(num_samples):
  random_index = random.randint(0, len(sample_df) - 1)
  clue = sample_df['clue'].iloc[random_index]
  answer = sample_df['answer'].iloc[random_index]

  start_t = time.time()
  byt5_score = inference_model(model_byt5, tokenizer_byt5, clue, answer, "byt5")
  byt5_inf_time = time.time() - start_t

  start_t = time.time()
  bart_score = inference_model(model_bart, tokenizer_bart, clue, answer, "bart")
  bart_inf_time = time.time() - start_t

  result_df_list.append((clue, answer, byt5_score, bart_score, byt5_inf_time, bart_inf_time, byt5_inf_time / bart_inf_time))

result_df = pd.DataFrame(result_df_list, columns = ["Clue", 'Answer', 'ByT5 Score', 'BART Score', 'BART Inference Time', 'BART Inference Time', 'BART Time Factor'])
print(f"T5_small Model is {int(result_df['BART Time Factor'].mean())} times faster than ByT5_small for single inference.")

result_df.head(10)

T5_small Model is 4 times faster than ByT5_small for single inference.


Unnamed: 0,Clue,Answer,ByT5 Score,BART Score,BART Inference Time,BART Inference Time.1,BART Time Factor
0,robert mueller's agency,the fbi,-36.81134,-4.446746,1.302998,0.362882,3.590693
1,chuck (charles) lamb is a plant,lobelia,-5.852175,-4.254205,0.857822,0.223972,3.830044
2,called one's bluff,dared,-5.69822,-4.843416,0.677315,0.179946,3.763986
3,an ___ of prevention....,ounce,-0.005506,-0.017474,0.543561,0.138816,3.915696
4,joad family seeker,to m,-55.757532,-18.012772,0.484505,0.143193,3.383567
5,"like 2018's hereditary, for example",eerie,-6.568623,-8.998318,0.610896,0.143132,4.268066
6,the impression one gives is false,imitation,-12.021878,-4.876175,0.63048,0.147244,4.281858
7,la cenerentola composer and family,rossin is,-68.485556,-17.988733,0.63117,0.145307,4.343714
8,"1956 star of vadim's ""and god created woman""",bardot,-8.875616,-8.635655,0.684203,0.154163,4.438183
9,"y using indian's skin over colour, chiefly, th...",inhumanely,-20.32325,-15.289476,0.747178,0.170324,4.38681


In [None]:
# comparing side-by-side result
result_df_list = []
num_samples = 10
for _ in range(num_samples):
  random_index = random.randint(0, len(sample_df) - 1)
  clue = sample_df['clue'].iloc[random_index]
  answer = sample_df['answer'].iloc[random_index]

  start_t = time.time()
  byt5_score = inference_model(model_byt5, tokenizer_byt5, clue, answer, "byt5")
  byt5_inf_time = time.time() - start_t

  start_t = time.time()
  t5_score = inference_model(model_t5, tokenizer_t5, clue, answer, "t5")
  t5_inf_time = time.time() - start_t

  result_df_list.append((clue, answer, byt5_score, t5_score, byt5_inf_time, t5_inf_time, byt5_inf_time / t5_inf_time))

  # print(f"BYT5 Score: CLUE is {clue} & POSITIVE ANSWER is {answer} ---> {byt5_score}")
  # print(f"T5 Score: CLUE is {clue} & POSITIVE ANSWER is {answer} ---> {t5_score}")
  # print()
result_df = pd.DataFrame(result_df_list, columns = ["Clue", 'Answer', 'ByT5 Score', 'T5 Score', 'ByT5 Inference Time', 'T5 Inference Time', 'T5 Time Factor'])
print(f"T5_small Model is {int(result_df['T5 Time Factor'].mean())} times faster than ByT5_small for single inference.")

result_df.head(10)

T5_small Model is 8 times faster than ByT5_small for single inference.


Unnamed: 0,Clue,Answer,ByT5 Score,T5 Score,ByT5 Inference Time,T5 Inference Time,T5 Time Factor
0,'tess' star,kinski,-15.810485,-17.096556,0.558415,0.079601,7.015174
1,see articles with hate,loathe,-2.480923,-7.331069,0.671372,0.077159,8.701187
2,can't skip,must do,-48.287979,-8.234839,0.529486,0.073305,7.223091
3,begot,fathered,-21.444815,-10.880735,0.448263,0.073695,6.082649
4,note written without consonants,iou,-14.644562,-4.49532,0.751931,0.079365,9.474369
5,the only lottery urkel ever plays (wordplay),theirishdweebstakes,-35.56916,-40.626554,0.990014,0.110854,8.930757
6,classic d.c. train station,union,-7.736264,-9.583914,0.635041,0.058219,10.907805
7,kites bought in detroit (wordplay),flying tigers,-30.66953,-17.377535,0.578616,0.066761,8.666973
8,comparatively well,haler,-10.005526,-8.948947,0.39884,0.057228,6.969304
9,planned social occasions,"events,",-39.810776,-10.111033,0.461499,0.063823,7.230945


---
<center>T5_small Model is 8 times faster than ByT5_small for single inference.</center>

---

In [9]:
# testing if the byt5-reranker actually does gives score or not

# modification to be applied to positive answer to generate negative answers

def replace_random_chars(input_string, num_chars_to_replace):
    positions_to_replace = random.sample(range(len(input_string)), num_chars_to_replace)

    replaced_string = list(input_string)
    for position in positions_to_replace:
        replaced_string[position] = random.choice(string.ascii_letters.lower())

    return ''.join(replaced_string)

# random character omission from the input string

def remove_random_chars(input_string, num_chars_to_remove):
  positions_to_remove = random.sample(range(len(input_string)), 1)

  for i, pos in enumerate(positions_to_remove):
    input_string = input_string[:pos - i] + input_string[pos - i + 1 :]

  return input_string

In [10]:
# checking if the answer is a single valid word
spell = SpellChecker()

def get_alternate_solutions(answer):
  neg_answer_list = []
  if answer != '':
    # original single answer
    is_valid_word = spell.known([answer])

    # generating negative answers with single valid words
    if is_valid_word:
      # get two negative answers for single valid word
      for _ in range(3):
        if len(answer) >= 10:
          # concurrent two letter replacement
          neg_answer_with_replace = replace_random_chars(answer, 2 )
          neg_answer_list.append(neg_answer_with_replace.replace(' ', ''))

        else:
          # a single letter replacement
          neg_answer_with_replace = replace_random_chars(answer, 1)
          neg_answer_list.append(neg_answer_with_replace.replace(' ', ''))

      if len(answer) > 5:
        neg_answer_with_omission = remove_random_chars(answer, 1)
        neg_answer_list.append(neg_answer_with_omission.replace(' ', ''))

    # generate negative answers with unsegmented answers
    else:
      neg_answer_with_replace_list = []
      neg_answer_with_omission_list = []

      for _ in range(3):
        if len(answer) >= 15:
          neg_answer_with_replace_list.append(replace_random_chars(answer, 3))
          neg_answer_with_omission_list.append(remove_random_chars(answer, 2))

        if len(answer) >= 10:
          neg_answer_with_replace_list.append(replace_random_chars(answer, 2))
          neg_answer_with_omission_list.append(remove_random_chars(answer, 1))

        if len(answer) >= 5:
          neg_answer_with_replace_list.append(replace_random_chars(answer, 1))
          neg_answer_with_omission_list.append(remove_random_chars(answer, 1))

      for neg_answer in neg_answer_with_replace_list + neg_answer_with_omission_list:
        neg_answer_list.append(neg_answer.replace(' ', ''))
  return neg_answer_list

In [12]:
def compare_ans_with_neg_answer(model_, tokenizer_, sample_count, model_name, random_state = None):
  for _ in range(sample_count):
    if random_state is None:
      random_index = random.randint(0, len(sample_df) - 1)
    else:
      random_index = random_state

    clue = sample_df['clue'].iloc[random_index]
    answer = sample_df['answer'].iloc[random_index].replace(' ', '')
    neg_answer_list = get_alternate_solutions(answer)

    model_score = inference_model(model_, tokenizer_, clue, answer, model_name)
    print(f"{model_name} Score: Clue is '{clue}' & Postive Answer is '{answer}' ---> {model_score}")
    for neg_ans in neg_answer_list:
      model_score = inference_model(model_, tokenizer_, clue, neg_ans, model_name)
      print(f"{model_name} Score: Clue is '{clue}' & Negative Answer is [{neg_ans}] ---> {model_score}")
    print()

#### ByT5 scoring ability assessment and trend analysis with altered negative answers

In [13]:
compare_ans_with_neg_answer(model_byt5, tokenizer_byt5, 5, 'byt5')

byt5 Score: Clue is '"ships of inland commerce."' & Postive Answer is 'conestogawagons' ---> -13.34942626953125
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [cotestngawagonk] ---> -76.67035675048828
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [conestogkwugons] ---> -55.5800666809082
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [conestogawagrns] ---> -48.63178634643555
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [conostoehwagons] ---> -55.121368408203125
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [conesvogawagons] ---> -45.03205871582031
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [conestogawagbns] ---> -45.45990753173828
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [confstogawrgops] ---> -66.52779388427734
byt5 Score: Clue is '"ships of inland commerce."' & Negative Answer is [conesgogabagons] ---> -56

#### Bart-base scoring ability assessment and treand analysis with altered negative answers

In [15]:
compare_ans_with_neg_answer(model_bart, tokenizer_bart, 15, 'byt5')

byt5 Score: Clue is 'summer stock unit (wordplay)' & Postive Answer is 'rerun' ---> -8.967248916625977
byt5 Score: Clue is 'summer stock unit (wordplay)' & Negative Answer is [requn] ---> -25.049829483032227
byt5 Score: Clue is 'summer stock unit (wordplay)' & Negative Answer is [refun] ---> -25.946245193481445
byt5 Score: Clue is 'summer stock unit (wordplay)' & Negative Answer is [rerrn] ---> -31.57805633544922

byt5 Score: Clue is 'nose about, taking notice, about to get close to listen' & Postive Answer is 'bendanear' ---> -22.35840368270874
byt5 Score: Clue is 'nose about, taking notice, about to get close to listen' & Negative Answer is [beedanear] ---> -31.80578899383545
byt5 Score: Clue is 'nose about, taking notice, about to get close to listen' & Negative Answer is [bendaneer] ---> -30.01490879058838
byt5 Score: Clue is 'nose about, taking notice, about to get close to listen' & Negative Answer is [bendaneaa] ---> -39.51449203491211
byt5 Score: Clue is 'nose about, taking not

#### T5 scoring ability assessment and trend analysis with altered negative answers

In [None]:
compare_ans_with_neg_answer(model_t5, tokenizer_t5, 10, 't5')

t5 Score: Clue is 'e pluribus ___-' & Postive Answer is 'unum' ---> -10.301292657852173
t5 Score: Clue is 'e pluribus ___-' & Negative Answer is [unue] ---> -16.40703582763672
t5 Score: Clue is 'e pluribus ___-' & Negative Answer is [unus] ---> -10.94296145439148
t5 Score: Clue is 'e pluribus ___-' & Negative Answer is [nnum] ---> -15.708335876464844

t5 Score: Clue is 'typical new delhi worshiper' & Postive Answer is 'hindu' ---> -8.475731134414673
t5 Score: Clue is 'typical new delhi worshiper' & Negative Answer is [htndu] ---> -25.649831295013428
t5 Score: Clue is 'typical new delhi worshiper' & Negative Answer is [findu] ---> -20.646931171417236
t5 Score: Clue is 'typical new delhi worshiper' & Negative Answer is [hitdu] ---> -21.162238597869873

t5 Score: Clue is '"... ___ we extinguish sight and speech": browning' & Postive Answer is 'ere' ---> -1.9811012744903564
t5 Score: Clue is '"... ___ we extinguish sight and speech": browning' & Negative Answer is [erw] ---> -13.3852987289

Insight: So, a general trend with both BCS trained ByT5 and our T5-small is that the log probability or score for the positive answer is the lowest is counter for its absolute value.

#### Assessing the difference of scoring ability between fine-tuned 't5-small' and huggingface pretrained 't5-small' model.

In [None]:
# fine-tuned t5-small on partial clue-answer dataset
MODEL_T5 = 't5-small'
tokenizer_t5 = AutoTokenizer.from_pretrained(MODEL_T5)
model_t5 = T5ForConditionalGeneration.from_pretrained(T5_RANKER_PATH)

model_t5_pretrained = T5ForConditionalGeneration.from_pretrained(MODEL_T5)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# side-by-side comparison between fine-tuned and the pre-trained T5-small model
result_df_list = []
num_samples = 10
for _ in range(num_samples):
  random_index = random.randint(0, len(sample_df) - 1)
  clue = sample_df['clue'].iloc[random_index]
  answer = sample_df['answer'].iloc[random_index]
  # clue = 'Q: ' + clue
  start_t = time.time()
  t5_finetuned_score = inference_model(model_t5, tokenizer_t5, clue, answer, "t5")
  t5_finetuned_inf_time = time.time() - start_t

  start_t = time.time()
  t5_pretrained_score = inference_model(model_t5_pretrained, tokenizer_t5, clue, answer, "t5")
  t5_pretrained_inf_time = time.time() - start_t

  result_df_list.append((clue, answer, t5_finetuned_score, t5_pretrained_score, t5_finetuned_inf_time, t5_pretrained_inf_time, t5_pretrained_inf_time / t5_finetuned_inf_time))
result_df = pd.DataFrame(result_df_list, columns = ["Clue", 'Answer', 'T5 (Fine-Tuned) Score', 'T5 (Pre-Trained) Score', 'T5 (Fine-Tuned) Inference Time', 'T5 (Pre-Trained) Inference Time', 'T5 Time Factor'])
# print(f"T5_small Model is {int(result_df['T5 Time Factor'].mean())} times faster than ByT5_small for single inference.")

result_df.head(10)

Unnamed: 0,Clue,Answer,T5 (Fine-Tuned) Score,T5 (Pre-Trained) Score,T5 (Fine-Tuned) Inference Time,T5 (Pre-Trained) Inference Time,T5 Time Factor
0,Q: was apathetic.,moped,-8.274224,-26.979149,0.839202,0.392857,0.468132
1,Q: two-time triple crown winner,arcaro,-11.990229,-39.967548,0.314405,0.235514,0.749078
2,Q: burgundy wines,reds,-3.969891,-16.50027,0.202767,0.282063,1.391071
3,Q: 1996 christian slater movie,bed of roses,-15.735118,-42.545462,0.11535,0.229063,1.985819
4,Q: abbreviation in a help wanted a,eoe,-8.428806,-23.418467,0.120671,0.122185,1.012548
5,Q: helpless rector departs in a huff,naked,-7.801006,-24.09374,0.129929,0.109075,0.839496
6,Q: kind of mattress pa,eggcrate,-14.155943,-39.775558,0.100059,0.120623,1.205512
7,Q: certain class of artificial stimulants (wor...,strapons,-21.392548,-34.377415,0.115667,0.112806,0.975267
8,Q: impressive note for a soprano to hit,highc,-13.788865,-29.816079,0.115679,0.112956,0.976461
9,Q: ararat director egoyan,a tom,-8.966247,-23.298345,0.115884,0.114532,0.988328


In [None]:
ran_index = 169

compare_ans_with_neg_answer(model_t5, tokenizer_t5, 1, 't5', ran_index)
compare_ans_with_neg_answer(model_t5_pretrained, tokenizer_t5, 1, 't5', ran_index)

t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Postive Answer is 'fiesta' ---> -6.63912034034729
t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Negative Answer is [fievta] ---> -27.524006366729736
t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Negative Answer is [fitsta] ---> -21.804675579071045
t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Negative Answer is [fiemta] ---> -29.945266246795654
t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Negative Answer is [festa] ---> -7.876183748245239

t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Postive Answer is 'fiesta' ---> -23.506690979003906
t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Negative Answer is [fiestt] ---> -26.880847930908203
t5 Score: Clue is 'included in selfie staggering out of spanish celebration' & Ne

Basically, the fine-tuned model has the ability to score lower for the positive answer in comparison to the pretrained fetch t5_small model.

#### Checking for the answer type and segmented answer / unsegmented answer status

In [None]:
clue_answer_df['segmented answer'] = clue_answer_df['answer'].apply(lambda x: x.replace(' ', '') if not isinstance(x, float) else x)

In [None]:
spaced_answer = []
for ans in clue_answer_df['segmented answer']:
  if not isinstance(ans, float):
    if ' ' in ans:
      spaced_answer.append(ans)
len(spaced_answer)