In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive




*   Import all necessary libraries



In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.2
Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting mu

In [None]:
import os
import time
import datetime
from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import AutoModelForCausalLM, GenerationConfig, AutoTokenizer

# Set the seed value all over the place to make this reproducible.
SEED_VAL = 42

random.seed(SEED_VAL)
np.random.seed(SEED_VAL)
torch.manual_seed(SEED_VAL)
torch.cuda.manual_seed_all(SEED_VAL)
# used only for splitting the training set into train and val
# we don't want to randomly split the whole dataset
# we want to use the corpus's splits, so that we can compare results with others

from transformers import GPT2LMHeadModel, GPT2Config, GPT2LMHeadModel
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration)
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers.optimization import Adafactor, AdafactorSchedule

import nltk
nltk.download('punkt') # library that divides a text into a list of sentences

MAX_LENGTH = 1024

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Prepare data


*   Download 20 news groups using the sklearn library in Python
*   Acess the text data and store them in data.

In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Download the dataset's splits
newsgroups_data_train = fetch_20newsgroups(subset='train')

df = pd.DataFrame(newsgroups_data_train.data, columns=['news'])
df.head()

Unnamed: 0,news
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...




* Preprocessing the dataset "data" to extract the subject  and the message body from each message.
* Formating it to a specific structure with the subject followed by a summary indicator ('; TLDR: ') and the message body.
* In the test split, we don't add the message body, as that's what we want the model to learn to generate.
* However, we save it separately so that we can use evaluation metrics with the reference later.





In [None]:
import re
# tokenizer = T5Tokenizer.from_pretrained('t5-small')

subject_and_body_prompt, subject_prompt, body_output = [], [], []

for index, row in df.iterrows():
  el = row['news']
  lines = el.split('\n')

  body, subject, element = '', '', ''

  for line in lines:
    # save the subject
    if 'Subject:' in line:
      subject = line[len('Subject:') + 1:]

    # ignoring other headers
    elif len(re.findall("^[A-Za-z-_\.]+:", line)) != 0:
      continue

    # save the body, respecting the model's maximum nr of tokens
    elif len(line) > 1:
      if (len(body.split(' '))):
        body += line + ' '

  # input: subject ; TLDR: body
  # we put backwards so that the model learns to generate the continuation
  full_element = 'summarize: ' + subject + ';' + body[:-1]
  subject_element = 'summarize: ' + subject

  subject_and_body_prompt.append(full_element)
  subject_prompt.append(subject_element)
  body_output.append(body[:-1])

df['subject_and_body_prompt'] = subject_and_body_prompt
df['subject_prompt'] = subject_prompt
df['body_output'] = body_output

df = df.drop(columns=['news'])
df.head()
#len(df)

Unnamed: 0,subject_and_body_prompt,subject_prompt,body_output
0,summarize: WHAT car is this!?; I was wondering...,summarize: WHAT car is this!?,I was wondering if anyone out there could enl...
1,summarize: SI Clock Poll - Final Call;A fair n...,summarize: SI Clock Poll - Final Call,A fair number of brave souls who upgraded thei...
2,"summarize: PB questions...;well folks, my mac ...",summarize: PB questions...,"well folks, my mac plus finally gave up the gh..."
3,summarize: Re: Weitek P9000 ?;Robert J.C. Kyan...,summarize: Re: Weitek P9000 ?,Robert J.C. Kyanko (rob@rjck.UUCP) wrote: > ab...
4,summarize: Re: Shuttle Launch Question;From ar...,summarize: Re: Shuttle Launch Question,"From article <C5owCB.n3p@world.std.com>, by to..."


In [None]:
val   = df.sample(n=100, random_state=SEED_VAL)
train = df.loc[~df.index.isin(val.index)]

#Reset the indexes
val   = val.reset_index()
train = train.reset_index()

In [None]:
val.head()
#len(val)

# save val output as a reference for evaluation in the future
with open('drive/MyDrive/reference_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(row['body_output'] + '\n')

In [None]:
train.head()
#len(train)

Unnamed: 0,index,subject_and_body_prompt,subject_prompt,body_output
0,0,summarize: WHAT car is this!?; I was wondering...,summarize: WHAT car is this!?,I was wondering if anyone out there could enl...
1,1,summarize: SI Clock Poll - Final Call;A fair n...,summarize: SI Clock Poll - Final Call,A fair number of brave souls who upgraded thei...
2,2,"summarize: PB questions...;well folks, my mac ...",summarize: PB questions...,"well folks, my mac plus finally gave up the gh..."
3,3,summarize: Re: Weitek P9000 ?;Robert J.C. Kyan...,summarize: Re: Weitek P9000 ?,Robert J.C. Kyanko (rob@rjck.UUCP) wrote: > ab...
4,4,summarize: Re: Shuttle Launch Question;From ar...,summarize: Re: Shuttle Launch Question,"From article <C5owCB.n3p@world.std.com>, by to..."




*   Setting GPT2 Tokenizer





*   Defining a custom dataset 'GPT2Dataset' for pytorch which will be used for model.



In [None]:
# https://github.com/francoisstamant/lyrics-generation-with-GPT2/blob/main/GPT2_final.ipynb
class NewsDataset(Dataset):
  def __init__(self, dataframe, max_length=MAX_LENGTH, split='train'):
    self.tokenizer = AutoTokenizer.from_pretrained('t5-small')
    # self.tokenizer.pad_token = self.tokenizer.eos_token
    self.news_count = 0
    self.subjects = []
    self.bodies = []

    for index, row in dataframe.iterrows():
      subject = row['subject_prompt']
      body = row['body_output']

      subject_encoding = self.tokenizer(
          subject,
          return_tensors='pt',
          max_length=max_length,
          padding='max_length'
      )['input_ids'][0][:1024]

      body_encoding = self.tokenizer(
          body,
          return_tensors='pt',
          max_length=max_length,
          padding='max_length'
      )['input_ids'][0][:1024]

      self.subjects.append(subject_encoding.clone().detach())
      self.bodies.append(body_encoding.clone().detach())

    self.news_count = len(self.subjects)

  def __len__(self):
    return self.news_count

  def __getitem__(self, idx):
    return self.subjects[idx], self.bodies[idx]




Creating the custom dataset




In [None]:
train_dataset = NewsDataset(train)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [None]:
len(train_dataset)

11214

In [None]:
val_dataset = NewsDataset(val, split='val')

In [None]:
len(val_dataset)

100

# Training

In [None]:
# from transformers import AutoModelForPreTraining
tokenizer = AutoTokenizer.from_pretrained('t5-small')
model =T5ForConditionalGeneration.from_pretrained('t5-small')
# model = AutoModelForPreTraining.from_pretrained('gpt2')

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm

def train_model(
    dataset, model, tokenizer,
    batch_size=1, epochs=20, lr=2e-5,
    max_seq_len=MAX_LENGTH, warmup_steps=200,
    gpt2_type="gpt2", output_dir="drive/MyDrive/", output_prefix="gpt2_fine-tuning",
    test_mode=False,save_model_on_epoch=False,
):

    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()


    optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )
    train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    loss_values = []
    ppl_values = []
    for epoch in range(epochs):
        running_loss = 0.0
        running_ppl = 0.0

        print(f"Training epoch {epoch}")
        print('Loss: ', loss)
        if epoch != 0:
            print('Average loss: ', loss_values[-1])
            print('Average perplexity: ', ppl_values[-1])

        for idx, entry in tqdm(enumerate(train_dataloader)):
            subject, body = entry[0], entry[1]

            if (subject.size()[-1] != body.size()[-1]):
                #print('different sizes')
                continue
            #print(subject.size())
            #print(body.size())
            input_tensor = subject.to(device)
            continuation = body.to(device)
            outputs = model(input_tensor, labels=continuation)
            loss = outputs[0]
            loss.backward()

            running_loss =+ loss.item() * batch_size # batch size

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None

            # # # # # # # # perplexity # # # # # # # #
            encodings = body
            max_length = model.config.n_positions
            stride = 512
            seq_len = len(encodings)

            nlls = []
            prev_end_loc = 0
            for begin_loc in range(0, seq_len, stride):
                end_loc = min(begin_loc + max_length, seq_len)
                trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
                input_ids = encodings[begin_loc:end_loc].to(device)
                target_ids = input_ids.clone()
                target_ids[:-trg_len] = -100

                with torch.no_grad():
                    outputs = model(input_ids, labels=target_ids)
                    neg_log_likelihood = outputs.loss

                nlls.append(neg_log_likelihood)
                prev_end_loc = end_loc
                if end_loc == seq_len:
                    break
            ppl = torch.exp(torch.stack(nlls).mean())
            running_ppl =+ ppl * batch_size # batch size
            # # # # # # # #

        loss_values.append(running_loss / len(dataset))
        ppl_values.append(running_ppl / len(dataset))

    #plt.plot(loss_values)
    #plt.plot(ppl_values)
    return model, loss_values, ppl_values

In [None]:
#!pip install light-the-torch
#!ltt install torch torchvision

In [None]:
model, loss_values, ppl_values = train_model(train_dataset, model, tokenizer, epochs=4)

Training epoch 0
Loss:  0


11214it [1:07:35,  2.76it/s]


Training epoch 1
Loss:  tensor(3.3771, device='cuda:0', grad_fn=<NllLossBackward0>)
Average loss:  0.000301147306669156
Average perplexity:  tensor(0.0018, device='cuda:0')


11214it [1:07:30,  2.77it/s]


Training epoch 2
Loss:  tensor(2.3564, device='cuda:0', grad_fn=<NllLossBackward0>)
Average loss:  0.0002101278356079624
Average perplexity:  tensor(0.0005, device='cuda:0')


11214it [1:07:28,  2.77it/s]


Training epoch 3
Loss:  tensor(1.6826, device='cuda:0', grad_fn=<NllLossBackward0>)
Average loss:  0.00015004178260298405
Average perplexity:  tensor(0.0003, device='cuda:0')


4400it [26:23,  2.72it/s]

In [None]:
torch.save(model.state_dict(), 'drive/MyDrive/fine-tuned-t5.pt')

In [None]:
loss_values

In [None]:
ppl_values

# Generation

In [None]:
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model.load_state_dict(torch.load('drive/MyDrive/fine-tuned-t5.pt'))
model.eval()

tokenizer = T5Tokenizer.from_pretrained('t5-small')
# tokenizer.pad_token = tokenizer.eos_token

# add the EOS token as PAD token to avoid warnings
# model = AutoModelForCausalLM.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

## Greedy search

In [None]:
from tqdm import tqdm

greedy_outputs = []
for index, row in val.iterrows():
  # encode context the generation is conditioned on
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt')

  greedy_output = model.generate(
      **model_inputs,
      max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]))

  text = tokenizer.decode(greedy_output[0], skip_special_tokens=True)
  greedy_outputs.append(text)

In [None]:
for index, row in val.iterrows():
  print(row['subject_prompt'])
  print(greedy_outputs[index])
  if index == 3:
    break

In [None]:
# save output to file
with open('drive/MyDrive/t5-greedy_output-2.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(greedy_outputs[index].replace("\n", " ") + '\n')

## Beam search

In [None]:
from tqdm import tqdm
beam_outputs = []

for index, row in val.iterrows():
  # encode context the generation is conditioned on
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt')

  beam_output = beam_output = model.generate(
    **model_inputs,
    max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
  )

  text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
  beam_outputs.append(text)

In [None]:
# save output to file
with open('drive/MyDrive/t5-beam_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(beam_outputs[index][len(row['subject_prompt']):].replace("\n", " ") + '\n')

## Top-k sampling

In [None]:
from tqdm import tqdm

topk_outputs = []
for index, row in val.iterrows():
  # encode context the generation is conditioned on
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt')

  topk_output = sample_output = model.generate(
      **model_inputs,
      max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
      do_sample=True,
      top_k=50
  )

  text = tokenizer.decode(topk_output[0], skip_special_tokens=True)
  topk_outputs.append(text)

In [None]:
# save output to file
with open('drive/MyDrive/t5-topk_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(topk_outputs[index][len(row['subject_prompt']):].replace("\n", " ") + '\n')

## Top-p sampling

In [None]:
from tqdm import tqdm

topp_outputs = []
for index, row in val.iterrows():
  # encode context the generation is conditioned on
  model_inputs = tokenizer(row['subject_prompt'], return_tensors='pt')

  topp_output = model.generate(
      **model_inputs,
      max_new_tokens=MAX_LENGTH - len(model_inputs['input_ids'][0]),
      do_sample=True,
      top_p=0.92,
      top_k=0
  )


  text = tokenizer.decode(topp_output[0], skip_special_tokens=True)
  topp_outputs.append(text)

In [None]:
# save output to file
with open('drive/MyDrive/t5-topp_output.txt', 'w') as outfile:
  for index, row in val.iterrows():
    outfile.write(topp_outputs[index][len(row['subject_prompt']):].replace("\n", " ") + '\n')

# Evaluation

## SEScore

[Source code](https://github.com/xu1998hz/SEScore)

- Currently down... See implementation and if it's feasible to recreate it (e.g., if it doesn't require too many resources)

## EmbSim

[Source code](https://github.com/geek-ai/Texygen)

## NLLTest

[Source code](https://github.com/geek-ai/Texygen)

From [PyTorch](https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html#torch.nn.NLLLoss): `torch.nn.functional.nll_loss(input, target)`
- `input` has dimensions (N,C) and contains log-probabilities of each word generated by the model. Each entry i is the log-prob of word i being from class j (equal to word j)

- `target` has dimension (N) and contains the real index in the vocabulary of the real input data. Each entry i is the index of the word (between 0 and C-1)

# Calculate Scores of Metrics

In [None]:
metrics = {}
# https://blog.paperspace.com/automated-metrics-for-evaluating-generated-text/ might be helpful

In [None]:
%pip install nltk
%pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
hypothesis = "It is a guide to action which ensures that the military always obeys the commands of the party. He read the book because he was interested in world history."
references = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."
"""
hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
    ...         'ensures', 'that', 'the', 'military', 'always',
    ...         'obeys', 'the', 'commands', 'of', 'the', 'party']
    >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
    ...          'ensures', 'that', 'the', 'military', 'will', 'forever',
    ...          'heed', 'Party', 'commands']
    >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
    ...          'guarantees', 'the', 'military', 'forces', 'always',
    ...          'being', 'under', 'the', 'command', 'of', 'the', 'Party']
    >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
    ...          'army', 'always', 'to', 'heed', 'the', 'directions',
    ...          'of', 'the', 'party']

    >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
    ...         'interested', 'in', 'world', 'history']
    >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
    ...          'because', 'he', 'read', 'the', 'book']
"""

import re

def clean_tokens(sentence_tokenized):
  for i in range(len(sentence_tokenized)):
    sentence_tokenized[i] = re.sub('[^A-Za-z ]', '', sentence_tokenized[i])
  return sentence_tokenized



In [None]:
print(generated_output)
print(reference_output)

['\n', '\n', '\n', '\n', 'g.\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 's.\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', ' form of sexism.\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 'in a row.\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', ' u.s.\n', '\n', '\n', 'ble for mac mobile.\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', 'gland scout.\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n']


In [None]:
from nltk.translate.bleu_score import corpus_bleu
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

#information from https://www.kaggle.com/code/ralphkrueger/nltk-bleu-score-calculator
#Calculate and print BLEU-1, using 1-grams as highest-order n-grams
#Reference is placed in [square brackets] because you can score the machine-translated sentence against multiple references.
#The weights are set so that calculation is based solely on 1-gram precision.

#https://www.nltk.org/_modules/nltk/translate/bleu_score.html
#reference_text = "It is a guide to action which ensures that the military always obeys the commands of the party. He read the book because he was interested in world history."
#candidate_text = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."
#reference_text = sent_tokenize(reference_text)
#candidate_text = sent_tokenize(candidate_text)

reference_text = reference_output
candidate_text = generated_output

reference_text = [word_tokenize(t) for t in clean_tokens(reference_text)]
candidate_text = [word_tokenize(t) for t in clean_tokens(candidate_text)]
if len(reference_text) > len(candidate_text):
  reference_text = reference_text[:len(candidate_text)]
else:
  candidate_text = candidate_text[:len(reference_text)]
#print(reference_text)
#print(candidate_text)

def bleu1(reference_text, candidate_text):
  bleu_score = corpus_bleu(reference_text, candidate_text, weights=(1, 0))
  metrics["BLEU-1"] = bleu_score

def bleu2(reference_text, candidate_text):
  bleu_score = corpus_bleu(reference_text, candidate_text, weights=(0.5, 0.5))
  metrics["BLEU-2"] = bleu_score

def bleu3(reference_text, candidate_text):
  bleu_score = corpus_bleu(reference_text, candidate_text, weights=(0.333, 0.333, 0.334))
  metrics["BLEU-3"] = bleu_score

def bleu4(reference_text, candidate_text):
  bleu_score = corpus_bleu(reference_text, candidate_text, weights=(0.25, 0.25, 0.25, 0.25))
  metrics["BLEU-4"] = bleu_score

print(bleu1(reference_text, candidate_text))
print(bleu2(reference_text, candidate_text))
print(bleu3(reference_text, candidate_text))
print(bleu4(reference_text, candidate_text))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


None
None


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


None
None


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
import numpy as np
import copy
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
import nltk
nltk.download('punkt')

#reference_text = "It is a guide to action which ensures that the military always obeys the commands of the party. He read the book because he was interested in world history."
#candidate_text = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."
#reference_text = sent_tokenize(reference_text)
#candidate_text = sent_tokenize(candidate_text)
#reference_text = [word_tokenize(element) for element in reference_text]

def self_bleu(reference_text, candidate_text):
  self_bleu_scores = []

  ### get rid of special characters and split sentences
  reference_text = clean_tokens(reference_text)
  candidate_text = clean_tokens(candidate_text)
#  print("REFERENCE:   ", reference_text)

  for i in range(len(candidate_text)):
    #have all references == candidate text without one sentence we want to use for score
    remaining_sentences = candidate_text.copy()
    remaining_sentences.pop(i)
#    hyp = candidate_text[i]

    hyp = word_tokenize(candidate_text[i])
    remaining_sentences = [word_tokenize(element) for element in remaining_sentences]

    bleu_score = sentence_bleu(remaining_sentences, hyp)
    self_bleu_scores.append(bleu_score)
#    print(bleu_score)

#  for i in candidate_text:
#    sentence_copy = copy.deepcopy(candidate_text)
#    remaining_sentences = sentence_copy.remove(i)
#    sentence_copy = word_tokenize(sentence_copy)
#    remaining_sentences = [word_tokenize(element) for element in remaining_sentences]
#    bleu_score = sentence_bleu(remaining_sentences, sentence_copy)
#    self_bleu_scores.append(bleu_score)
#    print(bleu_score)

  avg_self_bleu = np.mean(self_bleu_scores)
  metrics["SELF-BLEU"] = avg_self_bleu

  return avg_self_bleu

print(self_bleu(reference_output, generated_output))

It is a guide to action which ensures that the military always obeys the commands of the party. He read the book because he was interested in world history.
It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book.
7.57965434483665e-155
3.6718992240469637e-78
3.844853295436682e-78
6.3497053018839554e-232
1.8791881298709114e-78


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
%pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
# https://pypi.org/project/rouge/?ref=blog.paperspace.com
from rouge import Rouge
rouge = Rouge()

#reference_text = "It is a guide to action which ensures that the military always obeys the commands of the party. He read the book because he was interested in world history."
#candidate_text = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."

def calculate_rouge(reference_text, candidate_text):
    scores = rouge.get_scores(candidate_text, reference_text)
    return scores

print(calculate_rouge(reference_output, generated_output))

[{'rouge-1': {'r': 0.9615384615384616, 'p': 0.625, 'f': 0.7575757528007345}, 'rouge-2': {'r': 0.6428571428571429, 'p': 0.32727272727272727, 'f': 0.43373493528814056}, 'rouge-l': {'r': 0.8076923076923077, 'p': 0.525, 'f': 0.6363636315886134}}]


In [None]:
#METEOR SCORE
from nltk.translate import meteor
import numpy as np
import copy
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.translate.bleu_score import sentence_bleu
import nltk
nltk.download('punkt')
nltk.download('wordnet')

#def calculate_meteor(candidate, reference):
def calculate_meteor(reference_text, candidate_text):
  reference_text = clean_tokens(reference_text)
  candidate_text = clean_tokens(candidate_text)
#  reference_text = clean_tokens(sent_tokenize(reference_text))
#  candidate_text = clean_tokens(sent_tokenize(candidate_text))
#  print("REFERENCE:   ", reference_text)

  meteor_scores = []
  for i in range(len(candidate_text)):
    #have all references == candidate text without one sentence we want to use for score
    remaining_sentences = candidate_text.copy()
    remaining_sentences.pop(i)

    hyp = candidate_text[i]
#    print("SENTENCE: ",hyp)
#    print("REMAINING: ", remaining_sentences)

    hyp = word_tokenize(hyp)
    remaining_sentences = [word_tokenize(element) for element in remaining_sentences]

    meteor_score = round(meteor(remaining_sentences, hyp), 4)
    meteor_scores.append(meteor_score)

  avg_meteor = np.mean(meteor_scores)
  metrics["meteor"] = avg_meteor
  print(f"METEOR Score: {avg_meteor}")

  return avg_meteor

#reference_text1 = "It3 5 is a guide to action which ensures that the military8 always obeys the commands of the party. He read the book because he was interested in world history."
#candidate_text1 = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."

calculate_meteor(reference_output, generated_output)


METEOR Score: 0.345925


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0.345925

### Dist-N (N-Gram Repetition Rates & N-Gram Diversity)

In [None]:
# N-Gram Repetition Rates - aka Dist-N
'''
A low diversity score suggests the model suffers from repetition, and a high diversity score means the
model generated text is lexically diverse. - https://arxiv.org/pdf/2210.15097.pdf Lisa Li, page 3
'''

# code from https://github.com/neural-dialogue-metrics/Distinct-N/tree/main

#helper methods
from itertools import chain
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

def pad_sequence(sequence, n, pad_left=False, pad_right=False,
                 left_pad_symbol=None, right_pad_symbol=None):
    sequence = iter(sequence)
    if pad_left:
        sequence = chain((left_pad_symbol,) * (n - 1), sequence)
    if pad_right:
        sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
    return sequence


def ngrams(sequence, n, pad_left=False, pad_right=False,
           left_pad_symbol=None, right_pad_symbol=None):
    sequence = pad_sequence(sequence, n, pad_left, pad_right,
                            left_pad_symbol, right_pad_symbol)

    history = []
    while n > 1:
        history.append(next(sequence))
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]

# the real deal
def distinct_n_sentence_level(sentence, n):
    if len(sentence) == 0:
        return 0.0  # Prevent a zero division
    distinct_ngrams = set(ngrams(sentence, n))
    return len(distinct_ngrams) / len(sentence)


def distinct_n_corpus_level(sentences, n):
    return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences)

#def calculate_distinctn(candidate_text, ngram_num):
#  return distinct_n_corpus_level(candidate_text, ngram_num)

#candidate_text = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."
#candidate_text = "Machine learning is a fascinating field that encompasses a wide range of techniques and algorithms. It involves the use of statistical models and computer systems to perform tasks without explicit programming. Natural language processing, image recognition, and recommendation systems are just a few applications of machine learning. The algorithms learn from data and make predictions or decisions based on that learning. N-gram metrics can be applied to analyze the structure and patterns within this diverse field, providing insights into the relationships between words and phrases. The integration of n-gram analysis enhances our understanding of the language used in machine learning literature and contributes to refining the algorithms for even more accurate predictions."
candidate_text = clean_tokens(generated_output)

ngram_num = 2

print(distinct_n_corpus_level(candidate_text, ngram_num))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


0.7026769867219591


In [None]:
# N-Gram Diversity
# DIV = Sum of Dist-N

from nltk import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

def calculate_ngram_diversity(text, n):
    text = clean_tokens(text)
    # Tokenize the text into words
    words = [word_tokenize(t) for t in text]

    diversity = []
    for t in text:
      words = word_tokenize(t)
      # Generate N-grams
      ngrams_list = list(ngrams(words, n))
      # Calculate diversity by counting distinct N-grams
      diversity.append(len(set(ngrams_list)))

    return diversity

# Set N for N-grams (e.g., N=2 for bigrams)
n_value = 2
#candidate_text = "Machine learning is a fascinating field that encompasses a wide range of techniques and algorithms. It involves the use of statistical models and computer systems to perform tasks without explicit programming. Natural language processing, image recognition, and recommendation systems are just a few applications of machine learning. The algorithms learn from data and make predictions or decisions based on that learning. N-gram metrics can be applied to analyze the structure and patterns within this diverse field, providing insights into the relationships between words and phrases. The integration of n-gram analysis enhances our understanding of the language used in machine learning literature and contributes to refining the algorithms for even more accurate predictions."
#candidate_text = clean_tokens(sent_tokenize(candidate_text))

# Calculate N-Gram Diversity
diversity = calculate_ngram_diversity(generated_output, n_value)

print(f"N-Gram Diversity (N={n_value}): {diversity}")

N-Gram Diversity (N=2): [14, 15, 15, 13, 23, 26]


### Shannon entropy

https://arxiv.org/pdf/2004.10450.pdf - the paper Johann referenced

--> In the proposed framework, we evaluate the quality of a single sentence x ∈ X by asking humans for a quality
judgment HJ(x). We can define the quality Q of a model as the expected human “quality” judgment for sentences
drawn from it: Q(p) = Ex∼p[HJ(x)]

-> so we need people to score the quality of the sentences and then use this to measure the quality

https://people.math.harvard.edu/~ctm/home/text/others/shannon/entropy/entropy.pdf - original paper by shannon

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html - package used here

In [None]:
import numpy as np
from scipy.stats import entropy

def calculate_shannon(base, HJ): # we need a base and an array of human judgement scores
  base = base  # work in units of bits, typically 2
  H = entropy(HJ, base=base)

  metrics["shannon"] = H
  print(f"Shannon Entropy: {H}")

  return H

# GPT

In [None]:
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
from nltk.translate.meteor_score import meteor_score
from rouge import Rouge
from nltk import ngrams
from nltk.util import ngrams as nltk_ngrams
from nltk.tokenize import word_tokenize
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
nltk.download('punkt')

# Sample reference and candidate texts
reference_text = "This is a reference sentence. It serves as an example for testing similarity metrics."
candidate_text = "This is an example sentence used for testing the similarity metrics."

# Metrics dictionary to store scores
metrics_dict = {}

# 1. BLEU Score
bleu_score = sentence_bleu([word_tokenize(reference_text)], word_tokenize(candidate_text))
metrics_dict["BLEU"] = bleu_score

# 2. SELF-BLEU Score
self_bleu_scores = []
#for _ in range(10):  # You can adjust the number of iterations
for i in candidate_tokens:
    reference_set = [word_tokenize(reference_text)]
    candidate_set = word_tokenize(candidate_text)
#    self_bleu_scores.append(corpus_bleu([reference_set] * len(candidate_set), [candidate_set]))
    self_bleu_scores.append(corpus_bleu([reference_set], [candidate_set]))

avg_self_bleu = np.mean(self_bleu_scores)
metrics_dict["SELF-BLEU"] = avg_self_bleu

# 3. ROUGE Score
#scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#rouge_scores = scorer.score(reference_text, candidate_text)
rouge = Rouge()
rouge_score = rouge.get_scores([word_tokenize(candidate_text)], word_tokenize(reference_text))
metrics_dict["ROUGE"] = rouge_score

# 4. METEOR Score
meteor = meteor_score([reference_text], candidate_text)
metrics_dict["METEOR"] = meteor

# 5. MAUVE Score
# Assuming you have a function `mauve_metric` from a previous example
# Please replace the function with your actual implementation
mauve_score = mauve_metric(candidate_text, reference_text)
metrics_dict["MAUVE"] = mauve_score

# 6. N-gram Repetition Rates
def ngram_repetition_rate(text, n):
    words = word_tokenize(text)
    ngrams_list = list(ngrams(words, n))
    ngram_counter = Counter(ngrams_list)
    repetition_rate = sum(count > 1 for count in ngram_counter.values()) / len(ngrams_list)
    return repetition_rate

ngram_repetition_rate_2 = ngram_repetition_rate(candidate_text, 2)
metrics_dict["N-gram Repetition Rate (N=2)"] = ngram_repetition_rate_2

# 7. Repetitiveness (Cosine Similarity of TF-IDF Vectors)
def repetitiveness(reference, candidate):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform([reference, candidate])
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix[0, 1]

repetitiveness_score = repetitiveness(reference_text, candidate_text)
metrics_dict["Repetitiveness"] = repetitiveness_score

# Print the metrics dictionary
print("Metrics Dictionary:")
for metric, score in metrics_dict.items():
    print(f"{metric}: {score}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


AssertionError: ignored

# Old Tries

## BLEU and SELF-BLEU

In [None]:
#%pip install nltk



In [None]:
# Calculate BLEU score or other evaluation metrics, NEED to use NLP library for BLEU calculation)
from nltk.translate.bleu_score import sentence_bleu

def calculate_bleu():
  # Calculate BLEU score
  bleu_score = sentence_bleu([reference_tokens], candidate_tokens)
  metrics["bleu"] = bleu_score

  print(f"BLEU Score: {bleu_score}")

BLEU Score: 2.827255547394629e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# SELF-BLEU - needs to be fitted to our code

import numpy as np
import copy

'''
def get_bleu_score(sentence, remaining_sentences):
    lst = []
    for i in remaining_sentences:
        bleu = sentence_bleu(sentence, i)
        lst.append(bleu)
    return lst
'''

def calculate_selfBleu(candidate_tokens):
    '''
    sentences - list of sentences generated by NLG system -> in our case candidate_tokens
    '''
    bleu_scores = []

    for i in candidate_tokens:
        sentences_copy = copy.deepcopy(candidate_tokens)
        remaining_sentences = sentences_copy.remove(i)
#        print(sentences_copy)
#        bleu = get_bleu_score(i,sentences_copy)
        bleu = calculate_bleu(i,sentences_copy)
        bleu_scores.append(bleu)

    self_bleu = np.mean(bleu_scores)

    metrics["self_bleu"] = self_bleu
    print(f"SELF_BLEU Score: {self_bleu}")

    return self_bleu

## ROUGE

In [None]:
pip install rouge

In [None]:
from rouge import Rouge
rouge = Rouge()

#def calculate_rouge(candidate, reference):
def calculate_rouge():
    '''
    candidate, reference: generated and ground-truth sentences
    '''
    # Tokenize your reference and generated texts into lists of words or tokens
    rouge_score = rouge.get_scores([candidate_tokens], reference_tokens)
    metrics["rouge"] = rouge_score
    print(f"ROUGE Score: {rouge_score}")

    return scores

## METEOR

In [None]:
#METEOR SCORE
from nltk.translate import meteor

#def calculate_meteor(candidate, reference):
def calculate_meteor():
  '''
  candidate, reference: tokenized list of words in the sentence
  '''
#  reference_tokens = word_tokenize(reference)
#  candidate_tokens = word_tokenize(candidate)
  meteor_score = round(meteor([candidate_tokens],reference_tokens), 4)
  metrics["meteor"] = meteor_score
  print(f"METEOR Score: {meteor_score}")

  return meteor_score


## Word Movers Distance

In [None]:
%pip install ot

In [None]:
from gensim.models import Word2Vec
from nltk.corpus import stopwords
#import nltk
#nltk.download('stopwords')
import gensim.downloader as api

# Load pre-trained Word2Vec model (example using 'word2vec-google-news-300')
word2vec_model = api.load("word2vec-google-news-300")

# Function to calculate Word Mover's Distance
def calculate_wmdistance(doc1, doc2, model, stop_words):
    # Tokenize and remove stopwords
    doc1_tokens = [word for word in doc1.lower().split() if word not in stop_words]
    doc2_tokens = [word for word in doc2.lower().split() if word not in stop_words]

    # Calculate Word Mover's Distance
    distance = model.wmdistance(doc1_tokens, doc2_tokens)

    return distance

# Example documents
document1 = "Machine learning is fascinating."
document2 = "Natural language processing is an interesting field of study."

# Download stopwords from NLTK
stop_words = set(stopwords.words('english'))

# Calculate Word Mover's Distance
wmdistance = calculate_wmdistance(document1, document2, word2vec_model, stop_words)

print(f"Word Mover's Distance: {wmdistance}")

ModuleNotFoundError: ignored

## Metrics open for discussion (aka perplexity)

Perplexity

In [None]:
'''
PERPLEXITY
we can compute it on already seen set or on part of training set that is new
'''

import math

def calculate_perplexity(language_model, test_data):
    total_log_prob = 0
    total_words = 0

    for sentence in test_data:
        context = []  # Initial context
        for word in sentence:
            total_words += 1
            probability = language_model.get_word_probability(word, context)
            total_log_prob += math.log(probability)
            context.append(word)  # Update context for the next word

    perplexity = math.exp(-total_log_prob / total_words)
    return perplexity




In [None]:
#approximate the purpose of a text by calculating its lexicon-based topicality scores
# https://github.com/Ejhfast/empath-client
pip install empath
from empath import Empath
lexicon = Empath()


### MAUVE

In [None]:
# MAUVE: https://github.com/krishnap25/mauve-experiments || https://github.com/krishnap25/mauve
pip install nltk==3.4.5
# pip install transformers==4.2.0
pip install scikit-learn==0.22.1
pip install faiss-gpu==1.7.0
pip install tqdm==4.40.0 # or higher for all
'''
numpy>=1.18.1
scikit-learn>=0.22.1
faiss-cpu>=1.7.0
tqdm>=4.40.0
'''



## Diversity Metrics

### Dist-N (N-Gram Repetition Rates & N-Gram Diversity)

In [None]:
# N-Gram Repetition Rates - aka Dist-N
'''
A low diversity score suggests the model suffers from repetition, and a high diversity score means the
model generated text is lexically diverse. - https://arxiv.org/pdf/2210.15097.pdf Lisa Li, page 3
'''

# code from https://github.com/neural-dialogue-metrics/Distinct-N/tree/main

#helper methods
from itertools import chain
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')

def pad_sequence(sequence, n, pad_left=False, pad_right=False,
                 left_pad_symbol=None, right_pad_symbol=None):
    sequence = iter(sequence)
    if pad_left:
        sequence = chain((left_pad_symbol,) * (n - 1), sequence)
    if pad_right:
        sequence = chain(sequence, (right_pad_symbol,) * (n - 1))
    return sequence


def ngrams(sequence, n, pad_left=False, pad_right=False,
           left_pad_symbol=None, right_pad_symbol=None):
    sequence = pad_sequence(sequence, n, pad_left, pad_right,
                            left_pad_symbol, right_pad_symbol)

    history = []
    while n > 1:
        history.append(next(sequence))
        n -= 1
    for item in sequence:
        history.append(item)
        yield tuple(history)
        del history[0]

# the real deal
def distinct_n_sentence_level(sentence, n):
    if len(sentence) == 0:
        return 0.0  # Prevent a zero division
    distinct_ngrams = set(ngrams(sentence, n))
    return len(distinct_ngrams) / len(sentence)


def distinct_n_corpus_level(sentences, n):
    return sum(distinct_n_sentence_level(sentence, n) for sentence in sentences) / len(sentences)

#def calculate_distinctn(candidate_text, ngram_num):
#  return distinct_n_corpus_level(candidate_text, ngram_num)

#reference_text = "It is a guide to action which ensures that the military always obeys the commands of the party. He read the book because he was interested in world history."
#candidate_text = "It is a guide to action that ensures that the military will forever heed Party commands. It is the guiding principle which guarantees the military forces always being under the command of the Party. It is the practical guide for the army always to heed the directions of the  party. He was interested in world history because he read the book."
candidate_text = "Machine learning is a fascinating field that encompasses a wide range of techniques and algorithms. It involves the use of statistical models and computer systems to perform tasks without explicit programming. Natural language processing, image recognition, and recommendation systems are just a few applications of machine learning. The algorithms learn from data and make predictions or decisions based on that learning. N-gram metrics can be applied to analyze the structure and patterns within this diverse field, providing insights into the relationships between words and phrases. The integration of n-gram analysis enhances our understanding of the language used in machine learning literature and contributes to refining the algorithms for even more accurate predictions."
#reference_text = clean_tokens(sent_tokenize(reference_text))
candidate_text = clean_tokens(sent_tokenize(candidate_text))

ngram_num = 2

print(distinct_n_corpus_level(candidate_text, ngram_num))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


0.7026769867219591


In [None]:
# N-Gram Diversity
# DIV = Sum of Dist-N

from nltk import ngrams
from nltk.tokenize import word_tokenize
from collections import Counter

def calculate_ngram_diversity(text, n):
    # Tokenize the text into words
    words = [word_tokenize(t) for t in text]

    diversity = []
    for t in text:
      words = word_tokenize(t)
      # Generate N-grams
      ngrams_list = list(ngrams(words, n))
      # Calculate diversity by counting distinct N-grams
      diversity.append(len(set(ngrams_list)))

    return diversity

# Set N for N-grams (e.g., N=2 for bigrams)
n_value = 2
candidate_text = "Machine learning is a fascinating field that encompasses a wide range of techniques and algorithms. It involves the use of statistical models and computer systems to perform tasks without explicit programming. Natural language processing, image recognition, and recommendation systems are just a few applications of machine learning. The algorithms learn from data and make predictions or decisions based on that learning. N-gram metrics can be applied to analyze the structure and patterns within this diverse field, providing insights into the relationships between words and phrases. The integration of n-gram analysis enhances our understanding of the language used in machine learning literature and contributes to refining the algorithms for even more accurate predictions."
candidate_text = clean_tokens(sent_tokenize(candidate_text))

# Calculate N-Gram Diversity
diversity = calculate_ngram_diversity(candidate_text, n_value)

print(f"N-Gram Diversity (N={n_value}): {diversity}")

N-Gram Diversity (N=2): [14, 15, 15, 13, 23, 26]


### Shannon entropy

https://arxiv.org/pdf/2004.10450.pdf - the paper Johann referenced

--> In the proposed framework, we evaluate the quality of a single sentence x ∈ X by asking humans for a quality
judgment HJ(x). We can define the quality Q of a model as the expected human “quality” judgment for sentences
drawn from it: Q(p) = Ex∼p[HJ(x)]

-> so we need people to score the quality of the sentences and then use this to measure the quality

https://people.math.harvard.edu/~ctm/home/text/others/shannon/entropy/entropy.pdf - original paper by shannon

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.entropy.html - package used here

In [None]:
import numpy as np
from scipy.stats import entropy

def calculate_shannon(base, HJ): # we need a base and an array of human judgement scores
  base = base  # work in units of bits, typically 2
  H = entropy(HJ, base=base)

  metrics["shannon"] = H
  print(f"Shannon Entropy: {H}")

  return H

### Syntactic and lexical diversity

Syntactic and lexical diversity - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8049133/pdf/peerj-cs-07-443.pdf page 7

NE-recognizer and POS-tagger provided in the Python spaCy (https://spacy.io/) package to find the NE- and POS-tags as well as the neuralcoref (https://github.com/huggingface/neuralcoref) extension to detect coreference clusters

In [None]:
pip install $(spacy info en_core_web_sm --url)

### Repetitiveness

From Fröhling: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8049133/pdf/peerj-cs-07-443.pdf

We try to expose those statistical differences, assumed to be easiest to be picked up by automated detection methods, through the share of stop-words, unique words and words from “top-lists” in a text’s total words. We expect a more diverse, human-written text
to have a higher share of unique words and a lower share of top-words and words from “top-lists”. We propose to expose the repetitiveness by calculating the n-gram overlap of words (lexical repetition) and POS-tags (syntactic repetition) in consecutive sentences. Human text is expected to be less repetitive both in sentence structure and word choice. We introduce the “conjunction overlap” as a measure of the n-gram overlap around and-conjunctions to make explicit the reported failure of language models of plainly
repeating words around those conjunctions.

Take the stop-words defined by the spaCy package and take a list with the top 10,000 words (https://github.com/first20hours/google-10000-english) used in English determined by Google to calculate the share of a text’s words that are in the top 100, top 1,000 and top 10,000 words of that list


In [None]:
# As this paper does not give a concrete implementation, we try to recreate it based on the information stated the cell above

import spacy
import requests
from collections import Counter
from nltk import ngrams
from nltk.corpus import stopwords

# Download the list of top 10,000 English words
url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english.txt"
response = requests.get(url)
top_words = set(response.text.split())

# Load spaCy English language model
nlp = spacy.load("en_core_web_sm")

def get_word_stats(text):
    doc = nlp(text)

    # Extract words and stopwords
    words = [token.text.lower() for token in doc if token.is_alpha]
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in words if word not in stop_words]

    # Calculate share of words in the top lists
    top_100_words = [word for word in filtered_words if word in top_words]
    top_1000_words = top_100_words[:1000]
    top_10000_words = top_100_words[:10000]

    share_top_100 = len(top_100_words) / len(filtered_words)
    share_top_1000 = len(top_1000_words) / len(filtered_words)
    share_top_10000 = len(top_10000_words) / len(filtered_words)

    return share_top_100, share_top_1000, share_top_10000

def get_ngram_overlap(text, n):
    sentences = [sent.text for sent in nlp(text).sents]
    ngrams_list = [ngrams(sent.split(), n) for sent in sentences]
    ngram_counts = Counter(ngram for sublist in ngrams_list for ngram in sublist)

    return ngram_counts

# Example text
text = "We try to expose those statistical differences, assumed to be easiest to be picked up by automated detection methods, through the share of stop-words, unique words and words from 'top-lists' in a text’s total words."

# Calculate word stats
share_top_100, share_top_1000, share_top_10000 = get_word_stats(text)
print(f"Share of words in top 100: {share_top_100:.2%}")
print(f"Share of words in top 1000: {share_top_1000:.2%}")
print(f"Share of words in top 10000: {share_top_10000:.2%}")

# Calculate n-gram overlap
ngram_counts = get_ngram_overlap(text, 2)
print("N-gram Counts:")
for ngram, count in ngram_counts.items():
    print(f"{ngram}: {count}")


## Coherence Metrics

In [None]:
# similarity between the prompt and the continuation

In [None]:
# learn coreference clusters and track the appearance of their entities throughout the text
# how? -> count transitions between subject, object, other, non present (NIL?) entities

In [None]:
# Number of categories and amount of focus

In [None]:
# Yule's statistic Q for measuring semantic associaton between word-pairs

In [None]:
from nltk import FreqDist
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def yules_q(text, n):
    # Tokenize the text and remove stopwords
    words = [word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stopwords.words('english')]

    # Generate n-grams
    ngrams_list = list(ngrams(words, n))

    # Calculate observed and expected frequencies
    freq_dist = FreqDist(ngrams_list)
    observed_freq = sum(freq_dist.values())

    m1 = FreqDist(words)
    expected_freq = sum([(freq * (freq - 1)) for freq in m1.values()])

    # Calculate Yule's Q
    if expected_freq == 0:
        return 0
    else:
        yules_q = (observed_freq - expected_freq) / (observed_freq + expected_freq)
        return yules_q

# Example text
example_text = "This is an example sentence. Another example sentence for testing."

# Set the value of N for N-grams (e.g., N=2 for bigrams)
n_value = 2

# Calculate Yule's Q for the example text
result = yules_q(example_text, n_value)

# Print the result
print(f"Yule's Q (N={n_value}): {result}")

LookupError: ignored