In [None]:
! pip install transformers
! pip install accelerate -U
! pip install evaluate
! pip install -U nltk

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TextDataset, EvalPrediction
from scipy.special import softmax
from sklearn.metrics import log_loss
import numpy as np
import torch
import re
import os
import nltk
nltk.download('wordnet')
from sklearn.metrics.pairwise import cosine_similarity
import torch
from nltk.translate.bleu_score import sentence_bleu

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Load the GPT-2 language model and its tokenizer from the saved-model directory.
# NOTE(review): the same directory is used further down as the training output
# location, so this cell presumably expects an earlier run to have populated it — confirm.
model = GPT2LMHeadModel.from_pretrained('/kaggle/working/model')
tokenizer = GPT2Tokenizer.from_pretrained('/kaggle/working/model')
# Reuse the end-of-sequence token as the padding token so that calls made with
# padding=True have a pad token available.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Clear output folder
import os
def remove_folder_contents(folder):
    """Delete everything inside ``folder`` while keeping ``folder`` itself.

    Real sub-directories are emptied recursively and then removed. Every other
    entry (regular files, symbolic links of any kind, broken links, special
    files) is unlinked. Symbolic links are never followed, so data that lives
    outside ``folder`` is left untouched even if a link inside it points there.

    Deletion is best-effort: a failure on one entry is printed and the
    remaining entries are still processed.

    Args:
        folder: Path of the directory whose contents should be removed.

    Raises:
        OSError: If ``folder`` itself cannot be listed (for example, it does
            not exist).
    """
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            # islink must be checked first: os.path.isdir follows symlinks, and
            # recursing through a link would wipe the link target's contents.
            if os.path.islink(file_path) or not os.path.isdir(file_path):
                os.unlink(file_path)
            else:
                remove_folder_contents(file_path)
                os.rmdir(file_path)
        except Exception as e:
            # Keep going so one undeletable entry does not abort the clean-up.
            print(e)

# Empty the working directory but keep the directory itself: later cells read
# from and write to this location, so removing it would break them.
folder_path = '/kaggle/working'
if os.path.isdir(folder_path):
    remove_folder_contents(folder_path)

# Get datasets from GitHub

In [None]:
# !wget "https://raw.githubusercontent.com/iamsarbgrewal/python-books/main/training.txt"
# !wget "https://raw.githubusercontent.com/iamsarbgrewal/python-books/main/validation.txt"
# !wget "https://raw.githubusercontent.com/iamsarbgrewal/python-books/main/test.txt"

In [5]:
# Sample inputs for the similarity scorer.
# text1 is a paragraph-length answer (presumably model-generated — confirm);
# text2 is a short, space-separated list of keywords to compare it against.
text1 = "A for loop is distinct from a while loop in Python when it is designed to iterate over a specific sequence or range of values. A for loop iterates over a sequence or range for a set number of times, allowing it to perform actions on each iteration. This approach simplifies tasks that require repetitive iteration, such as calculating a sum or filtering results based on a specified criterion. In contrast, a while loop repeatedly executes a block of code as long as a given condition remains true. When the condition becomes false, the loop terminates, and the sequence or range from which the loop was called is skipped. This distinction allows for more flexible and adaptable code that can handle a wide range of scenarios."
text2 = "for loop iteration while loop repeat iterable true condition"

In [None]:
# Directory that receives checkpoints, the final model and the tokenizer.
model_output_path = '/kaggle/working/model'
# Build one dataset per split from the plain-text files, using the shared
# tokenizer and a block size of 128.
# NOTE(review): TextDataset is reported as deprecated in recent transformers
# releases — confirm against the installed version before relying on it.
train_dataset = TextDataset(tokenizer=tokenizer, file_path="/kaggle/working/training.txt", block_size=128)
val_dataset = TextDataset(tokenizer=tokenizer, file_path="/kaggle/working/validation.txt", block_size=128)
# NOTE(review): test_dataset is not referenced by the training cell that consumes
# train_dataset and val_dataset — confirm whether it is used elsewhere.
test_dataset = TextDataset(tokenizer=tokenizer, file_path="/kaggle/working/test.txt", block_size=128)
# mlm=False selects the non-masked (causal) language-modeling collation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir=model_output_path,      # checkpoints are written here
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Checkpointing, evaluation and logging all happen on the same 100-step cadence.
    save_strategy='steps',
    save_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
    logging_steps=100,
    fp16=True,                         # NOTE(review): presumably requires a CUDA device — confirm
    save_total_limit=2,                # cap on the number of checkpoints kept on disk
    report_to='none',                  # no external experiment-tracking integration
    learning_rate=0.00001,
    eval_accumulation_steps=1
)

# Fine-tune the model with the stock Hugging Face Trainer, evaluating on the
# validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()
# Save the model
trainer.save_model(model_output_path)
# Save the tokenizer alongside the model so both can be reloaded from one directory
tokenizer.save_pretrained(model_output_path)

In [9]:
def score(text1, text2, weight_cosine=0.6, weight_jaccard=0.2, weight_length=0.5):
    """Compare two texts with three cheap similarity heuristics.

    Component scores:

    * ``"Cosine score"``  - cosine similarity between the two token-ID vectors
      produced by the notebook-level ``tokenizer`` (both texts are encoded in
      one padded batch so the vectors have equal length).
    * ``"Jaccard score"`` - size of the intersection of the whitespace-separated
      word sets divided by the size of their union; ``0.0`` when neither text
      contains a word.
    * ``"Length score"``  - ``1 / (1 + |len(text1) - len(text2)|)`` measured in
      characters, so it is 1.0 for equal lengths and decays towards 0.

    Args:
        text1: First text.
        text2: Second text.
        weight_cosine: Multiplier applied to the cosine score.
        weight_jaccard: Multiplier applied to the Jaccard score.
        weight_length: Multiplier applied to the length score.

    Returns:
        dict with the keys ``"Cosine score"``, ``"Jaccard score"``,
        ``"Length score"`` and ``"Combined Score"``.
    """
    # Encode both texts together so padding brings them to a common length.
    inputs = tokenizer([text1, text2], padding=True, truncation=True, return_tensors='pt')
    vector1 = inputs.input_ids[0].reshape(1, -1)
    vector2 = inputs.input_ids[1].reshape(1, -1)
    # NOTE(review): these vectors hold raw token ids (padding included), not
    # embeddings, so this cosine value is not a semantic similarity — confirm
    # that this is the intended measure.
    cosine_sim = cosine_similarity(vector1, vector2)[0][0]

    # Jaccard similarity over unique whitespace-separated words. An empty union
    # (both texts blank) would divide by zero, so it is scored as 0.0.
    words1 = set(text1.split())
    words2 = set(text2.split())
    all_words = words1 | words2
    jaccard_score = len(words1 & words2) / len(all_words) if all_words else 0.0

    # Length-based similarity on character counts.
    length_similarity = 1 / (1 + abs(len(text1) - len(text2)))

    # Weighted sum of the components.
    # NOTE(review): the default weights add up to 1.3, so the combined score is
    # not a normalized average and can exceed 1 — confirm the intended weights.
    combined_score = (
        weight_cosine * cosine_sim +
        weight_jaccard * jaccard_score +
        weight_length * length_similarity
    )
    return {"Cosine score" : cosine_sim, "Jaccard score" : jaccard_score, "Length score" : length_similarity, "Combined Score": combined_score}

In [14]:
# Run the scorer on the two sample texts; the returned dict is shown as the cell output.
score(text1, text2)

Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


{'Cosine score': 0.4912103343990242,
 'Jaccard score': 0.04819277108433735,
 'Length score': 0.0014992503748125937,
 'Combined Score': 0.3051143800436883}

In [None]:
def generate_response(prompt, model_path='/kaggle/working/model'):
    """Generate text for ``prompt`` with the GPT-2 model saved at ``model_path``.

    The model and tokenizer are loaded from disk on every call, so the call
    always reflects whatever is currently saved in ``model_path``.

    Decoding uses beam search with 5 beams and returns the single best
    sequence. ``max_length=256`` is passed to ``generate``.

    Args:
        prompt: Text the model should continue.
        model_path: Directory holding the saved model and tokenizer files.

    Returns:
        The first generated sequence decoded to a string with special tokens
        removed.
    """
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)

    # Calling the tokenizer yields input_ids together with the matching
    # attention_mask, so the mask does not have to be built by hand.
    encoded = tokenizer(prompt, return_tensors="pt")

    # No sampling options (temperature / top_k) are passed: they only take
    # effect when do_sample=True, and this call uses deterministic beam search.
    output = model.generate(
        encoded.input_ids,
        attention_mask=encoded.attention_mask,
        max_length=256,
        num_beams=5,
        num_return_sequences=1,  # Generate a single sequence
        early_stopping=True,
        # Pass an explicit pad token id (EOS is reused) rather than relying on
        # generate() to choose one.
        pad_token_id=tokenizer.eos_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)