# Import necessary libraries

In [3]:
import torch
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from tqdm import tqdm

# Call empty_cache to clear the unused memory allocated by PyTorch

In [37]:
torch.mps.empty_cache()
print("Deleted unuesed memory from Macbook MPS (GPU)")

Deleted unuesed memory from Macbook MPS (GPU)


# Load pre-trained tokenizer

In [38]:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name')



# Find out the device (CPU or GPU) to evaluate the dataset

In [39]:
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# print(f"Using device '{device}' to train the model.")

# Model Evaluation using Validation Dataset

## Explore the structure of Validation Dataset before evaluation

In [40]:
# Fetch the validation dataset CSV file path
dataset_common_path = "/Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/NLP_Datasets"
validation_df_path = f"{dataset_common_path}/squad_extracted_validation.csv"

# Read validation dataset CSV file into a dataframe
validation_df = pd.read_csv(validation_df_path)

In [41]:
# Shape of Valiation dataframe
validation_df.shape

(10570, 4)

In [42]:
# Information about Valiation dataframe
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10570 entries, 0 to 10569
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           10570 non-null  object
 1   title        10570 non-null  object
 2   input_text   10570 non-null  object
 3   target_text  10570 non-null  object
dtypes: object(4)
memory usage: 330.4+ KB


In [43]:
# Description Valiation dataframe
validation_df.describe().T

Unnamed: 0,count,unique,top,freq
id,10570,10570,56be4db0acb8001400a502ec,1
title,10570,48,Super_Bowl_50,810
input_text,10570,10541,['Question: Where did Maududi\'s books place I...,2
target_text,10570,9543,"['two', 'two', 'two']",22


In [44]:
# First 5 rows of Valiation dataframe
validation_df.head()

Unnamed: 0,id,title,input_text,target_text
0,56be4db0acb8001400a502ec,Super_Bowl_50,['Question: Which NFL team represented the AFC...,"['Denver Broncos', 'Denver Broncos', 'Denver B..."
1,56be4db0acb8001400a502ed,Super_Bowl_50,['Question: Which NFL team represented the NFC...,"['Carolina Panthers', 'Carolina Panthers', 'Ca..."
2,56be4db0acb8001400a502ee,Super_Bowl_50,['Question: Where did Super Bowl 50 take place...,"['Santa Clara, California', ""Levi's Stadium"", ..."
3,56be4db0acb8001400a502ef,Super_Bowl_50,['Question: Which NFL team won Super Bowl 50? ...,"['Denver Broncos', 'Denver Broncos', 'Denver B..."
4,56be4db0acb8001400a502f0,Super_Bowl_50,['Question: What color was used to emphasize t...,"['gold', 'gold', 'gold']"


In [45]:
# Last 5 rows of Valiation dataframe
validation_df.tail()

Unnamed: 0,id,title,input_text,target_text
10565,5737aafd1c456719005744fb,Force,['Question: What is the metric term less used ...,"['kilogram-force', 'pound-force', 'kilogram-fo..."
10566,5737aafd1c456719005744fc,Force,['Question: What is the kilogram-force sometim...,"['kilopond', 'kilopond', 'kilopond', 'kilopond..."
10567,5737aafd1c456719005744fd,Force,['Question: What is a very seldom used unit of...,"['slug', 'metric slug', 'metric slug', 'metric..."
10568,5737aafd1c456719005744fe,Force,['Question: What seldom used term of a unit of...,"['kip', 'kip', 'kip', 'kip', 'kip']"
10569,5737aafd1c456719005744ff,Force,['Question: What is the seldom used force unit...,"['sthène', 'sthène', 'sthène', 'sthène', 'sthè..."


In [46]:
# Missing values in columns of Valiation dataframe
validation_df_missing_values = validation_df.isnull().sum()
validation_df_missing_values

id             0
title          0
input_text     0
target_text    0
dtype: int64

In [47]:
# Access the first row of Valiation dataframe
validation_df_first_row = validation_df.iloc[0]

# Print each value separately
print(f"ID: {validation_df_first_row['id']}\n")
print(f"Title: {validation_df_first_row['title']}\n")
print(f"Input Text: {validation_df_first_row['input_text']}\n")
print(f"Target Text: {validation_df_first_row['target_text']}")

ID: 56be4db0acb8001400a502ec

Title: Super_Bowl_50

Input Text: ['Question: Which NFL team represented the AFC at Super Bowl 50? Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.']

Target Text: ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']


## Model Evaluation

### Load the saved model and its weights

In [48]:
# Set device to CPU
device = torch.device('cpu')

model_path = "/Users/ravkothu/Documents/Personal_items_at_Oracle/Master_Degree/University_of_San_Diego/Online_Masters/MS_in_Applied_AI/Subjects_and_Resources/AAI-520-A2_NLP/AAI-520-A2_Final_Team_Project/Chatbot_Code/model_checkpoints/final_model.pt"
checkpoint = torch.load(model_path, map_location=device)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
model.load_state_dict(checkpoint['model_state_dict'])
print("Trained model is loaded successfully")

  checkpoint = torch.load(model_path, map_location=device)


Trained model is loaded successfully


### Put model in evaluation mode, tokenize the inputs, generate responses, calculate accuracy and print the results

In [55]:
import ast

# Put the loaded model in evaluation mode
model.eval()

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Set padding to left
tokenizer.padding_side = "left" 
# Set the padding token
tokenizer.pad_token = tokenizer.eos_token

# Function to generate responses in batches
def generate_responses(input_texts, max_new_tokens=75, num_beams=5, no_repeat_ngram_size=0, early_stopping=True):
    # Tokenize the input texts
    encodings = tokenizer(
        input_texts,
        truncation=True,
        padding='max_length',
        return_tensors='pt',
        max_length=512
    ).to(device)
    
    print("encodings:")
    print("-" * 25)
    print(encodings)
    print("\n\n\n\n\n")
    
    input_ids = encodings['input_ids']

    # Generate outputs using beam search
    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        early_stopping=early_stopping,
        pad_token_id=tokenizer.eos_token_id
    )
    
    print("Outputs:")
    print("-" * 25)
    print(outputs)
    print("\n\n\n\n\n")

    # Decode the outputs to text
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    print("generated_texts:")
    print("-" * 35)
    print(generated_texts)
    
    return generated_texts


# Evaluate on the validation dataset in batches
batch_size = 8
validation_results = []

for start_idx in tqdm(range(0, min(5, len(validation_df)), batch_size), desc="Evaluating"):
    end_idx = start_idx + batch_size
    batch = validation_df.iloc[start_idx:end_idx]

    input_texts = batch['input_text'].tolist()
    
    # Modify this line to handle list conversion correctly
    target_texts = batch['target_text'].apply(lambda x: ast.literal_eval(x)[0] if isinstance(x, str) else x[0]).tolist()

    # Generate responses for the batch
    predicted_texts = generate_responses(input_texts)

    for idx in range(len(batch)):
        validation_results.append({
            "id": batch.iloc[idx]["id"],
            "input_text": input_texts[idx],
            "target_text": target_texts[idx],
            "predicted_text": predicted_texts[idx]
        })

Evaluating:   0%|                                                                                    | 0/1 [00:00<?, ?it/s]

encodings:
-------------------------
{'input_ids': tensor([[50256, 50256, 50256,  ...,  2026,  2637,    60],
        [50256, 50256, 50256,  ...,  2026,  2637,    60],
        [50256, 50256, 50256,  ...,  2026,  2637,    60],
        ...,
        [50256, 50256, 50256,  ...,  2026,  2637,    60],
        [50256, 50256, 50256,  ...,  2026,  2637,    60],
        [50256, 50256, 50256,  ...,  2026,  2637,    60]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])}








Evaluating: 100%|████████████████████████████████████████████████████████████████████████████| 1/1 [00:14<00:00, 14.91s/it]

Outputs:
-------------------------
tensor([[50256, 50256, 50256,  ...,  2637,    60, 50256],
        [50256, 50256, 50256,  ...,  2637,    60, 50256],
        [50256, 50256, 50256,  ...,  2637,    60, 50256],
        ...,
        [50256, 50256, 50256,  ...,  2637,    60, 50256],
        [50256, 50256, 50256,  ...,  2637,    60, 50256],
        [50256, 50256, 50256,  ...,  2637,    60, 50256]])






generated_texts:
-----------------------------------
['[\'Question: Which NFL team represented the AFC at Super Bowl 50? Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\\\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the




In [52]:
# Calculate Accuracy
correct_predictions = sum(res['predicted_text'] == res['target_text'] for res in validation_results)
accuracy = correct_predictions / len(validation_results)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

# # Display some results for qualitative evaluation
for result in validation_results[0:1]:
    print(f"ID: {result['id']}\n")
    print(f"Input Text: {result['input_text']}\n")
    print(f"Target Text: {result['target_text']}\n")
    print(f"Predicted Text: {result['predicted_text']}")
    print("-" * 50)

Validation Accuracy: 0.00%
ID: 56be4db0acb8001400a502ec

Input Text: ['Question: Which NFL team represented the AFC at Super Bowl 50? Context: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.']

Target Text: Denver Broncos

Predicted Text: ['Question: Which NFL team repre