<a href="https://colab.research.google.com/github/jeffreyong15/Counsel.NLP/blob/main/Baseline%20Experiment/Baseline%20Evaluation/Baseline_Evaluation(260)%5BGPTJ_EDIT%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Library

In [1]:
!pip install sentence-transformers
!pip install datasets
!pip install rouge-score

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, GPT2Tokenizer, Trainer, TrainingArguments, GPT2LMHeadModel, EarlyStoppingCallback
import torch
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from datasets import Dataset
from torch.nn import CrossEntropyLoss
os.environ["HF_TOKEN"] = "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"

## Data Preprocessing

In [3]:
# Load the dataset
df = pd.read_csv("academic_advising_data.csv")
df.head()

Unnamed: 0,Prompt,Response,Category
0,When is CS362 offered?,CS362 is offered every Fall and Spring.,Course Scheduling
1,When is CS105 offered?,CS105 is offered every Summer.,Course Scheduling
2,Can I switch to a double major?,"Yes, you can discuss this option with your adv...",Changing Major
3,How do I register for next semester?,You can register through the online portal sta...,Course Scheduling
4,Do I need elective credits to graduate?,"Yes, you need at least 20 elective credits.",Graduation Requirements


In [4]:
# Encode the responses into labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Response'])

# Split into training, validation, and test sets
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Category'])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['Category'])

print("Training set:", train_data.shape)
print("Validation set:", val_data.shape)
print("Test set:", test_data.shape)

Training set: (8000, 4)
Validation set: (1000, 4)
Test set: (1000, 4)


In [5]:
# Vectorize the data
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['Prompt']).toarray()
X_val = vectorizer.transform(val_data['Prompt']).toarray()
X_test = vectorizer.transform(test_data['Prompt']).toarray()

y_train, y_val, y_test = train_data['Response'], val_data['Response'], test_data['Response']

## Baseline Models: Naive Bayes and KNN

In [6]:
# Function to evaluate models
def evaluate_model(preds, y_true):
    return {
        "Accuracy": accuracy_score(y_true, preds),
        "Precision": precision_score(y_true, preds, average="weighted", zero_division=0),
        "Recall": recall_score(y_true, preds, average="weighted", zero_division=0),
        "F1 Score": f1_score(y_true, preds, average="weighted", zero_division=0)
    }

In [7]:
# Naive Bayes
nb_model = MultinomialNB()
start = time.time()
nb_model.fit(X_train, y_train)
nb_time = time.time() - start
nb_preds = nb_model.predict(X_test)
nb_metrics = evaluate_model(nb_preds, y_test)

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
start = time.time()
knn_model.fit(X_train, y_train)
knn_time = time.time() - start
knn_preds = knn_model.predict(X_test)
knn_metrics = evaluate_model(knn_preds, y_test)

metrics_data = {
    'Model': ['Naive Bayes', 'KNN', 'GPT-J'],
    'Accuracy': [nb_metrics['Accuracy'], knn_metrics['Accuracy'], 'In Progress'],
    'Precision': [nb_metrics['Precision'], knn_metrics['Precision'], 'In Progress'],
    'Recall': [nb_metrics['Recall'], knn_metrics['Recall'], 'In Progress'],
    'F1 Score': [nb_metrics['F1 Score'], knn_metrics['F1 Score'], 'In Progress'],
    'Response Time (s)': [nb_time, knn_time, 'In Progress']
}

metrics_df = pd.DataFrame(metrics_data)
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,0.587809
1,KNN,0.732,0.703447,0.732,0.71157,0.018881
2,GPT-J,In Progress,In Progress,In Progress,In Progress,In Progress


In [8]:
# Decode the labels back to original responses if needed
# decoded_responses = label_encoder.inverse_transform(preds)
queries = test_data["Prompt"].tolist()
true_labels = test_data["Response"].tolist()

## Baseline Models: GPT-J without Fine-Tuning

In [None]:
gptj_tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
gptj_model = AutoModelForCausalLM.from_pretrained("gpt2-large")

In [None]:
gptj_tokenizer.pad_token = gptj_tokenizer.eos_token
# Function to get GPT-J prediction for each query
def get_gptj_prediction(query, max_length=50):
    inputs = gptj_tokenizer(query, return_tensors="pt", padding=True)  # Adds padding
    with torch.no_grad():
        outputs = gptj_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # Adds attention mask
            max_length=max_length,
            pad_token_id=gptj_tokenizer.eos_token_id  # Sets pad token ID to EOS
        )
    response = gptj_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
# Get predictions and average response time
def evaluate_gptj(queries, true_labels):
    predictions = []
    total_time = 0

    for query in tqdm(queries, desc="Processing Queries"):
        start_time = time.time()
        response = get_gptj_prediction(query)
        end_time = time.time()
        # print(query + ', ' + response)
        # model's response = predicted label
        predictions.append(response)
        total_time += (end_time - start_time)

    avg_response_time = total_time / len(queries)

    return predictions, avg_response_time

In [None]:
# Load the sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
def calculate_similarity(expected_output, actual_output):
    expected_embedding = model.encode(expected_output)
    actual_embedding = model.encode(actual_output)
    similarity = cosine_similarity([expected_embedding], [actual_embedding])[0][0]
    return similarity

def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()

    smoothie = SmoothingFunction().method4

    score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
    return score

def calculate_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)

    rouge_l_precision = scores['rougeL'].precision
    rouge_l_recall = scores['rougeL'].recall
    rouge_l_f1 = scores['rougeL'].fmeasure

    return rouge_l_precision, rouge_l_recall, rouge_l_f1


def get_metrics(queries, true_labels, evaluation_func):

    predictions, avg_response_time = evaluation_func(queries, true_labels)
    similarities = [calculate_similarity(exp, act) for exp, act in zip(true_labels, predictions)]
    cos_sim = np.sum(similarities) / len(similarities)
    bleu_score = 0
    for i in range(len(predictions)):
        bleu_score += calculate_bleu(true_labels_subset[i], predictions[i])
    bleu_score /= len(predictions)
    rouge_l_precision = np.zeros(len(predictions))
    rouge_l_recall = np.zeros(len(predictions))
    rouge_l_f1 = np.zeros(len(predictions))
    for i in range(len(predictions)):
        rouge_l_precision[i], rouge_l_recall[i], rouge_l_f1[i] = calculate_rouge(true_labels_subset[i], predictions[i])

    rouge_l_precision = np.mean(rouge_l_precision)
    rouge_l_recall = np.mean(rouge_l_recall)
    rouge_l_f1 = np.mean(rouge_l_f1)

    return avg_response_time, bleu_score, cos_sim, rouge_l_precision, rouge_l_recall, rouge_l_f1

#Pretrained Model Results

In [9]:
queries_subset = queries[:100]
true_labels_subset = true_labels[:100]

#avg_response_time_pre, bleu_score_pre, cos_sim_pre, rouge_l_precision_pre, rouge_l_recall_pre, rouge_l_f1_pre = get_metrics(queries_subset, true_labels_subset, evaluate_gptj)
avg_response_time_pre, bleu_score_pre, cos_sim_pre, rouge_l_precision_pre, rouge_l_recall_pre, rouge_l_f1_pre = 3.608155, 0.029, 0.5742, 0.0986, 0.3918, 0.1562
print(f"BLEU Score: {bleu_score_pre * 100:.2f}%")
print(f"Cosine Similarity: {cos_sim_pre * 100:.2f}%")
print(f"Rouge-L Precision: {rouge_l_precision_pre * 100:.2f}%")
print(f"Rouge-L Recall: {rouge_l_recall_pre * 100:.2f}%")
print(f"Rouge-L F1 Score: {rouge_l_f1_pre * 100:.2f}%")

BLEU Score: 2.90%
Cosine Similarity: 57.42%
Rouge-L Precision: 9.86%
Rouge-L Recall: 39.18%
Rouge-L F1 Score: 15.62%


In [10]:
metrics_classification_data = {
    'Model': ['Naive Bayes', 'KNN'],
    'Accuracy': [nb_metrics['Accuracy'], knn_metrics['Accuracy']],
    'Precision': [nb_metrics['Precision'], knn_metrics['Precision']],
    'Recall': [nb_metrics['Recall'], knn_metrics['Recall']],
    'F1 Score': [nb_metrics['F1 Score'], knn_metrics['F1 Score']],
    'Response Time (s)': [nb_time, knn_time]
}

metrics_classification_df = pd.DataFrame(metrics_classification_data)
metrics_classification_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,0.587809
1,KNN,0.732,0.703447,0.732,0.71157,0.018881


In [11]:
metrics_llm_data = {
    'Model': ['GPT2-Large[Pretrained]','GPT2-Large [Fine Tuned]', 'GPT J'],
    'BLEU Score': [bleu_score_pre, "In Progress", "In Progress"],
    'Cosine Similarity': [cos_sim_pre, "In Progress", "In Progress"],
    'Rouge-L Precision': [rouge_l_precision_pre, "In Progress", "In Progress"],
    'Rouge-L Recall': [rouge_l_recall_pre, "In Progress", "In Progress"],
    'Rouge-L F1 Score': [rouge_l_f1_pre, "In Progress", "In Progress"],
    'Response Time (s)': [avg_response_time_pre, "In Progress", "In Progress"]
}

metrics_llm_df = pd.DataFrame(metrics_llm_data)
metrics_llm_df

Unnamed: 0,Model,BLEU Score,Cosine Similarity,Rouge-L Precision,Rouge-L Recall,Rouge-L F1 Score,Response Time (s)
0,GPT2-Large[Pretrained],0.029,0.5742,0.0986,0.3918,0.1562,3.608155
1,GPT2-Large [Fine Tuned],In Progress,In Progress,In Progress,In Progress,In Progress,In Progress
2,GPT J,In Progress,In Progress,In Progress,In Progress,In Progress,In Progress


#Fine Tuned Model training

In [12]:
gpt_model_fine = AutoModelForCausalLM.from_pretrained("gpt2-large")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to EOS for GPT-2

# Define a function to tokenize each row
def preprocess_row(row):
    # Combine prompt and response with a separator
    combined_text = f"{row['Prompt']} <|endoftext|> {row['Response']}"

    # Tokenize the text with padding and truncation
    tokenized_output = tokenizer(
        combined_text,
        max_length=512,
        padding="max_length",
        truncation=True
    )
    return {
        "input_ids": tokenized_output["input_ids"],
        "attention_mask": tokenized_output["attention_mask"]
    }

# Apply the preprocessing function to each row in the DataFrame
tokenized_data = train_data.apply(preprocess_row, axis=1, result_type="expand")

# Add the tokenized columns to the original DataFrame
train_data["input_ids"] = tokenized_data["input_ids"]
train_data["attention_mask"] = tokenized_data["attention_mask"]

tokenized_test_data = test_data.apply(preprocess_row, axis=1, result_type="expand")

test_data["input_ids"] = tokenized_test_data["input_ids"]
test_data["attention_mask"] = tokenized_test_data["attention_mask"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
dataset = Dataset.from_pandas(train_data[["input_ids", "attention_mask"]][:1000])
test_dataset = Dataset.from_pandas(test_data[["input_ids", "attention_mask"]][:1000])

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    num_train_epochs=3,
    logging_steps=100,
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    eval_strategy="steps",
    eval_steps=200,
    per_device_eval_batch_size=4,
    eval_accumulation_steps=4
)

def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    if labels is None:
        labels = inputs.get("input_ids")
    outputs = model(**inputs)
    logits = outputs.get("logits")
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
    return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    # Get predictions and labels
    logits, labels = eval_pred
    # Calculate loss
    loss_fct = CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)).item()

    # Return metrics dictionary
    return {"loss": loss}

# Set up the Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        return compute_loss(model, inputs, return_outputs=return_outputs)


trainer = CustomTrainer(
    model=gpt_model_fine,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Start training
trainer.train()

  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss


#Fine Tuned model Results

In [None]:
def get_gpt_fine_prediction(query, max_length=50):
    inputs = gptj_tokenizer(query, return_tensors="pt", padding=True)  # Adds padding
    with torch.no_grad():
        outputs = gpt_model_fine.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # Adds attention mask
            max_length=max_length,
            pad_token_id=gptj_tokenizer.eos_token_id  # Sets pad token ID to EOS
        )
    response = gptj_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

def evaluate_gpt_fine(queries, true_labels):
    predictions = []
    total_time = 0

    for query in tqdm(queries, desc="Processing Queries"):
        start_time = time.time()
        response = get_gpt_fine_prediction(query)
        end_time = time.time()
        # print(query + ', ' + response)
        # model's response = predicted label
        predictions.append(response)
        total_time += (end_time - start_time)

    avg_response_time = total_time / len(queries)

    return predictions, avg_response_time

In [None]:
avg_response_time_fine, bleu_score_fine, cos_sim_fine, rouge_l_precision_fine, rouge_l_recall_fine, rouge_l_f1_fine = get_metrics(queries_subset, true_labels_subset, evaluate_gpt_fine)
#avg_response_time_fine, bleu_score_fine, cos_sim_fine, rouge_l_precision_fine, rouge_l_recall_fine, rouge_l_f1_fine =
print(f"BLEU Score: {bleu_score_fine * 100:.2f}%")
print(f"Cosine Similarity: {cos_sim_fine * 100:.2f}%")
print(f"Rouge-L Precision: {rouge_l_precision_fine * 100:.2f}%")
print(f"Rouge-L Recall: {rouge_l_recall_fine * 100:.2f}%")
print(f"Rouge-L F1 Score: {rouge_l_f1_fine * 100:.2f}%")

In [None]:
metrics_llm_data = {
    'Model': ['GPT2-Large[Pretrained]','GPT2-Large [Fine Tuned]', 'GPT J'],
    'BLEU Score': [bleu_score_pre, bleu_score_fine, "In Progress"],
    'Cosine Similarity': [cos_sim_pre, cos_sim_fine, "In Progress"],
    'Rouge-L Precision': [rouge_l_precision_pre, rouge_l_precision_fine, "In Progress"],
    'Rouge-L Recall': [rouge_l_recall_pre, rouge_l_recall_fine, "In Progress"],
    'Rouge-L F1 Score': [rouge_l_f1_pre, rouge_l_f1_fine, "In Progress"],
    'Response Time (s)': [avg_response_time_pre, avg_response_time_fine, "In Progress"]
}

metrics_llm_df = pd.DataFrame(metrics_llm_data)
metrics_llm_df