<a href="https://colab.research.google.com/github/jeffreyong15/Counsel.NLP/blob/main/Baseline%20Experiment/Baseline%20Evaluation/Baseline_Evaluation(260)%5BGPTJ_EDIT%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Library

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

## Data Preprocessing

In [None]:
# Load the dataset
df = pd.read_csv("academic_advising_data.csv")
df.head()

Unnamed: 0,Prompt,Response,Category
0,When is CS362 offered?,CS362 is offered every Fall and Spring.,Course Scheduling
1,When is CS105 offered?,CS105 is offered every Summer.,Course Scheduling
2,Can I switch to a double major?,"Yes, you can discuss this option with your adv...",Changing Major
3,How do I register for next semester?,You can register through the online portal sta...,Course Scheduling
4,Do I need elective credits to graduate?,"Yes, you need at least 20 elective credits.",Graduation Requirements


In [None]:
# Encode the responses into labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Response'])

# Split into training, validation, and test sets
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Category'])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['Category'])

print("Training set:", train_data.shape)
print("Validation set:", val_data.shape)
print("Test set:", test_data.shape)

Training set: (8000, 4)
Validation set: (1000, 4)
Test set: (1000, 4)


In [None]:
# Vectorize the data
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['Prompt']).toarray()
X_val = vectorizer.transform(val_data['Prompt']).toarray()
X_test = vectorizer.transform(test_data['Prompt']).toarray()

y_train, y_val, y_test = train_data['Response'], val_data['Response'], test_data['Response']

## Baseline Models: Naive Bayes and KNN

In [None]:
# Function to evaluate models
def evaluate_model(preds, y_true):
    return {
        "Accuracy": accuracy_score(y_true, preds),
        "Precision": precision_score(y_true, preds, average="weighted", zero_division=0),
        "Recall": recall_score(y_true, preds, average="weighted", zero_division=0),
        "F1 Score": f1_score(y_true, preds, average="weighted", zero_division=0)
    }

In [None]:
# Naive Bayes
nb_model = MultinomialNB()
start = time.time()
nb_model.fit(X_train, y_train)
nb_time = time.time() - start
nb_preds = nb_model.predict(X_test)
nb_metrics = evaluate_model(nb_preds, y_test)

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
start = time.time()
knn_model.fit(X_train, y_train)
knn_time = time.time() - start
knn_preds = knn_model.predict(X_test)
knn_metrics = evaluate_model(knn_preds, y_test)

metrics_data = {
    'Model': ['Naive Bayes', 'KNN', 'GPT-J'],
    'Accuracy': [nb_metrics['Accuracy'], knn_metrics['Accuracy'], 'In Progress'],
    'Precision': [nb_metrics['Precision'], knn_metrics['Precision'], 'In Progress'],
    'Recall': [nb_metrics['Recall'], knn_metrics['Recall'], 'In Progress'],
    'F1 Score': [nb_metrics['F1 Score'], knn_metrics['F1 Score'], 'In Progress'],
    'Response Time (s)': [nb_time, knn_time, 'In Progress']
}

metrics_df = pd.DataFrame(metrics_data)
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,1.642352
1,KNN,0.732,0.703447,0.732,0.71157,0.022197
2,GPT-J,In Progress,In Progress,In Progress,In Progress,In Progress


In [None]:
# Decode the labels back to original responses if needed
# decoded_responses = label_encoder.inverse_transform(preds)

## Baseline Models: GPT-J without Fine-Tuning

In [None]:
import os
os.environ["HF_TOKEN"] = "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"

In [None]:
gptj_tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
gptj_model = AutoModelForCausalLM.from_pretrained("gpt2-large")
queries = test_data["Prompt"].tolist()
true_labels = test_data["Response"].tolist()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
gptj_tokenizer.pad_token = gptj_tokenizer.eos_token
# Function to get GPT-J prediction for each query
def get_gptj_prediction(query, max_length=50):
    inputs = gptj_tokenizer(query, return_tensors="pt", padding=True)  # Adds padding
    with torch.no_grad():
        outputs = gptj_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # Adds attention mask
            max_length=max_length,
            pad_token_id=gptj_tokenizer.eos_token_id  # Sets pad token ID to EOS
        )
    response = gptj_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
from tqdm import tqdm

In [None]:
# Get predictions and average response time
def evaluate_gptj(queries, true_labels):
    predictions = []
    total_time = 0

    for query in tqdm(queries, desc="Processing Queries"):
        start_time = time.time()
        response = get_gptj_prediction(query)
        end_time = time.time()
        # print(query + ', ' + response)
        # model's response = predicted label
        predictions.append(response)
        total_time += (end_time - start_time)

    avg_response_time = total_time / len(queries)

    return predictions, avg_response_time

In [13]:
queries_subset = queries[:1000]
true_labels_subset = true_labels[:1000]

predictions, avg_response_time = evaluate_gptj(queries_subset, true_labels_subset)

# Calculate metrics for GPT-J
#accuracy = accuracy_score(true_labels_subset, predictions)
#precision = precision_score(true_labels_subset, predictions, average='weighted')
#recall = recall_score(true_labels_subset, predictions, average='weighted')
#f1 = f1_score(true_labels_subset, predictions, average='weighted')

#print("GPT-J Model Performance:")
#print(f"Accuracy: {accuracy * 100:.2f}%")
#print(f"Precision: {precision * 100:.2f}%")
#print(f"Recall: {recall * 100:.2f}%")
#print(f"F1 Score: {f1 * 100:.2f}%")
#print(f"Average Response Time: {avg_response_time:.2f} seconds")

Processing Queries: 100%|██████████| 1000/1000 [4:06:40<00:00, 14.80s/it]


In [14]:
for i in range(1):
    print(queries_subset[i] + ', ' + predictions[i] + ', ' + true_labels_subset[i])

How do I register for next semester?, How do I register for next semester?

You can register for the next semester by filling out the registration form.

How do I change my name?

You can change your name by filling out the change of name form.
, You can register through the online portal starting in October.


In [15]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.1


In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the sentence transformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [17]:
def calculate_similarity(expected_output, actual_output):
    # Generate embeddings
    expected_embedding = model.encode(expected_output)
    actual_embedding = model.encode(actual_output)
    # Compute cosine similarity
    similarity = cosine_similarity([expected_embedding], [actual_embedding])[0][0]
    return similarity

In [18]:
similarities = [calculate_similarity(exp, act) for exp, act in zip(true_labels_subset, predictions)]

In [26]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=4e6d51a47c38c92abec3d5526fca50aa2acebb19e06dfe4d6892471e4c5b0a03
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [28]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

cos_sim = np.sum(similarities) / len(similarities)

def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()

    smoothie = SmoothingFunction().method4

    score = sentence_bleu(reference, candidate, smoothing_function=smoothie)
    return score

# Example usage
bleu_score = 0
for i in range(len(predictions)):
    bleu_score += calculate_bleu(true_labels_subset[i], predictions[i])
bleu_score /= len(predictions)

def calculate_rouge(reference, candidate):
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(reference, candidate)

    rouge_l_precision = scores['rougeL'].precision
    rouge_l_recall = scores['rougeL'].recall
    rouge_l_f1 = scores['rougeL'].fmeasure

    return rouge_l_precision, rouge_l_recall, rouge_l_f1

rouge_l_precision = np.zeros(len(predictions))
rouge_l_recall = np.zeros(len(predictions))
rouge_l_f1 = np.zeros(len(predictions))
for i in range(len(predictions)):
    rouge_l_precision[i], rouge_l_recall[i], rouge_l_f1[i] = calculate_rouge(true_labels_subset[i], predictions[i])

rouge_l_precision = np.mean(rouge_l_precision)
rouge_l_recall = np.mean(rouge_l_recall)
rouge_l_f1 = np.mean(rouge_l_f1)

In [30]:
print(f"BLEU Score: {bleu_score * 100:.2f}%")
print(f"Cosine Similarity: {cos_sim * 100:.2f}%")
print(f"Rouge-L Precision: {rouge_l_precision * 100:.2f}%")
print(f"Rouge-L Recall: {rouge_l_recall * 100:.2f}%")
print(f"Rouge-L F1 Score: {rouge_l_f1 * 100:.2f}%")

BLEU Score: 2.86%
Cosine Similarity: 58.20%
Rouge-L Precision: 10.11%
Rouge-L Recall: 40.53%
Rouge-L F1 Score: 16.03%


In [32]:
metrics_classification_data = {
    'Model': ['Naive Bayes', 'KNN'],
    'Accuracy': [nb_metrics['Accuracy'], knn_metrics['Accuracy']],
    'Precision': [nb_metrics['Precision'], knn_metrics['Precision']],
    'Recall': [nb_metrics['Recall'], knn_metrics['Recall']],
    'F1 Score': [nb_metrics['F1 Score'], knn_metrics['F1 Score']],
    'Response Time (s)': [nb_time, knn_time]
}

metrics_classification_df = pd.DataFrame(metrics_classification_data)
metrics_classification_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,1.642352
1,KNN,0.732,0.703447,0.732,0.71157,0.022197


In [33]:
metrics_llm_data = {
    'Model': ['GPT2-Large[Pretrained]','GPT2-Large [Fine Tuned]', 'GPT J'],
    'BLEU Score': [bleu_score, "In Progress", "In Progress"],
    'Cosine Similarity': [cos_sim, "In Progress", "In Progress"],
    'Rouge-L Precision': [rouge_l_precision, "In Progress", "In Progress"],
    'Rouge-L Recall': [rouge_l_recall, "In Progress", "In Progress"],
    'Rouge-L F1 Score': [rouge_l_f1, "In Progress", "In Progress"],
    'Response Time (s)': [avg_response_time, "In Progress", "In Progress"]
}

metrics_llm_df = pd.DataFrame(metrics_llm_data)
metrics_llm_df

Unnamed: 0,Model,BLEU Score,Cosine Similarity,Rouge-L Precision,Rouge-L Recall,Rouge-L F1 Score,Response Time (s)
0,GPT2-Large[Pretrained],0.028605,0.581993,0.101056,0.405278,0.160338,14.798213
1,GPT2-Large [Fine Tuned],In Progress,In Progress,In Progress,In Progress,In Progress,In Progress
2,GPT J,In Progress,In Progress,In Progress,In Progress,In Progress,In Progress


In [21]:
gpt_model_fine = AutoModelForCausalLM.from_pretrained("gpt2-large")
from transformers import Trainer, TrainingArguments