<a href="https://colab.research.google.com/github/jeffreyong15/Counsel.NLP/blob/main/Baseline%20Experiment/Baseline%20Evaluation/Baseline_Evaluation(260).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Library

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

## Data Preprocessing

In [45]:
# Load the dataset
df = pd.read_csv("academic_advising_data.csv")
df.head()

Unnamed: 0,Prompt,Response,Category
0,When is CS362 offered?,CS362 is offered every Fall and Spring.,Course Scheduling
1,When is CS105 offered?,CS105 is offered every Summer.,Course Scheduling
2,Can I switch to a double major?,"Yes, you can discuss this option with your adv...",Changing Major
3,How do I register for next semester?,You can register through the online portal sta...,Course Scheduling
4,Do I need elective credits to graduate?,"Yes, you need at least 20 elective credits.",Graduation Requirements


In [46]:
# Encode the responses into labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Response'])

# Split into training, validation, and test sets
train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Category'])
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42, stratify=temp_data['Category'])

print("Training set:", train_data.shape)
print("Validation set:", val_data.shape)
print("Test set:", test_data.shape)

Training set: (8000, 4)
Validation set: (1000, 4)
Test set: (1000, 4)


In [47]:
# Vectorize the data
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['Prompt']).toarray()
X_val = vectorizer.transform(val_data['Prompt']).toarray()
X_test = vectorizer.transform(test_data['Prompt']).toarray()

y_train, y_val, y_test = train_data['Response'], val_data['Response'], test_data['Response']

## Baseline Models: Naive Bayes and KNN

In [48]:
# Function to evaluate models
def evaluate_model(preds, y_true):
    return {
        "Accuracy": accuracy_score(y_true, preds),
        "Precision": precision_score(y_true, preds, average="weighted", zero_division=0),
        "Recall": recall_score(y_true, preds, average="weighted", zero_division=0),
        "F1 Score": f1_score(y_true, preds, average="weighted", zero_division=0)
    }

In [49]:
# Naive Bayes
nb_model = MultinomialNB()
start = time.time()
nb_model.fit(X_train, y_train)
nb_time = time.time() - start
nb_preds = nb_model.predict(X_test)
nb_metrics = evaluate_model(nb_preds, y_test)

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
start = time.time()
knn_model.fit(X_train, y_train)
knn_time = time.time() - start
knn_preds = knn_model.predict(X_test)
knn_metrics = evaluate_model(knn_preds, y_test)

metrics_data = {
    'Model': ['Naive Bayes', 'KNN', 'GPT-J'],
    'Accuracy': [nb_metrics['Accuracy'], knn_metrics['Accuracy'], 'In Progress'],
    'Precision': [nb_metrics['Precision'], knn_metrics['Precision'], 'In Progress'],
    'Recall': [nb_metrics['Recall'], knn_metrics['Recall'], 'In Progress'],
    'F1 Score': [nb_metrics['F1 Score'], knn_metrics['F1 Score'], 'In Progress'],
    'Response Time (s)': [nb_time, knn_time, 'In Progress']
}

metrics_df = pd.DataFrame(metrics_data)
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,1.03142
1,KNN,0.732,0.703447,0.732,0.71157,0.026017
2,GPT-J,In Progress,In Progress,In Progress,In Progress,In Progress


In [None]:
# Decode the labels back to original responses if needed
# decoded_responses = label_encoder.inverse_transform(preds)

## Baseline Models: GPT-J without Fine-Tuning

In [None]:
import os
os.environ["HF_TOKEN"] = "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"

In [None]:
gptj_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
gptj_model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
queries = test_data["Prompt"].tolist()
true_labels = test_data["Response"].tolist()

In [None]:
# Function to get GPT-J prediction for each query
def get_gptj_prediction(query, max_length=50):
    inputs = gptj_tokenizer(query, return_tensors="pt")
    with torch.no_grad():
        outputs = gptj_model.generate(inputs.input_ids, max_length=max_length)
    response = gptj_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
# Get predictions and average response time
def evaluate_gptj(queries, true_labels):
    predictions = []
    total_time = 0

    for query in queries:
        start_time = time.time()
        response = get_gptj_prediction(query)
        end_time = time.time()

        # model's response = predicted label
        predictions.append(response)
        total_time += (end_time - start_time)

    avg_response_time = total_time / len(queries)

    return predictions, avg_response_time

In [None]:
predictions, avg_response_time = evaluate_gptj(queries, true_labels)

# Calculate metrics for GPT-J
accuracy = accuracy_score(true_labels, predictions)
precision = precision_score(true_labels, predictions, average='weighted')
recall = recall_score(true_labels, predictions, average='weighted')
f1 = f1_score(true_labels, predictions, average='weighted')

print("GPT-J Model Performance:")
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")
print(f"Average Response Time: {avg_response_time:.2f} seconds")