<a href="https://colab.research.google.com/github/jeffreyong15/Counsel.NLP/blob/main/Baseline%20Experiment/Baseline%20Evaluation/Baseline_Evaluation(260)%5BGPTJ_EDIT%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Library

In [None]:
# !pip install sentence-transformers==3.1.1
# !pip install transformers==4.45.2
!pip install datasets
!pip install rouge-score
!pip install bitsandbytes
# !pip install peft

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
!pip install accelerate==0.34.2

Collecting accelerate==0.34.2
  Downloading accelerate-0.34.2-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-0.34.2-py3-none-any.whl (324 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.3.0
    Uninstalling accelerate-1.3.0:
      Successfully uninstalled accelerate-1.3.0
Successfully installed accelerate-0.34.2


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, BitsAndBytesConfig
import transformers
import torch
import os
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from datasets import Dataset
from torch.nn import CrossEntropyLoss
from peft import get_peft_model, LoraConfig
import bitsandbytes as bnb
# SECURITY: access tokens must never be committed to the notebook. The tokens
# that used to be hardcoded here are exposed and should be revoked at
# https://huggingface.co/settings/tokens.
# Provide HF_TOKEN through the environment (or Colab secrets) before running.
if not os.environ.get("HF_TOKEN"):
    print("HF_TOKEN is not set; continuing with anonymous Hugging Face Hub access.")

## Data Preprocessing

In [None]:
# Load the dataset
df = pd.read_csv("academic_advising_data.csv")
df.head()

Unnamed: 0,Prompt,Response,Category
0,When is CS362 offered?,CS362 is offered every Fall and Spring.,Course Scheduling
1,When is CS105 offered?,CS105 is offered every Summer.,Course Scheduling
2,Can I switch to a double major?,"Yes, you can discuss this option with your adv...",Changing Major
3,How do I register for next semester?,You can register through the online portal sta...,Course Scheduling
4,Do I need elective credits to graduate?,"Yes, you need at least 20 elective credits.",Graduation Requirements


In [None]:
# Map every distinct response string to an integer id, stored in a new `label` column
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Response'])

# 80% train, then the remaining 20% is halved into validation and test;
# both splits are stratified on Category
train_data, temp_data = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['Category']
)
val_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, stratify=temp_data['Category']
)

for split_name, split_frame in (
    ("Training set:", train_data),
    ("Validation set:", val_data),
    ("Test set:", test_data),
):
    print(split_name, split_frame.shape)

Training set: (8000, 4)
Validation set: (1000, 4)
Test set: (1000, 4)


In [None]:
# TF-IDF features: the vocabulary (capped at 5000 terms) is fitted on the
# training prompts only and reused for the validation and test prompts
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['Prompt']).toarray()
X_val, X_test = (
    vectorizer.transform(split['Prompt']).toarray() for split in (val_data, test_data)
)

# The classification target is the raw response text of each split
y_train = train_data['Response']
y_val = val_data['Response']
y_test = test_data['Response']

## Baseline Models: Naive Bayes and KNN

In [None]:
# Score a set of predictions against the ground truth
def evaluate_model(preds, y_true):
    """Return accuracy plus weighted precision, recall and F1 for `preds`.

    Args:
        preds: Predicted labels.
        y_true: Ground-truth labels, aligned with `preds`.

    Returns:
        dict with the keys "Accuracy", "Precision", "Recall" and "F1 Score".
        The last three use average="weighted" and zero_division=0.
    """
    weighted = {"average": "weighted", "zero_division": 0}
    report = {"Accuracy": accuracy_score(y_true, preds)}
    for metric_name, metric_fn in (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    ):
        report[metric_name] = metric_fn(y_true, preds, **weighted)
    return report

In [None]:
# "Response Time (s)" is reported as the average prediction latency per test
# query so that it is comparable with the per-query generation time measured
# for the language models. (Timing `fit` would measure training, not response.)

# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
start = time.perf_counter()
nb_preds = nb_model.predict(X_test)
nb_time = (time.perf_counter() - start) / len(y_test)
nb_metrics = evaluate_model(nb_preds, y_test)

# KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)
start = time.perf_counter()
knn_preds = knn_model.predict(X_test)
knn_time = (time.perf_counter() - start) / len(y_test)
knn_metrics = evaluate_model(knn_preds, y_test)

# Summary table; the GPT-J row is a placeholder until that model is evaluated
metrics_data = {
    'Model': ['Naive Bayes', 'KNN', 'GPT-J'],
    'Accuracy': [nb_metrics['Accuracy'], knn_metrics['Accuracy'], 'In Progress'],
    'Precision': [nb_metrics['Precision'], knn_metrics['Precision'], 'In Progress'],
    'Recall': [nb_metrics['Recall'], knn_metrics['Recall'], 'In Progress'],
    'F1 Score': [nb_metrics['F1 Score'], knn_metrics['F1 Score'], 'In Progress'],
    'Response Time (s)': [nb_time, knn_time, 'In Progress']
}

metrics_df = pd.DataFrame(metrics_data)
metrics_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,0.632306
1,KNN,0.732,0.703447,0.732,0.71157,0.027186
2,GPT-J,In Progress,In Progress,In Progress,In Progress,In Progress


In [None]:
# Prompts and reference responses used to score the generative models.
# They come from the validation split (this cell previously read test_data).
queries = list(val_data["Prompt"])
true_labels = list(val_data["Response"])

## Baseline Models: Pretrained GPT-2 (no fine-tuning)

In [None]:
# NOTE: despite the `gptj_` prefix, this baseline loads the "gpt2" checkpoint
baseline_checkpoint = "gpt2"
gptj_tokenizer = AutoTokenizer.from_pretrained(baseline_checkpoint)
gptj_model = AutoModelForCausalLM.from_pretrained(baseline_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Reuse EOS as the padding token so the tokenizer can pad
gptj_tokenizer.pad_token = gptj_tokenizer.eos_token


# Function to get the baseline model's prediction for each query
def get_gptj_prediction(query, max_length=50):
    """Generate text for `query` with the pretrained baseline model.

    Args:
        query: Prompt text fed to the model.
        max_length: Forwarded to `generate` as `max_length`.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    inputs = gptj_tokenizer(query, return_tensors="pt", padding=True)
    # Keep the inputs on the model's device (mirrors get_gpt_fine_prediction);
    # otherwise generation fails once the model lives on a GPU.
    inputs = inputs.to(gptj_model.device)
    with torch.no_grad():
        outputs = gptj_model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            pad_token_id=gptj_tokenizer.eos_token_id,  # pad with EOS
        )
    return gptj_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
# Get predictions and average response time
def evaluate_gptj(queries, true_labels):
    """Generate a prediction for every query and measure the mean latency.

    Args:
        queries: Iterable of prompt strings.
        true_labels: Not used here; accepted so the function matches the
            `evaluation_func(queries, true_labels)` call made by get_metrics.

    Returns:
        (predictions, avg_response_time): the generated texts in query order
        and the mean seconds per `get_gptj_prediction` call (0.0 when there
        are no queries).
    """
    predictions = []
    total_time = 0.0

    for query in tqdm(queries, desc="Processing Queries"):
        start_time = time.perf_counter()
        response = get_gptj_prediction(query)
        total_time += time.perf_counter() - start_time
        # The raw generated text is used directly as the predicted response
        predictions.append(response)

    # Avoid a ZeroDivisionError when no queries were supplied
    avg_response_time = total_time / len(predictions) if predictions else 0.0

    return predictions, avg_response_time

In [None]:
# Load the sentence-embedding model; calculate_similarity() reads it through
# the notebook-level name `model`
model = SentenceTransformer("all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def calculate_similarity(expected_output, actual_output):
    """Cosine similarity between the sentence embeddings of two texts.

    Both texts are encoded with the notebook-level `model`.
    """
    embeddings = [[model.encode(text)] for text in (expected_output, actual_output)]
    return cosine_similarity(*embeddings)[0][0]

def calculate_bleu(reference, candidate):
    """Sentence-level BLEU of `candidate` against a single `reference`.

    Both strings are tokenized on whitespace and the score is smoothed with
    `SmoothingFunction().method4`.
    """
    return sentence_bleu(
        [reference.split()],
        candidate.split(),
        smoothing_function=SmoothingFunction().method4,
    )

def calculate_rouge(reference, candidate):
    """Return (precision, recall, F1) of ROUGE-L for `candidate` vs `reference`.

    A stemming `RougeScorer` restricted to 'rougeL' is created on every call.
    """
    rouge_l = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(
        reference, candidate
    )['rougeL']
    return rouge_l.precision, rouge_l.recall, rouge_l.fmeasure


def get_metrics(queries, true_labels, evaluation_func):
    """Run a model over `queries` and score its outputs against `true_labels`.

    Args:
        queries: Prompts passed to `evaluation_func`.
        true_labels: Reference responses, aligned with `queries`.
        evaluation_func: Callable `(queries, true_labels)` returning
            `(predictions, avg_response_time)`.

    Returns:
        (avg_response_time, bleu_score, cos_sim, rouge_l_precision,
        rouge_l_recall, rouge_l_f1), each metric averaged over the predictions.

    Raises:
        ValueError: If `evaluation_func` produced no predictions.
    """
    predictions, avg_response_time = evaluation_func(queries, true_labels)
    if len(predictions) == 0:
        raise ValueError("evaluation_func returned no predictions to score")

    # Score against the `true_labels` argument. The previous version read the
    # notebook-level `true_labels_subset`, which broke for any other input.
    pairs = list(zip(true_labels, predictions))

    cos_sim = np.mean([calculate_similarity(ref, pred) for ref, pred in pairs])
    bleu_score = np.mean([calculate_bleu(ref, pred) for ref, pred in pairs])

    # One (precision, recall, f1) row per prediction, averaged column-wise
    rouge_rows = np.array([calculate_rouge(ref, pred) for ref, pred in pairs])
    rouge_l_precision, rouge_l_recall, rouge_l_f1 = rouge_rows.mean(axis=0)

    return avg_response_time, bleu_score, cos_sim, rouge_l_precision, rouge_l_recall, rouge_l_f1

# Pretrained Model Results

In [None]:
# Score only the first 100 examples
queries_subset, true_labels_subset = queries[:100], true_labels[:100]

# Hardcoded results, presumably recorded from an earlier run of the call below
# (confirm before reporting):
#   ... = get_metrics(queries_subset, true_labels_subset, evaluate_gptj)
(
    avg_response_time_pre,
    bleu_score_pre,
    cos_sim_pre,
    rouge_l_precision_pre,
    rouge_l_recall_pre,
    rouge_l_f1_pre,
) = (3.608155, 0.029, 0.5742, 0.0986, 0.3918, 0.1562)

for metric_label, metric_value in (
    ("BLEU Score", bleu_score_pre),
    ("Cosine Similarity", cos_sim_pre),
    ("Rouge-L Precision", rouge_l_precision_pre),
    ("Rouge-L Recall", rouge_l_recall_pre),
    ("Rouge-L F1 Score", rouge_l_f1_pre),
):
    print(f"{metric_label}: {metric_value * 100:.2f}%")

BLEU Score: 2.90%
Cosine Similarity: 57.42%
Rouge-L Precision: 9.86%
Rouge-L Recall: 39.18%
Rouge-L F1 Score: 15.62%


In [None]:
# Classification baselines only: one row per classifier, one column per metric
_classifier_results = (
    ('Naive Bayes', nb_metrics, nb_time),
    ('KNN', knn_metrics, knn_time),
)

metrics_classification_data = {'Model': [name for name, _, _ in _classifier_results]}
for column in ('Accuracy', 'Precision', 'Recall', 'F1 Score'):
    metrics_classification_data[column] = [scores[column] for _, scores, _ in _classifier_results]
metrics_classification_data['Response Time (s)'] = [elapsed for _, _, elapsed in _classifier_results]

metrics_classification_df = pd.DataFrame(metrics_classification_data)
metrics_classification_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,Response Time (s)
0,Naive Bayes,0.703,0.592857,0.703,0.623042,0.632306
1,KNN,0.732,0.703447,0.732,0.71157,0.027186


In [None]:
# Generation metrics per language model. The evaluated checkpoint is the base
# "gpt2" model, so the rows are labelled "GPT2" (not "GPT2-Large"), matching
# the results table built after fine-tuning. Rows without results yet hold
# the "In Progress" placeholder.
metrics_llm_data = {
    'Model': ['GPT2 [Pretrained]', 'GPT2 [Fine Tuned]', 'GPT J'],
    'BLEU Score': [bleu_score_pre, "In Progress", "In Progress"],
    'Cosine Similarity': [cos_sim_pre, "In Progress", "In Progress"],
    'Rouge-L Precision': [rouge_l_precision_pre, "In Progress", "In Progress"],
    'Rouge-L Recall': [rouge_l_recall_pre, "In Progress", "In Progress"],
    'Rouge-L F1 Score': [rouge_l_f1_pre, "In Progress", "In Progress"],
    'Response Time (s)': [avg_response_time_pre, "In Progress", "In Progress"]
}

metrics_llm_df = pd.DataFrame(metrics_llm_data)
metrics_llm_df

Unnamed: 0,Model,BLEU Score,Cosine Similarity,Rouge-L Precision,Rouge-L Recall,Rouge-L F1 Score,Response Time (s)
0,GPT2-Large[Pretrained],0.029,0.5742,0.0986,0.3918,0.1562,3.608155
1,GPT2-Large [Fine Tuned],In Progress,In Progress,In Progress,In Progress,In Progress,In Progress
2,GPT J,In Progress,In Progress,In Progress,In Progress,In Progress,In Progress


# Fine-Tuned Model Training

In [None]:
gpt_model_fine = AutoModelForCausalLM.from_pretrained("gpt2")

# Use AutoTokenizer, which the notebook imports; `GPT2Tokenizer` is never
# imported, so referencing it raises NameError on a fresh kernel.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to EOS for GPT-2

# Build and tokenize the training text for one DataFrame row
def preprocess_row(row):
    """Turn one prompt/response row into a tokenized training example.

    Args:
        row: Mapping with 'Prompt' and 'Response' entries.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'combined' (the text that
        was tokenized), in that order.
    """
    # Prompt and response are joined by " ==> ". The line break before
    # <|endoftext|> is part of the text: evaluate_gpt_fine cuts the generated
    # answer at the first '\n', so keep this literal unchanged.
    combined_text = f"""<|startoftext|>{row['Prompt']} ==> {row['Response']}
    <|endoftext|>"""

    encoding = tokenizer(combined_text, padding=True, truncation=True)
    return dict(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        combined=combined_text,
    )

# Tokenize every row of the train and test splits
tokenized_data = train_data.apply(preprocess_row, axis=1, result_type="expand")
tokenized_test_data = test_data.apply(preprocess_row, axis=1, result_type="expand")

# Copy the three generated columns back onto the split they came from
for frame, encoded in ((train_data, tokenized_data), (test_data, tokenized_test_data)):
    for column in ("input_ids", "attention_mask", "combined"):
        frame[column] = encoded[column]

In [None]:
gpt_model_fine

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Freeze every base-model weight before attaching the LoRA adapters
gpt_model_fine.requires_grad_(False)

config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

gpt_model_fine = get_peft_model(gpt_model_fine, config)

# Count the parameters, and how many of them will receive gradients
param_sizes = [(p.numel(), p.requires_grad) for p in gpt_model_fine.parameters()]
all_param = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)

print(f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}" )

trainable params: 1622016 || all params: 126061824 || trainables%: 1.2866829532785438




In [None]:
train_data

Unnamed: 0,Prompt,Response,Category,label,input_ids,attention_mask,combined
8699,How do I register for next semester?,You can register through the online portal sta...,Course Scheduling,1456,"[27, 91, 9688, 1659, 5239, 91, 29, 2437, 466, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>How do I register for next seme...
3738,Can I switch to a double major?,"Yes, you can discuss this option with your adv...",Changing Major,1003,"[27, 91, 9688, 1659, 5239, 91, 29, 6090, 314, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>Can I switch to a double major?...
9278,What happens if I fail a course?,You should meet with your advisor to discuss o...,Academic Policies,1949,"[27, 91, 9688, 1659, 5239, 91, 29, 2061, 4325,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>What happens if I fail a course...
2400,What happens if I fail a course?,You should meet with your advisor to discuss o...,Academic Policies,1949,"[27, 91, 9688, 1659, 5239, 91, 29, 2061, 4325,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>What happens if I fail a course...
4237,Can I retake a course for a better grade?,"Yes, you can retake a course, and the new grad...",Academic Policies,1004,"[27, 91, 9688, 1659, 5239, 91, 29, 6090, 314, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>Can I retake a course for a bet...
...,...,...,...,...,...,...,...
9874,What is the grading scale?,"The grading scale is A, B, C, D, and F.",Academic Policies,713,"[27, 91, 9688, 1659, 5239, 91, 29, 2061, 318, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>What is the grading scale? ==> ...
6209,When is CS160 offered?,CS160 is offered every Fall and Spring.,Course Scheduling,59,"[27, 91, 9688, 1659, 5239, 91, 29, 2215, 318, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>When is CS160 offered? ==> CS16...
9106,What are the prerequisites for CS328?,You need to complete CS320 and CS295.,Prerequisites,1737,"[27, 91, 9688, 1659, 5239, 91, 29, 2061, 389, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>What are the prerequisites for ...
2692,What are the steps to change my major?,Fill out a change of major form and get approv...,Changing Major,431,"[27, 91, 9688, 1659, 5239, 91, 29, 2061, 389, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",<|startoftext|>What are the steps to change my...


In [None]:
# Wrap the tokenized train/test frames as Hugging Face datasets
_dataset_columns = ['combined', 'input_ids', 'attention_mask']
dataset, test_dataset = (
    Dataset.from_pandas(frame[_dataset_columns]) for frame in (train_data, test_data)
)

In [None]:
dataset

Dataset({
    features: ['combined', 'input_ids', 'attention_mask', '__index_level_0__'],
    num_rows: 8000
})

In [None]:
# dataset = Dataset.from_pandas(train_data[["input_ids", "attention_mask"]])
# test_dataset = Dataset.from_pandas(test_data[["input_ids", "attention_mask"]])

# Training configuration, grouped by concern
_batching = dict(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    per_device_eval_batch_size=4,
    eval_accumulation_steps=4,
)
_schedule = dict(
    num_train_epochs=3,
    logging_steps=100,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
)
# Checkpoints are saved and evaluated every 200 steps; the one with the best
# "loss" is reloaded when training ends
_selection = dict(load_best_model_at_end=True, metric_for_best_model="loss")

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned", **_batching, **_schedule, **_selection
)

def compute_loss(model, inputs, return_outputs=False):
    """Next-token cross-entropy loss for a causal language model.

    Args:
        model: Callable invoked as `model(**inputs)`; its result must expose
            the logits through `.get("logits")`.
        inputs: Mapping of model inputs. "labels" is used as the target when
            present, otherwise "input_ids". Positions labelled -100 are ignored.
        return_outputs: When True, return `(loss, outputs)` instead of `loss`.

    Returns:
        The scalar loss tensor, or `(loss, outputs)` if `return_outputs` is True.
    """
    labels = inputs.get("labels")
    if labels is None:
        labels = inputs.get("input_ids")
    outputs = model(**inputs)
    logits = outputs.get("logits")

    # The logits at position t score the token at position t + 1, so the last
    # logit step and the first label are dropped before comparing them.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    loss_fct = CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Compute the next-token cross-entropy loss for an evaluation pass.

    Args:
        eval_pred: Pair `(logits, labels)`. Array-likes (for example numpy
            arrays) and torch tensors are both accepted.

    Returns:
        dict with a single "loss" entry (a Python float).
    """
    logits, labels = eval_pred
    # Convert to tensors first: `.view(...)` / `.size(-1)` as used for the
    # loss are tensor methods and fail on numpy arrays.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Position t predicts token t + 1, so align logits[:-1] with labels[1:]
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    loss_fct = CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1)
    ).item()

    # Return metrics dictionary
    return {"loss": loss}

# Set up the Trainer

class CustomTrainer(Trainer):
    """Trainer variant whose loss is the notebook-level `compute_loss` function."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Delegate to the module-level `compute_loss`; extra keyword arguments are ignored."""
        return compute_loss(model=model, inputs=inputs, return_outputs=return_outputs)


# Note: the stock `Trainer` is instantiated here, not `CustomTrainer`, and
# `compute_metrics` is left disabled.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=gpt_model_fine,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=test_dataset,  # periodic evaluation runs on the test split
    callbacks=[early_stopping],
    data_collator=causal_lm_collator,
)

# Start training
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbaljot6[0m ([33mbaljot6-san-jose-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss,Validation Loss
200,1.5129,1.084413
400,0.8731,0.570157
600,0.6194,0.364697
800,0.4555,0.270892
1000,0.4009,0.238591
1200,0.3486,0.221105
1400,0.3114,0.212104
1600,0.2971,0.207491
1800,0.2688,0.208168
2000,0.2662,0.205363


TrainOutput(global_step=4200, training_loss=0.4349679924192883, metrics={'train_runtime': 366.5341, 'train_samples_per_second': 65.478, 'train_steps_per_second': 16.37, 'total_flos': 333240913465344.0, 'train_loss': 0.4349679924192883, 'epoch': 2.1})

# Fine-Tuned Model Results

In [None]:
def get_gpt_fine_prediction(query, max_length=50):
    """Generate text for `query` with the LoRA fine-tuned GPT-2 model.

    The query is encoded with `tokenizer`, the tokenizer that prepared the
    fine-tuning data, rather than the notebook-level `gptj_tokenizer`, which a
    later cell rebinds to a different model's tokenizer.

    NOTE(review): the training text wraps each prompt as
    "<|startoftext|>{Prompt} ==> {Response}" (see preprocess_row), while the
    query is passed here unmodified. Confirm this is intended.

    Args:
        query: Prompt text fed to the model.
        max_length: Forwarded to `generate` as `max_length`.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    inputs = tokenizer(query, return_tensors="pt", padding=True)
    inputs = inputs.to(gpt_model_fine.device)
    with torch.no_grad():
        outputs = gpt_model_fine.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,  # pad with EOS
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def _extract_fine_tuned_answer(generated_text):
    """Return the text after the first '> ' up to the next line break."""
    _, separator, answer = generated_text.partition('> ')
    if not separator:
        # No separator was generated: keep the whole text instead of slicing
        # from the -1 that str.find would have returned.
        answer = generated_text
    # Without a line break the complete answer is kept (slicing up to a -1
    # end index used to drop the last character).
    return answer.split('\n', 1)[0]


def evaluate_gpt_fine(queries, true_labels):
    """Generate an answer for every query with the fine-tuned model.

    Args:
        queries: Iterable of prompt strings.
        true_labels: Not used here; accepted so the function matches the
            `evaluation_func(queries, true_labels)` call made by get_metrics.

    Returns:
        (predictions, avg_response_time): the extracted answers in query order
        and the mean seconds per `get_gpt_fine_prediction` call (0.0 when there
        are no queries).
    """
    predictions = []
    total_time = 0.0

    for query in tqdm(queries, desc="Processing Queries"):
        start_time = time.perf_counter()
        response = get_gpt_fine_prediction(query)
        total_time += time.perf_counter() - start_time

        # Keep only the answer part of the generated text as the prediction
        fine_tuned_response = _extract_fine_tuned_answer(response)
        print(fine_tuned_response)
        predictions.append(fine_tuned_response)

    # Avoid a ZeroDivisionError when no queries were supplied
    avg_response_time = total_time / len(predictions) if predictions else 0.0

    return predictions, avg_response_time

In [None]:
# Evaluate the fine-tuned GPT-2 model on the same subset as the pretrained baseline
(
    avg_response_time_fine,
    bleu_score_fine,
    cos_sim_fine,
    rouge_l_precision_fine,
    rouge_l_recall_fine,
    rouge_l_f1_fine,
) = get_metrics(queries_subset, true_labels_subset, evaluate_gpt_fine)

for metric_label, metric_value in (
    ("BLEU Score", bleu_score_fine),
    ("Cosine Similarity", cos_sim_fine),
    ("Rouge-L Precision", rouge_l_precision_fine),
    ("Rouge-L Recall", rouge_l_recall_fine),
    ("Rouge-L F1 Score", rouge_l_f1_fine),
):
    print(f"{metric_label}: {metric_value * 100:.2f}%")

Processing Queries:   1%|          | 1/100 [00:00<01:10,  1.41it/s]

Yes, you need at least 40 elective credits.


Processing Queries:   2%|▏         | 2/100 [00:01<01:12,  1.35it/s]

CS477 is offered every Fall and Spring.


Processing Queries:   3%|▎         | 3/100 [00:02<01:10,  1.37it/s]

Fill out a change of major form and get approval from your advisor.


Processing Queries:   3%|▎         | 3/100 [00:02<01:20,  1.21it/s]


KeyboardInterrupt: 

In [None]:
# Pretrained vs fine-tuned scores per metric; GPT J is still a placeholder
_llm_metric_columns = {
    'BLEU Score': (bleu_score_pre, bleu_score_fine),
    'Cosine Similarity': (cos_sim_pre, cos_sim_fine),
    'Rouge-L Precision': (rouge_l_precision_pre, rouge_l_precision_fine),
    'Rouge-L Recall': (rouge_l_recall_pre, rouge_l_recall_fine),
    'Rouge-L F1 Score': (rouge_l_f1_pre, rouge_l_f1_fine),
    'Response Time (s)': (avg_response_time_pre, avg_response_time_fine),
}

metrics_llm_data = {'Model': ['GPT2 [Pretrained]','GPT2 [Fine Tuned]', 'GPT J']}
metrics_llm_data.update(
    {
        column: [pretrained, fine_tuned, "In Progress"]
        for column, (pretrained, fine_tuned) in _llm_metric_columns.items()
    }
)

metrics_llm_df = pd.DataFrame(metrics_llm_data)
metrics_llm_df

Unnamed: 0,Model,BLEU Score,Cosine Similarity,Rouge-L Precision,Rouge-L Recall,Rouge-L F1 Score,Response Time (s)
0,GPT2 [Pretrained],0.029,0.5742,0.0986,0.3918,0.1562,3.608155
1,GPT2 [Fine Tuned],0.714037,0.824349,0.766786,0.764071,0.763817,0.659231
2,GPT J,In Progress,In Progress,In Progress,In Progress,In Progress,In Progress


# GPT-J Fine Tuned

In [None]:
# Independent copies of the prompt/response columns for the GPT-J run
_qa_columns = ['Prompt', 'Response']
gpt_train, gpt_test = (split[_qa_columns].copy() for split in (train_data, test_data))

In [None]:
# 4-bit NF4 quantization with double quantization and bfloat16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

gptj_checkpoint = "EleutherAI/gpt-j-6B"
gptj = AutoModelForCausalLM.from_pretrained(
    gptj_checkpoint,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# This rebinds `gptj_tokenizer`, which earlier cells bound to the "gpt2" tokenizer
gptj_tokenizer = AutoTokenizer.from_pretrained(gptj_checkpoint)
gptj_tokenizer.pad_token = gptj_tokenizer.eos_token


# Build and tokenize the training text for one DataFrame row (GPT-J tokenizer).
# This redefines the earlier `preprocess_row`, which used `tokenizer`.
def preprocess_row(row):
    """Turn one prompt/response row into a tokenized training example.

    Args:
        row: Mapping with 'Prompt' and 'Response' entries.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'combined' (the text that
        was tokenized), in that order.
    """
    # Prompt and response are joined by " ==> "; the line break before
    # <|endoftext|> is part of the text and is kept as is.
    combined_text = f"""<|startoftext|>{row['Prompt']} ==> {row['Response']}
    <|endoftext|>"""

    encoding = gptj_tokenizer(combined_text, padding=True, truncation=True)
    return dict(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        combined=combined_text,
    )

# Tokenize every row of the GPT-J train and test frames
tokenized_data = gpt_train.apply(preprocess_row, axis=1, result_type="expand")
tokenized_test_data = gpt_test.apply(preprocess_row, axis=1, result_type="expand")

# Copy the three generated columns back onto the frame they came from
for frame, encoded in ((gpt_train, tokenized_data), (gpt_test, tokenized_test_data)):
    for column in ("input_ids", "attention_mask", "combined"):
        frame[column] = encoded[column]

config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Some weights of the model checkpoint at EleutherAI/gpt-j-6B were not used when initializing GPTJForCausalLM: ['transformer.h.0.attn.bias', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.10.attn.bias', 'transformer.h.10.attn.masked_bias', 'transformer.h.11.attn.bias', 'transformer.h.11.attn.masked_bias', 'transformer.h.12.attn.bias', 'transformer.h.12.attn.masked_bias', 'transformer.h.13.attn.bias', 'transformer.h.13.attn.masked_bias', 'transformer.h.14.attn.bias', 'transformer.h.14.attn.masked_bias', 'transformer.h.15.attn.bias', 'transformer.h.15.attn.masked_bias', 'transformer.h.16.attn.bias', 'transformer.h.16.attn.masked_bias', 'transformer.h.17.attn.bias', 'transformer.h.17.attn.masked_bias', 'transformer.h.18.attn.bias', 'transformer.h.18.attn.masked_bias', 'transformer.h.19.attn.bias', 'transformer.h.19.attn.masked_bias', 'transformer.h.2.attn.bias', 'transformer.h.2.attn.masked_bias', 'transformer.h.20.attn.bi

tokenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [None]:
gptj

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): Embedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-27): 28 x GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (out_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): GPTJMLP(
          (fc_in): Linear4bit(in_features=4096, out_features=16384, bias=True)
          (fc_out): Linear4bit(in_features=16384, out_features=4096, bias=True)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )


In [None]:
# Freeze the GPT-J base weights before attaching the LoRA adapters.
# (This loop previously iterated over `gpt_model_fine`, the GPT-2 model from
# the section above, which froze that model instead of `gptj`.)
for param in gptj.parameters():
    param.requires_grad = False

config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")

gptj_model_fine = get_peft_model(gptj, config)

# Count the parameters, and how many of them will receive gradients
trainable_params = 0
all_param = 0
for _, param in gptj_model_fine.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()

print(f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}" )

trainable params: 7340032 || all params: 3239650528 || trainables%: 0.2265686356155018


In [None]:
# Wrap the tokenized GPT-J train/test frames as Hugging Face datasets
_dataset_columns = ['combined', 'input_ids', 'attention_mask']
dataset, test_dataset = (
    Dataset.from_pandas(frame[_dataset_columns]) for frame in (gpt_train, gpt_test)
)

In [None]:
# dataset = Dataset.from_pandas(train_data[["input_ids", "attention_mask"]])
# test_dataset = Dataset.from_pandas(test_data[["input_ids", "attention_mask"]])

# Training configuration for the GPT-J run, grouped by concern
_batching = dict(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,
    per_device_eval_batch_size=4,
    eval_accumulation_steps=4,
)
_schedule = dict(
    num_train_epochs=3,
    logging_steps=100,
    save_steps=200,
    eval_strategy="steps",
    eval_steps=200,
)
# Checkpoints are saved and evaluated every 200 steps; the one with the best
# "loss" is reloaded when training ends
_selection = dict(load_best_model_at_end=True, metric_for_best_model="loss")

training_args = TrainingArguments(
    output_dir="./gptj-finetuned", **_batching, **_schedule, **_selection
)

def compute_loss(model, inputs, return_outputs=False):
    """Compute the next-token cross-entropy loss for a causal language model.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose
            the logits through ``.get("logits")``.
        inputs: Mapping of model inputs. ``inputs["labels"]`` is used as the
            target when present, otherwise ``inputs["input_ids"]``.
        return_outputs: When true, return ``(loss, outputs)`` instead of just
            the loss.

    Returns:
        The scalar loss tensor, or a ``(loss, outputs)`` tuple.
    """
    labels = inputs.get("labels")
    if labels is None:
        labels = inputs.get("input_ids")
    outputs = model(**inputs)
    logits = outputs.get("logits")

    # The logits at position t are the prediction for token t + 1, so drop the
    # last prediction and the first label before comparing. Without this shift
    # the loss would reward the model for reproducing the current token.
    # Upcast to float32 so the softmax is not evaluated in half precision.
    shift_logits = logits[..., :-1, :].float()
    shift_labels = labels[..., 1:].to(shift_logits.device)

    loss_fct = CrossEntropyLoss(ignore_index=-100)
    # reshape (not view): the shifted slices are no longer contiguous.
    loss = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    )
    return (loss, outputs) if return_outputs else loss

def compute_metrics(eval_pred):
    """Compute the next-token cross-entropy loss from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The elements may be NumPy
            arrays or torch tensors; if ``logits`` is itself a tuple, its
            first element is used.

    Returns:
        A dict ``{"loss": <float>}``.
    """
    import torch  # local import so this block stands alone

    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]

    # Convert to tensors first: the previous code called the torch-only
    # `.view(...)` / `.size(-1)` directly, which fails on NumPy arrays.
    logits = torch.as_tensor(logits).float()
    labels = torch.as_tensor(labels).long()

    # Position t predicts token t + 1, so align predictions with the next label.
    shift_logits = logits[..., :-1, :]
    shift_labels = labels[..., 1:]

    loss_fct = CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(
        shift_logits.reshape(-1, shift_logits.size(-1)),
        shift_labels.reshape(-1),
    ).item()

    return {"loss": loss}

# Set up the Trainer

class CustomTrainer(Trainer):
    """Trainer subclass that delegates its loss to the module-level ``compute_loss``."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Any extra keyword arguments are accepted but not forwarded.
        result = compute_loss(model, inputs, return_outputs=return_outputs)
        return result


# Early stopping with a patience of 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Language-modelling collator with masked-LM mode switched off.
lm_collator = transformers.DataCollatorForLanguageModeling(gptj_tokenizer, mlm=False)

# The stock Trainer is used here, so the model's own loss is what gets
# optimised; CustomTrainer and compute_metrics are not wired in.
trainer = Trainer(
    model=gptj_model_fine,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=test_dataset,
    # compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    data_collator=lm_collator,
)

# Kick off fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
200,0.2198,0.214722
400,0.1941,0.20459
600,0.2003,0.19873
800,0.1942,0.195068
1000,0.2098,0.196289
1200,0.191,0.194458
1400,0.1943,0.193481
1600,0.1924,0.192505
1800,0.1775,0.191895
2000,0.1879,0.190918


TrainOutput(global_step=4200, training_loss=0.21534694126674106, metrics={'train_runtime': 1674.4006, 'train_samples_per_second': 14.333, 'train_steps_per_second': 3.583, 'total_flos': 2.2497676737202944e+16, 'train_loss': 0.21534694126674106, 'epoch': 2.1})

In [None]:
def get_gptj_fine_prediction(query, max_length=50):
    """Generate text for ``query`` with the fine-tuned GPT-J model.

    Args:
        query: Prompt text passed to ``gptj_tokenizer``.
        max_length: Forwarded to ``generate`` as ``max_length``.
            NOTE(review): in Hugging Face ``generate`` this limit normally
            counts prompt tokens too — confirm it is large enough for long
            prompts.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    # Tokenise (with padding) and move the tensors to the model's device.
    encoded = gptj_tokenizer(query, return_tensors="pt", padding=True)
    encoded = encoded.to(gptj_model_fine.device)

    # Generation runs without gradient tracking.
    with torch.no_grad():
        generated = gptj_model_fine.generate(
            encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_length=max_length,
            # The EOS token id is reused as the pad token id.
            pad_token_id=gptj_tokenizer.eos_token_id,
        )

    return gptj_tokenizer.decode(generated[0], skip_special_tokens=True)

def _extract_gptj_answer(query, response):
    """Return the first sentence of ``response`` that follows the prompt."""
    if isinstance(query, str) and response.startswith(query):
        # The decoded output begins with the prompt: drop it wholesale so that
        # '?' or '.' characters inside the question cannot confuse the slicing.
        continuation = response[len(query):]
    else:
        # Fallback: treat everything after the first '?' as the answer
        # (when there is no '?', find() returns -1 and the whole text is kept).
        continuation = response[response.find('?') + 1:]

    # Keep text up to and including the first period. If there is no period,
    # keep the whole continuation rather than returning an empty string.
    sentence_end = continuation.find('.')
    if sentence_end != -1:
        continuation = continuation[:sentence_end + 1]
    return continuation.strip()


def evaluate_gptj_fine(queries, true_labels):
    """Generate an answer for every query and time the generation calls.

    Args:
        queries: Iterable of prompt strings.
        true_labels: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Returns:
        ``(predictions, avg_response_time)``: the list of extracted answer
        strings, and the mean wall-clock seconds spent per generation call
        (``0.0`` when there are no queries).
    """
    predictions = []
    total_time = 0.0

    for query in tqdm(queries, desc="Processing Queries"):
        # Time only the model call, not the post-processing.
        start_time = time.perf_counter()
        response = get_gptj_fine_prediction(query)
        total_time += time.perf_counter() - start_time

        # model's response = predicted label
        answer = _extract_gptj_answer(query, response)
        print(answer)
        predictions.append(answer)

    # Guard against an empty query list instead of dividing by zero.
    avg_response_time = total_time / len(predictions) if predictions else 0.0

    return predictions, avg_response_time

In [None]:
# Run the metric pipeline for the fine-tuned GPT-J evaluator on the query
# subset, then report the similarity scores as percentages.
(
    gptj_avg_response_time_fine,
    gptj_bleu_score_fine,
    gptj_cos_sim_fine,
    gptj_rouge_l_precision_fine,
    gptj_rouge_l_recall_fine,
    gptj_rouge_l_f1_fine,
) = get_metrics(queries_subset, true_labels_subset, evaluate_gptj_fine)

print(f"BLEU Score: {gptj_bleu_score_fine * 100:.2f}%")
print(f"Cosine Similarity: {gptj_cos_sim_fine * 100:.2f}%")
print(f"Rouge-L Precision: {gptj_rouge_l_precision_fine * 100:.2f}%")
print(f"Rouge-L Recall: {gptj_rouge_l_recall_fine * 100:.2f}%")
print(f"Rouge-L F1 Score: {gptj_rouge_l_f1_fine * 100:.2f}%")

Processing Queries:   1%|          | 1/100 [00:03<06:19,  3.83s/it]

Yes, you need at least 30 elective credits.


Processing Queries:   2%|▏         | 2/100 [00:07<06:25,  3.93s/it]

CS477 is offered every Fall and Spring.


Processing Queries:   3%|▎         | 3/100 [00:11<06:12,  3.84s/it]

Changing your major requires approval from the academic advisor and the chair of the major change committee.


Processing Queries:   4%|▍         | 4/100 [00:15<06:07,  3.83s/it]

Core requirements include English 101, Math 121, Science 121, and a required core course.


Processing Queries:   5%|▌         | 5/100 [00:19<06:03,  3.82s/it]

The Senior project is a comprehensive research project that requires a significant amount of time and effort.


Processing Queries:   6%|▌         | 6/100 [00:22<05:56,  3.79s/it]

The Senior Project course should be taken in your final semester.


Processing Queries:   7%|▋         | 7/100 [00:26<05:51,  3.78s/it]

Yes, you need at least 30 elective credits.


Processing Queries:   8%|▊         | 8/100 [00:30<05:48,  3.79s/it]

Can I take courses for graduation without taking the CS106 course?

Yes, you can take courses for graduation without taking the CS106 course.


Processing Queries:   9%|▉         | 9/100 [00:34<05:42,  3.77s/it]

Changing your major requires approval from the academic advisor and the chair of the major change committee.


Processing Queries:  10%|█         | 10/100 [00:38<05:45,  3.84s/it]

The grading scale is a 4-point scale.


Processing Queries:  11%|█         | 11/100 [00:41<05:40,  3.82s/it]

You need a total of 120 credits to graduate.


Processing Queries:  12%|█▏        | 12/100 [00:45<05:33,  3.79s/it]

No, you cannot take CS166 without CS338.


Processing Queries:  13%|█▎        | 13/100 [00:49<05:27,  3.77s/it]

The Senior Project course should be taken in your final semester.


Processing Queries:  14%|█▍        | 14/100 [00:53<05:29,  3.84s/it]

The grading scale is a 4-point scale.


Processing Queries:  15%|█▌        | 15/100 [00:57<05:22,  3.80s/it]

Yes, there is a study group for CS235.


Processing Queries:  16%|█▌        | 16/100 [01:00<05:19,  3.80s/it]




Processing Queries:  17%|█▋        | 17/100 [01:04<05:15,  3.80s/it]

Yes, you can switch to a double major.


Processing Queries:  18%|█▊        | 18/100 [01:08<05:11,  3.79s/it]

You can retake a course for free.


Processing Queries:  19%|█▉        | 19/100 [01:12<05:14,  3.88s/it]

Yes, summer courses are available for students who are interested in a specific subject.


Processing Queries:  20%|██        | 20/100 [01:16<05:12,  3.91s/it]

The grading scale is a 4-point scale.


Processing Queries:  21%|██        | 21/100 [01:20<05:10,  3.94s/it]

CS286 is offered every Fall and Spring.


Processing Queries:  22%|██▏       | 22/100 [01:24<05:03,  3.89s/it]

The Senior project is a comprehensive research project that requires a significant amount of time and effort.


Processing Queries:  23%|██▎       | 23/100 [01:28<04:58,  3.88s/it]

Yes, you need at least 30 elective credits.


Processing Queries:  24%|██▍       | 24/100 [01:32<04:53,  3.87s/it]

You can retake a course for free.


Processing Queries:  25%|██▌       | 25/100 [01:35<04:46,  3.82s/it]

The Senior Project course should be taken in your final semester.


Processing Queries:  26%|██▌       | 26/100 [01:39<04:42,  3.82s/it]

Yes, you can switch to a double major.


Processing Queries:  26%|██▌       | 26/100 [01:42<04:50,  3.93s/it]


KeyboardInterrupt: 

In [None]:
# Side-by-side comparison of the three models; every list is ordered as
# [GPT-2 pretrained, GPT-2 fine-tuned, GPT-J fine-tuned].
metrics_llm_data = {
    'Model': [
        'GPT2 [Pretrained]',
        'GPT2 [Fine Tuned]',
        'GPT J [Fine Tuned]',
    ],
    'BLEU Score': [
        bleu_score_pre,
        bleu_score_fine,
        gptj_bleu_score_fine,
    ],
    'Cosine Similarity': [
        cos_sim_pre,
        cos_sim_fine,
        gptj_cos_sim_fine,
    ],
    'Rouge-L Precision': [
        rouge_l_precision_pre,
        rouge_l_precision_fine,
        gptj_rouge_l_precision_fine,
    ],
    'Rouge-L Recall': [
        rouge_l_recall_pre,
        rouge_l_recall_fine,
        gptj_rouge_l_recall_fine,
    ],
    'Rouge-L F1 Score': [
        rouge_l_f1_pre,
        rouge_l_f1_fine,
        gptj_rouge_l_f1_fine,
    ],
    'Response Time (s)': [
        avg_response_time_pre,
        avg_response_time_fine,
        gptj_avg_response_time_fine,
    ],
}

# One row per model, one column per metric; the bare last expression renders
# the table as the cell output.
metrics_llm_df = pd.DataFrame(data=metrics_llm_data)
metrics_llm_df

Unnamed: 0,Model,BLEU Score,Cosine Similarity,Rouge-L Precision,Rouge-L Recall,Rouge-L F1 Score,Response Time (s)
0,GPT2 [Pretrained],0.029,0.5742,0.0986,0.3918,0.1562,3.608155
1,GPT2 [Fine Tuned],0.714037,0.824349,0.766786,0.764071,0.763817,0.659231
2,GPT J [Fine Tuned],0.266813,0.693709,0.493334,0.51228,0.490645,3.780393
