<a href="https://colab.research.google.com/github/jeffreyong15/Counsel.NLP/blob/main/Model/Fine_Tune_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install library

In [3]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

## Import library


In [4]:
import json
import torch
import pandas as pd
from datasets import Dataset
from langchain.schema import Document
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, AutoTokenizer
import os

In [5]:
os.environ['HF_TOKEN'] = "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"

## Fine Tune Model

In [4]:
def load_json_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    if not json_data:
        raise ValueError("JSON data is empty")

    print(f"Successfully loaded {len(json_data)} courses")
    return json_data

In [6]:
df = pd.read_csv("QA_Dataset.csv", index_col=0)
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['Context', 'Question', 'Answer', '__index_level_0__'],
        num_rows: 21388
    })
    test: Dataset({
        features: ['Context', 'Question', 'Answer', '__index_level_0__'],
        num_rows: 5347
    })
})

In [7]:
T5_model_fine = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to EOS for T5

def preprocess(row):
    context = row['Context']
    question = row['Question']
    answer = row['Answer']

    input = f"Answer this question based on the context.\nContext: {context}\nQuestion: {question}"
    output = answer
    return {"input_text": input, "output_text": output}

format_dataset = dataset.map(preprocess, remove_columns = dataset["train"].column_names)
format_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Map:   0%|          | 0/21388 [00:00<?, ? examples/s]

Map:   0%|          | 0/5347 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 21388
    })
    test: Dataset({
        features: ['input_text', 'output_text'],
        num_rows: 5347
    })
})

In [8]:
def tokenize_function(example):
    model_inputs = tokenizer(example["input_text"], truncation=True, padding="max_length", max_length=512)
    labels = tokenizer(example["output_text"], truncation=True, padding="max_length", max_length=512)

    model_inputs["labels"] = labels["input_ids"]  # Supervised learning expects input-output pairs
    return model_inputs

# Tokenize dataset
tokenized_dataset = format_dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/21388 [00:00<?, ? examples/s]

Map:   0%|          | 0/5347 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_text', 'output_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 21388
    })
    test: Dataset({
        features: ['input_text', 'output_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5347
    })
})

In [9]:
data = tokenized_dataset.remove_columns(["input_text", "output_text"])

In [10]:
train_data = data['train']
eval_data = data['test']

In [4]:
def process_data(json_data, tokenizer):
    processed_data = []

    for item in json_data:
        # Construct the input text
        content = [
            f"Title: {item.get('title', 'N/A')}",
            f"Units: {item.get('units', 'N/A')}",
            f"Description: {item.get('description', 'N/A')}",
            f"Grading: {item.get('grading', 'N/A')}",
            f"Class Structure: {item.get('class_structure', 'Class structure not found')}"
        ]

        # Add prerequisites, corequisites, and notes
        if item.get('prerequisite(s)'):
            content.append("Prerequisite(s): " + ", ".join(item['prerequisite(s)']))

        if item.get('corequisite(s)'):
            content.append("Corequisite(s): " + ", ".join(item['corequisite(s)']))

        if item.get('pre/corequisite(s)'):
            content.append("Pre/Corequisite(s): " + ", ".join(item['pre/corequisite(s)']))

        if item.get('notes'):
            content.append("Note(s): " + ", ".join(item['notes']))

        # Handle core courses (for majors)
        if 'core_courses' in item:
            content.append("\nCore Courses:")
            for course in item.get('core_courses', []):
                content.append(f"- {course['course']}: {course['title']} ({course['units']} units)")

        # Handle specialization tracks (for majors)
        if 'specialization_tracks' in item:
            content.append("\nSpecialization Tracks:")
            for specialization, details in item['specialization_tracks'].items():
                content.append(f"\n- {specialization}:")
                if isinstance(details, list):  # MSAI-style specialization (direct list of courses)
                    for course in details:
                        content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")
                elif isinstance(details, dict):  # MSSE-style specialization (nested dictionary)
                    if 'overview' in details:
                        content.append(f"  Overview: {details['overview']}")
                    if 'required_core_courses' in details:
                        content.append("\n  Required Core Courses:")
                        for course in details['required_core_courses']:
                            content.append(f"    - {course['course']}: {course['title']} ({course['units']} units)")
                    if 'specialization_choice_courses' in details:
                        content.append("\n  Specialization Choice Courses:")
                        for course in details['specialization_choice_courses']:
                            content.append(f"    - {course['course']}: {course['title']} ({course['units']} units)")

        # Handle elective courses (for majors)
        if 'elective_courses' in item:
            content.append("\nElective Courses:")
            if 'overview' in item['elective_courses']:
                content.append(f"  Overview: {item['elective_courses']['overview']}")
                if 'restricted_courses' in item['elective_courses']:
                    content.append("\n  Restricted Courses (cannot be taken as electives):")
                    for course in item['elective_courses']['restricted_courses']:
                        if isinstance(course, dict):
                            content.append(f"    - {course['course']}: {course['title']} ({course['units']} units)")
                        elif isinstance(course, str):
                            content.append(f"    - {course}")
            else:
                for area, courses in item['elective_courses'].items():
                    content.append(f"\n- {area}:")
                    for course in courses:
                        if isinstance(course, dict):
                            content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")
                        elif isinstance(course, str):
                            content.append(f"  - {course}")

        # Handle graduate writing requirement (for majors)
        if 'graduate_writing_requirement' in item:
            content.append("\nGraduate Writing Requirement:")
            gww = item['graduate_writing_requirement']
            if 'courses' in gww:
                for course in gww['courses']:
                    content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")
                    if 'description' in course:
                        content.append(f"    Description: {course['description']}")
            elif 'course' in gww:
                content.append(f"  - {gww['course']}: {gww['title']} ({gww['units']} units)")

        # Handle culminating experience (for majors)
        if 'culminating_experience' in item:
            content.append("\nCulminating Experience Options:")
            for option, courses in item['culminating_experience'].items():
                content.append(f"\n- {option}:")
                for course in courses:
                    if isinstance(course, dict):
                        content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")

        input_text = "\n".join(content)

        # Define the target text (e.g., a summary of the program or course)
        if 'core_courses' in item:  # Major
            target_text = (
                f"The {item.get('title', 'N/A')} program requires {item.get('units', 'N/A')} units, "
                f"including core courses, specialization tracks, elective courses, a graduate writing requirement, "
                f"and a culminating experience."
            )
        else:  # Course
            target_text = (
                f"{item.get('title', 'N/A')} is a {item.get('units', 'N/A')} course. "
                f"Description: {item.get('description', 'N/A')}"
            )

        # Tokenize the input and target text
        inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
        targets = tokenizer(target_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

        # Add to the processed dataset
        processed_data.append({
            "input_ids": inputs["input_ids"].squeeze().tolist(),  # Convert tensor to list
            "attention_mask": inputs["attention_mask"].squeeze().tolist(),  # Convert tensor to list
            "labels": targets["input_ids"].squeeze().tolist()  # Convert tensor to list
        })

    return processed_data

In [5]:
# Load the tokenizer for Flan-T5-Base
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Load the model
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Load and process the dataset
json_data = load_json_data("SJSU_courses_majors_dataset.json")
processed_data = process_data(json_data, tokenizer)

Successfully loaded 5350 courses


In [7]:
# Convert processed data to a Hugging Face Dataset
dataset = Dataset.from_dict({
    "input_ids": [item["input_ids"] for item in processed_data],
    "attention_mask": [item["attention_mask"] for item in processed_data],
    "labels": [item["labels"] for item in processed_data]
})

dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5350
})

In [8]:
print(dataset['labels'][0])

[480, 3162, 209, 3, 18, 3, 14808, 15, 26, 15576, 23919, 19, 3, 9, 209, 1745, 599, 7, 61, 503, 5, 7726, 10, 21627, 26, 3, 30187, 1722, 1087, 12, 3391, 1722, 87, 18271, 4639, 11, 1344, 46, 1676, 6, 533, 18, 9442, 4026, 21, 481, 3, 6319, 12, 3716, 16, 8, 879, 1756, 478, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [9]:
# Split the dataset
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Convert to PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [12]:
# Clear GPU memory
torch.cuda.empty_cache()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./flan-t5-finetuned",
    eval_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=2,   # Reduce batch size
    gradient_accumulation_steps=1,  # Gradient accumulation
    fp16=False,  # Enable mixed precision
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    run_name="flan-t5-base-finetuned-run",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=T5_model_fine)

# Initialize the Trainer
trainer = Trainer(
    model=T5_model_fine,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    data_collator=data_collator,
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0018,0.000819
2,0.001,0.000209
3,0.0003,0.000115
4,0.0002,7.3e-05
5,0.0,7.9e-05


TrainOutput(global_step=26735, training_loss=0.0008385262878363308, metrics={'train_runtime': 8886.3341, 'train_samples_per_second': 12.034, 'train_steps_per_second': 3.009, 'total_flos': 7.322795127078912e+16, 'train_loss': 0.0008385262878363308, 'epoch': 5.0})

In [13]:
T5_model_fine.save_pretrained("./flan-t5-finetuned")
tokenizer.save_pretrained("./flan-t5-finetuned")

('./flan-t5-finetuned/tokenizer_config.json',
 './flan-t5-finetuned/special_tokens_map.json',
 './flan-t5-finetuned/spiece.model',
 './flan-t5-finetuned/added_tokens.json',
 './flan-t5-finetuned/tokenizer.json')

In [20]:
# Load the fine-tuned model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("./flan-t5-finetuned")
tokenizer = AutoTokenizer.from_pretrained("./flan-t5-finetuned")

# Example input
context = df['Context'][0]
input_text = f"Answer this question based on the context.\nContext: {context}\nQuestion: Are there any prerequisites in KIN 1?"
# Tokenize input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate output
outputs = model.generate(input_ids, max_new_tokens=200)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Output:", decoded_output)

Generated Output: No prerequisites listed
