<a href="https://colab.research.google.com/github/jeffreyong15/Counsel.NLP/blob/main/Model/Fine_Tune_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install library

In [1]:
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

## Import library


In [2]:
import json
import torch
from datasets import Dataset
from langchain.schema import Document
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq

## Fine Tune Model

In [3]:
def load_json_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    if not json_data:
        raise ValueError("JSON data is empty")

    print(f"Successfully loaded {len(json_data)} courses")
    return json_data

In [27]:
def process_data(json_data, tokenizer):
    processed_data = []

    for item in json_data:
        # Construct the input text
        content = [
            f"Title: {item.get('title', 'N/A')}",
            f"Units: {item.get('units', 'N/A')}",
            f"Description: {item.get('description', 'N/A')}",
            f"Grading: {item.get('grading', 'N/A')}",
            f"Class Structure: {item.get('class_structure', 'Class structure not found')}"
        ]

        # Add prerequisites, corequisites, and notes
        if item.get('prerequisite(s)'):
            content.append("Prerequisite(s): " + ", ".join(item['prerequisite(s)']))

        if item.get('corequisite(s)'):
            content.append("Corequisite(s): " + ", ".join(item['corequisite(s)']))

        if item.get('pre/corequisite(s)'):
            content.append("Pre/Corequisite(s): " + ", ".join(item['pre/corequisite(s)']))

        if item.get('notes'):
            content.append("Note(s): " + ", ".join(item['notes']))

        # Handle core courses (for majors)
        if 'core_courses' in item:
            content.append("\nCore Courses:")
            for course in item.get('core_courses', []):
                content.append(f"- {course['course']}: {course['title']} ({course['units']} units)")

        # Handle specialization tracks (for majors)
        if 'specialization_tracks' in item:
            content.append("\nSpecialization Tracks:")
            for specialization, details in item['specialization_tracks'].items():
                content.append(f"\n- {specialization}:")
                if isinstance(details, list):  # MSAI-style specialization (direct list of courses)
                    for course in details:
                        content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")
                elif isinstance(details, dict):  # MSSE-style specialization (nested dictionary)
                    if 'overview' in details:
                        content.append(f"  Overview: {details['overview']}")
                    if 'required_core_courses' in details:
                        content.append("\n  Required Core Courses:")
                        for course in details['required_core_courses']:
                            content.append(f"    - {course['course']}: {course['title']} ({course['units']} units)")
                    if 'specialization_choice_courses' in details:
                        content.append("\n  Specialization Choice Courses:")
                        for course in details['specialization_choice_courses']:
                            content.append(f"    - {course['course']}: {course['title']} ({course['units']} units)")

        # Handle elective courses (for majors)
        if 'elective_courses' in item:
            content.append("\nElective Courses:")
            if 'overview' in item['elective_courses']:
                content.append(f"  Overview: {item['elective_courses']['overview']}")
                if 'restricted_courses' in item['elective_courses']:
                    content.append("\n  Restricted Courses (cannot be taken as electives):")
                    for course in item['elective_courses']['restricted_courses']:
                        if isinstance(course, dict):
                            content.append(f"    - {course['course']}: {course['title']} ({course['units']} units)")
                        elif isinstance(course, str):
                            content.append(f"    - {course}")
            else:
                for area, courses in item['elective_courses'].items():
                    content.append(f"\n- {area}:")
                    for course in courses:
                        if isinstance(course, dict):
                            content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")
                        elif isinstance(course, str):
                            content.append(f"  - {course}")

        # Handle graduate writing requirement (for majors)
        if 'graduate_writing_requirement' in item:
            content.append("\nGraduate Writing Requirement:")
            gww = item['graduate_writing_requirement']
            if 'courses' in gww:
                for course in gww['courses']:
                    content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")
                    if 'description' in course:
                        content.append(f"    Description: {course['description']}")
            elif 'course' in gww:
                content.append(f"  - {gww['course']}: {gww['title']} ({gww['units']} units)")

        # Handle culminating experience (for majors)
        if 'culminating_experience' in item:
            content.append("\nCulminating Experience Options:")
            for option, courses in item['culminating_experience'].items():
                content.append(f"\n- {option}:")
                for course in courses:
                    if isinstance(course, dict):
                        content.append(f"  - {course['course']}: {course['title']} ({course['units']} units)")

        input_text = "\n".join(content)

        # Define the target text (e.g., a summary of the program or course)
        if 'core_courses' in item:  # Major
            target_text = (
                f"The {item.get('title', 'N/A')} program requires {item.get('units', 'N/A')} units, "
                f"including core courses, specialization tracks, elective courses, a graduate writing requirement, "
                f"and a culminating experience."
            )
        else:  # Course
            target_text = (
                f"{item.get('title', 'N/A')} is a {item.get('units', 'N/A')} course. "
                f"Description: {item.get('description', 'N/A')}"
            )

        # Tokenize the input and target text
        inputs = tokenizer(input_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
        targets = tokenizer(target_text, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

        # Add to the processed dataset
        processed_data.append({
            "input_ids": inputs["input_ids"].squeeze().tolist(),  # Convert tensor to list
            "attention_mask": inputs["attention_mask"].squeeze().tolist(),  # Convert tensor to list
            "labels": targets["input_ids"].squeeze().tolist()  # Convert tensor to list
        })

    return processed_data

In [28]:
# Load the tokenizer for Flan-T5-Base
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")

# Load the model
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

In [29]:
# Load and process the dataset
json_data = load_json_data("SJSU_courses_majors_dataset.json")
processed_data = process_data(json_data, tokenizer)

Successfully loaded 5350 courses


In [30]:
# Convert processed data to a Hugging Face Dataset
dataset = Dataset.from_dict({
    "input_ids": [item["input_ids"] for item in processed_data],
    "attention_mask": [item["attention_mask"] for item in processed_data],
    "labels": [item["labels"] for item in processed_data]
})

dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5350
})

In [31]:
# Split the dataset
train_test_split = dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

# Convert to PyTorch tensors
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [33]:
# Clear GPU memory
torch.cuda.empty_cache()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./flan-t5-finetuned",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,  # Reduce batch size
    per_device_eval_batch_size=2,   # Reduce batch size
    gradient_accumulation_steps=1,  # Gradient accumulation
    fp16=False,  # Enable mixed precision
    num_train_epochs=1,
    weight_decay=0.01,
    save_total_limit=2,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    run_name="flan-t5-base-finetuned-run",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0,


TrainOutput(global_step=2408, training_loss=0.0, metrics={'train_runtime': 916.468, 'train_samples_per_second': 5.254, 'train_steps_per_second': 2.627, 'total_flos': 3297106652037120.0, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
model.save_pretrained("./flan-t5-finetuned")
tokenizer.save_pretrained("./flan-t5-finetuned")

In [None]:
# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("./flan-t5-finetuned")
tokenizer = T5Tokenizer.from_pretrained("./flan-t5-finetuned")

# Example input
input_text = "Title: KIN 1 - Adapted Physical Activities\nDescription: Structured individualized physical activities to enhance physical/motor fitness and develop an active, health-oriented lifestyle for students unable to participate in the general activity program."

# Tokenize input
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate output
outputs = model.generate(input_ids)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Output:", decoded_output)