# Optimized Fine-Tuning for GWU Course Search

This notebook implements an **optimized fine-tuning approach** with:
- Train/Validation Split
- Early Stopping
- Better Hyperparameters (Cosine Annealing, Optimal LR)
- Evaluation Metrics
- Better Data Handling
- Improved Inference Setup


### Setup Environment


In [None]:
%%capture
# Install Unsloth and the pinned training stack; %%capture hides the pip output for this cell.
import os, re
# Colab is detected by looking for "COLAB_" anywhere in the environment variable names.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Take the leading numeric part of the installed torch version (e.g. "2.8.0")
    # and use it to choose which xformers release to pin.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # --no-deps: install exactly the listed packages without resolving their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The following installs run in both branches and pin transformers / trl to fixed versions.
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
# NOTE(review): `evaluate` is installed unpinned and is not imported by any cell visible here — confirm it is needed.
!pip install evaluate


### Load Model with Optimized Settings


In [None]:
from unsloth import FastLanguageModel
import torch

# Loader settings. `max_seq_length` is reused later when the trainer is built.
max_seq_length = 2048  # Sufficient for course Q&A
dtype = None           # None = auto-detect (bfloat16 on Ampere or newer)
load_in_4bit = True    # 4-bit loading keeps memory usage low

# Fetch the base Llama 3.1 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


### Configure LoRA with Optimized Parameters


In [None]:
# Layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 32, alpha 32, light dropout).
model = FastLanguageModel.get_peft_model(
    model,
    r=32,  # raised from 16 for more adapter capacity
    target_modules=lora_target_modules,
    lora_alpha=32,  # kept equal to r (1:1 ratio)
    lora_dropout=0.05,  # small dropout as regularization
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)


### Load and Prepare Dataset with Train/Validation Split


In [None]:
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template

# Read the JSONL fine-tuning file as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="course_finetune.jsonl",
    split="train",
)

# Attach the Llama 3.1 chat template to the tokenizer.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render a batch of conversations into chat-template strings.

    Args:
        examples: Batch mapping whose "messages" entry is a list of
            conversations (one conversation per example).

    Returns:
        A dict with a single "text" key holding one rendered string per
        conversation, in the same order as the input.
    """
    rendered = []
    for conversation in examples["messages"]:
        # tokenize=False returns text; no generation prompt is appended
        # because each training example already contains the full exchange.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add the rendered "text" column to every example.
dataset = dataset.map(formatting_prompts_func, batched=True)

# Hold out 20% of the examples for validation; the seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=3407)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

# Report split sizes and preview the first 500 characters of one training example.
print(f"Train examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")
print("\nSample training example:")
sample_text = train_dataset[0]["text"]
print(f"{sample_text[:500]}...")


### Configure Optimized Training Parameters


In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import EarlyStoppingCallback

from unsloth import is_bfloat16_supported

# Choose mixed precision from the GPU instead of hard-coding bf16, so the
# notebook also trains on GPUs without bfloat16 support (falls back to fp16).
use_bf16 = is_bfloat16_supported()

# Evaluate and checkpoint on the same cadence. load_best_model_at_end can only
# restore a step that was actually saved, so saving less often than evaluating
# (previously every 200 vs. every 100 steps) could lose the best-scoring step.
eval_save_steps = 100

# Optimized training configuration
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,  # Validation set for early stopping
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False,  # False for better quality with variable-length sequences
    args = SFTConfig(
        # Batch size - optimized for A100
        per_device_train_batch_size = 4,  # Increased from 2
        per_device_eval_batch_size = 4,
        gradient_accumulation_steps = 2,  # Effective train batch size = 4 * 2 = 8 per device

        # Learning rate - optimal for LoRA fine-tuning
        learning_rate = 1e-4,  # Lower, more stable (was 2e-4)
        lr_scheduler_type = "cosine",  # Cosine annealing (better than linear)
        warmup_ratio = 0.1,  # 10% warmup (better than fixed steps)

        # Training duration
        num_train_epochs = 5,  # More epochs, but early stopping will prevent overfitting
        max_steps = -1,  # Use epochs instead

        # Optimization
        optim = "adamw_8bit",
        weight_decay = 0.01,  # Slightly higher for regularization
        adam_beta1 = 0.9,
        adam_beta2 = 0.999,

        # Evaluation, checkpointing and logging
        eval_strategy = "steps",  # Evaluate during training
        eval_steps = eval_save_steps,
        save_strategy = "steps",
        save_steps = eval_save_steps,  # Every evaluated step gets a checkpoint
        logging_steps = 10,  # More frequent logging
        report_to = "none",

        # Output
        output_dir = "outputs_optimized",
        seed = 3407,
        fp16 = not use_bf16,  # fp16 only when the GPU lacks bfloat16
        bf16 = use_bf16,

        # Best-model tracking (required by the early stopping callback)
        load_best_model_at_end = True,
        metric_for_best_model = "eval_loss",  # Lower is better
        greater_is_better = False,
    ),
)

# Add early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience = 3,  # Stop if no improvement for 3 evaluations
    early_stopping_threshold = 0.001,  # Minimum improvement threshold
)
trainer.add_callback(early_stopping)


In [None]:
# Record the device properties and the memory already reserved before training;
# the statistics cell subtracts this baseline from the post-training peak.
gpu_stats = torch.cuda.get_device_properties(0)
bytes_per_gib = 1024 ** 3
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")


### Train Model with Early Stopping


In [None]:
# Run fine-tuning; the returned object's `.metrics` dict is read by the statistics cell.
trainer_stats = trainer.train()


### Training Statistics


In [None]:
# Peak reserved GPU memory after training, and the share attributable to
# training itself (peak minus the baseline recorded before training started).
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# The metrics returned by trainer.train() only cover the training side
# (runtime, train_loss, ...), so looking up 'eval_loss' there always fell back
# to 'N/A'. The best validation loss is tracked on the trainer state instead;
# it stays None if no evaluation ran before training ended.
best_eval_loss = trainer.state.best_metric
if best_eval_loss is None:
    best_eval_loss = "N/A"

print("Training completed!")
print(f"Runtime: {trainer_stats.metrics['train_runtime']:.2f} seconds ({trainer_stats.metrics['train_runtime']/60:.2f} minutes)")
print(f"Peak reserved memory: {used_memory} GB ({used_percentage}%)")
print(f"Training memory: {used_memory_for_lora} GB ({lora_percentage}%)")
print(f"\nFinal training loss: {trainer_stats.metrics.get('train_loss', 'N/A')}")
print(f"Best validation loss: {best_eval_loss}")


### Test Inference with Optimized Settings


In [None]:
from transformers import TextStreamer

# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

# Test queries
test_queries = [
    "Who Teaches Machine Learning?",
    "What courses are available on Tuesdays?",
    "Tell me about CSCI 1012.",
    "What is the schedule for CSCI 4244?",
]

# The system turn is identical for every query, so build it once.
system_message = {
    "role": "system",
    "content": "You are a helpful assistant providing information about GWU Computer Science courses for Spring 2026.",
}

# Sampling settings shared by every generate() call.
generation_kwargs = dict(
    max_new_tokens = 256,
    temperature = 0.1,  # Low temperature for factual accuracy
    do_sample = True,
    pad_token_id = tokenizer.eos_token_id,
    eos_token_id = tokenizer.eos_token_id,
    repetition_penalty = 1.2,  # Prevent repetition
    top_p = 0.9,  # Nucleus sampling
)

for query in test_queries:
    print("\n" + "=" * 60)
    print(f"Query: {query}")
    print("=" * 60)

    messages = [system_message, {"role": "user", "content": query}]

    # Tokenize the conversation and append the assistant header so the
    # model continues with its answer.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True,
        return_tensors = "pt",
    ).to("cuda")

    # Single unpadded prompt: every position is a real token.
    attention_mask = torch.ones_like(inputs)

    # Stream tokens to stdout as they are produced, without echoing the prompt.
    text_streamer = TextStreamer(tokenizer, skip_prompt = True)

    outputs = model.generate(
        input_ids = inputs,
        attention_mask = attention_mask,
        streamer = text_streamer,
        **generation_kwargs,
    )

    # Clean output: drop the prompt tokens, then decode only the generated part.
    prompt_length = inputs.shape[1]
    output_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    print(f"\nClean Answer: {output_text.strip()}")
    print()


### Save Model


In [None]:
# Write the LoRA adapters first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("lora_model_optimized")
print("LoRA adapters saved to 'lora_model_optimized'.")


### Export Merged Model (Optional)


In [None]:
# Write a merged 16-bit copy of the model (plus tokenizer) for easier deployment.
save_method = "merged_16bit"
print(f"Saving {save_method} locally...")
model.save_pretrained_merged(
    "merged_model_optimized",
    tokenizer,
    save_method=save_method,
)
print("Merged model saved to 'merged_model_optimized'.")


### Upload to Hugging Face Hub (Optional)

If you want to share your model or use it in other environments, you can upload it to Hugging Face Hub.


In [None]:
# Optional: Upload to Hugging Face Hub
push_to_hub = False  # Set to True to enable upload
hf_repo_name = "itsmepraks/gwcoursesfinetuned-optimized"  # Change to your username/repo
hf_token = None  # Resolved below from Colab Secrets or the HF_TOKEN environment variable

if push_to_hub:
    from huggingface_hub import HfApi
    import os

    # Fail before touching the Hub if the merged export has not been produced,
    # so no empty repository is created for a folder that does not exist.
    merged_model_dir = "merged_model_optimized"
    if not os.path.isdir(merged_model_dir):
        raise FileNotFoundError(
            f"'{merged_model_dir}' not found. Run the 'Export Merged Model' cell before uploading."
        )

    # Prefer Colab Secrets; fall back to the environment when not running in
    # Colab or when the secret is unavailable. `except Exception` (rather than
    # a bare except) lets KeyboardInterrupt / SystemExit propagate.
    try:
        from google.colab import userdata
        hf_token = userdata.get("HF_TOKEN")
        print("Loaded HF_TOKEN from Colab Secrets.")
    except Exception:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError("HF_TOKEN not found! Please add 'HF_TOKEN' to Colab Secrets or set as environment variable.")

    api = HfApi(token=hf_token)

    # Create repository if it doesn't exist
    print(f"Ensuring repository {hf_repo_name} exists...")
    api.create_repo(repo_id=hf_repo_name, repo_type="model", exist_ok=True)

    # Upload merged model
    print(f"Uploading merged model to {hf_repo_name}...")
    api.upload_folder(
        folder_path=merged_model_dir,
        repo_id=hf_repo_name,
        repo_type="model",
    )

    print(f"✅ Model uploaded successfully to https://huggingface.co/{hf_repo_name}")
else:
    print("Skipping Hugging Face upload. Set push_to_hub=True to enable.")
