# FUT QA Assistant - Model Training

This notebook trains a fine-tuned question-answering model for the Federal University of Technology (FUT) QA Assistant system.

## Steps:
1. Install dependencies
2. Mount Google Drive
3. Load and prepare training data
4. Train the model
5. Save the trained model
6. Test the trained model


In [None]:
# Install required dependencies
!pip install transformers torch datasets accelerate evaluate
!pip install fastapi uvicorn pyngrok


In [None]:
# Attach Google Drive to this Colab runtime
from google.colab import drive

_drive_mount_point = '/content/drive'
drive.mount(_drive_mount_point)

print("Google Drive mounted successfully!")


In [None]:
# Import required libraries
import torch
# Hugging Face Transformers: tokenizer/model loaders, the training-loop classes,
# and the batch collator used by the cells below.
from transformers import (
    AutoTokenizer, AutoModelForQuestionAnswering, 
    TrainingArguments, Trainer, DefaultDataCollator
)
# Hugging Face Datasets: wraps the list of QA examples so it can be split and mapped.
from datasets import Dataset
# Standard library: JSON parsing and filesystem access for the training data / output.
import json
import os
from pathlib import Path
# NOTE(review): no cell visible in this notebook calls pandas, scikit-learn's
# train_test_split, numpy or tqdm -- confirm before removing them.
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm import tqdm

print("Libraries imported successfully!")


In [None]:
# Configuration: everything a reader might want to tune lives in this cell.

# Pretrained checkpoint to start from and where the fine-tuned result is written
MODEL_NAME = "distilbert/distilbert-base-cased-distilled-squad"
OUTPUT_DIR = "/content/drive/MyDrive/fut_qa_model"

# Tokenization limits
MAX_LENGTH = 512
STRIDE = 128

# Optimisation hyperparameters
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3

# Echo the active settings, one per line
print(
    f"Model: {MODEL_NAME}",
    f"Output directory: {OUTPUT_DIR}",
    f"Max length: {MAX_LENGTH}",
    f"Batch size: {BATCH_SIZE}",
    f"Learning rate: {LEARNING_RATE}",
    f"Epochs: {NUM_EPOCHS}",
    sep="\n",
)


In [None]:
# Fetch the tokenizer that belongs to the configured checkpoint
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)
print("Tokenizer loaded successfully!")


In [None]:
# Sample training data - Replace this with your actual FUT data
def _make_qa_example(context, question, answer_text):
    """Build one SQuAD-style example with ``answer_start`` derived from the context.

    Args:
        context: Passage that contains the answer.
        question: Question asked about the passage.
        answer_text: Answer exactly as it appears in ``context``.

    Returns:
        dict with the keys ``context``, ``question`` and ``answers``; ``answers``
        holds the parallel lists ``text`` and ``answer_start``.

    Raises:
        ValueError: if ``answer_text`` is not a verbatim substring of ``context``.
    """
    return {
        "context": context,
        "question": question,
        "answers": {
            "text": [answer_text],
            # Computing the offset (instead of hand-typing it) guarantees that
            # context[start:start + len(answer_text)] == answer_text.
            "answer_start": [context.index(answer_text)],
        },
    }


sample_data = [
    _make_qa_example(
        context="Federal University of Technology (FUT) is a Nigerian university focused on technology and engineering education. The university offers various programs in engineering, technology, and applied sciences.",
        question="What is FUT known for?",
        answer_text="technology and engineering education",
    ),
    _make_qa_example(
        context="The Computer Science department at FUT offers programs in software engineering, artificial intelligence, and data science. Students learn programming languages like Python, Java, and C++.",
        question="What programming languages are taught in Computer Science?",
        answer_text="Python, Java, and C++",
    ),
    _make_qa_example(
        context="FUT has multiple campuses and offers both undergraduate and postgraduate programs. The university is known for its practical approach to education and industry partnerships.",
        question="What types of programs does FUT offer?",
        answer_text="undergraduate and postgraduate programs",
    ),
]

print(f"Sample data created with {len(sample_data)} examples")
print("\nSample example:")
print(f"Context: {sample_data[0]['context']}")
print(f"Question: {sample_data[0]['question']}")
print(f"Answer: {sample_data[0]['answers']['text'][0]}")


In [None]:
# Function to load your actual training data
def load_training_data(possible_paths=None, fallback=None):
    """
    Load training data from Google Drive, or fall back to sample data.

    Args:
        possible_paths: Optional iterable of JSON file paths to try, in order.
            Defaults to the three known file names in the root of Google Drive.
        fallback: Optional list of examples returned when none of the paths
            exists. Defaults to the notebook's ``sample_data``.

    Returns:
        list of dicts, each with the keys ``context``, ``question`` and ``answers``.

    Raises:
        ValueError: if the first file found does not contain a non-empty JSON list
            of objects carrying the keys ``context``, ``question`` and ``answers``.
    """
    if possible_paths is None:
        # Check multiple possible locations for your training data
        possible_paths = [
            "/content/drive/MyDrive/fut_training_data_for_colab.json",
            "/content/drive/MyDrive/fut_qa_training_data.json",
            "/content/drive/MyDrive/fut_comprehensive_training_data.json"
        ]

    required_keys = {"context", "question", "answers"}

    for data_path in possible_paths:
        if not os.path.isfile(data_path):
            continue

        print(f"Loading training data from {data_path}")
        with open(data_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Fail here with a clear message instead of much later, deep inside
        # dataset construction or tokenization.
        if not isinstance(data, list) or not data:
            raise ValueError(
                f"{data_path} must contain a non-empty JSON list of examples"
            )
        for position, example in enumerate(data):
            if not isinstance(example, dict) or not required_keys <= example.keys():
                raise ValueError(
                    f"{data_path}: example {position} must be an object with the keys "
                    f"{sorted(required_keys)}"
                )

        print(f"✅ Loaded {len(data)} training examples")
        return data

    print("No training data found in Google Drive, using sample data")
    print("📁 To use your own data:")
    print("1. Upload 'fut_training_data_for_colab.json' to your Google Drive")
    print("2. Make sure it's in the root of your Google Drive")
    print("3. Re-run this cell")
    return sample_data if fallback is None else fallback

# Resolve the examples used for fine-tuning and report how many there are
training_data = load_training_data()
_num_examples = len(training_data)
print(f"Loaded {_num_examples} training examples")


In [None]:
# Data preprocessing functions
def preprocess_function(examples):
    """
    Tokenize a batch of QA examples and compute answer token positions.

    Args:
        examples: Batch dict with parallel lists under ``question``, ``context``
            and ``answers``. Each ``answers`` entry is a dict with the lists
            ``text`` and ``answer_start`` (character offset into the context).

    Returns:
        The tokenizer output with ``offset_mapping`` removed and two added lists,
        ``start_positions`` and ``end_positions`` (token indices, one per example).
        Examples whose answer is missing, or is not fully contained in the
        (possibly truncated) context, are labelled ``(0, 0)``.

    Notes:
        Contexts longer than ``MAX_LENGTH`` are truncated; no overflow windows are
        produced here, so the ``STRIDE`` setting is not used by this function.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        # Only ever cut the context (second sequence), never the question.
        truncation="only_second",
        # Fixed-length padding: every feature has the same length, regardless of
        # which map() batch it was produced in, so any mix of rows can be batched.
        padding="max_length",
        return_offsets_mapping=True,
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Token indices that belong to the context (sequence id 1)
        context_tokens = [
            idx for idx, seq_id in enumerate(inputs.sequence_ids(i)) if seq_id == 1
        ]

        # No answer given, or no context token survived: nothing to point at.
        if not answer["text"] or not answer["answer_start"] or not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        context_start = context_tokens[0]
        context_end = context_tokens[-1]

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered; an overlap-only check would let a partially
        # truncated answer through and place its end label past the context.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # last context token that starts at or before the answer start ...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ... and first context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

print("Data preprocessing functions defined!")


In [None]:
# Prepare datasets
print("Preparing datasets...")

# Convert to Hugging Face Dataset format
dataset = Dataset.from_list(training_data)

# Split into train and validation sets (80/20, fixed seed for reproducibility).
# The result gets its own name so it does not rebind `train_test_split`, which
# this notebook imports from scikit-learn.
dataset_splits = dataset.train_test_split(test_size=0.2, seed=42)


def _tokenize_split(split):
    """Apply `preprocess_function` to one split and drop its raw text columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )


# Apply preprocessing
train_dataset = _tokenize_split(dataset_splits["train"])
eval_dataset = _tokenize_split(dataset_splits["test"])

print(f"Training dataset size: {len(train_dataset)}")
print(f"Evaluation dataset size: {len(eval_dataset)}")
print("Datasets prepared successfully!")


In [None]:
# Collator instance handed to the Trainer further down
data_collator = DefaultDataCollator()

# Load the pretrained question-answering model from the configured checkpoint
print("Loading model...")
model = AutoModelForQuestionAnswering.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)
print("Model loaded successfully!")


In [None]:
# Training arguments
import inspect  # only needed for the version check below

# Newer transformers releases name this argument `eval_strategy` and no longer
# accept `evaluation_strategy`; older releases only know the old name. Because the
# install cell does not pin a version, pick whichever the installed release accepts.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; the two schedules have to match
    # for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Lower validation loss is better.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    **{_eval_strategy_kwarg: "epoch"},
)

print("Training arguments configured!")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Epochs: {NUM_EPOCHS}")


In [None]:
# Initialize trainer
import inspect  # only needed for the version check below

# Newer transformers releases take the tokenizer as `processing_class`; older ones
# only accept `tokenizer`. Use whichever keyword the installed Trainer declares.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{_tokenizer_kwarg: tokenizer},
)

print("Trainer initialized successfully!")


In [None]:
# Run the fine-tuning loop
for _notice in (
    "Starting training...",
    "This may take several minutes depending on your data size and hardware.",
):
    print(_notice)

trainer.train()

print("Training completed successfully!")


In [None]:
# Persist the fine-tuned model
print("Saving the trained model...")

# Make sure the destination folder exists before anything is written to it
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Store the model and the tokenizer in the same folder
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"Model saved successfully to {OUTPUT_DIR}")
print("You can now use this model in your FastAPI backend!")


In [None]:
# Test the trained model
# `pipeline` is not part of the notebook's import cell, so bring it into scope
# here; without this the cell fails with a NameError on a fresh kernel.
from transformers import pipeline

print("Testing the trained model...")

# Load the trained model and tokenizer back from the output directory
trained_model = AutoModelForQuestionAnswering.from_pretrained(OUTPUT_DIR)
trained_tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

# Create a pipeline for testing
qa_pipeline = pipeline("question-answering", model=trained_model, tokenizer=trained_tokenizer)

# Test with a sample question
test_context = "Federal University of Technology (FUT) is a Nigerian university focused on technology and engineering education. The university offers various programs in engineering, technology, and applied sciences."
test_question = "What is FUT known for?"

result = qa_pipeline(question=test_question, context=test_context)

print(f"Question: {test_question}")
print(f"Answer: {result['answer']}")
print(f"Confidence: {result['score']:.4f}")

print("\nModel testing completed!")
