# HenriAI Fine-Tuning on GPT-J 6B

In [None]:
!pip install -q bitsandbytes transformers peft accelerate scipy

In [None]:
import os
import torch
from datetime import datetime
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from google.colab import drive
import json
import time
import gc

In [None]:
# Allow CUDA matmuls and cuDNN kernels to use TensorFloat-32 where the GPU
# supports it.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [None]:
# Mount Google Drive at /content/drive; the dataset files read and the adapter
# checkpoints written by this notebook all live under that mount point.
drive.mount('/content/drive')

## Hyperparameters: values we can adjust when training the model

In [None]:
# Number of samples per DataLoader batch.
BATCH_SIZE = 28
# Number of batches whose gradients are accumulated before each optimizer step.
GRAD_ACCUMULATION_STEPS = 1
# Token length samples are truncated/padded to (same value as QADataset's
# default max_length).
MAX_LENGTH = 512
# LoRA rank (`r` in LoraConfig).
LORA_R = 72
# LoRA scaling factor (`lora_alpha` in LoraConfig).
LORA_ALPHA = 144
# Dropout applied inside the LoRA layers (`lora_dropout` in LoraConfig).
LORA_DROPOUT = 0.1
# Initial AdamW learning rate; a cosine schedule anneals it during training.
LEARNING_RATE = 3e-4
# Number of full passes over the training dataset.
NUM_EPOCHS = 5

In [None]:
def clear_memory():
    """Run the garbage collector and, if CUDA is available, empty its cache."""
    gc.collect()

    # Nothing GPU-side to do on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    cuda = torch.cuda
    cuda.empty_cache()
    # Wait for outstanding CUDA work to finish before returning.
    cuda.synchronize()

In [None]:
class QADataset(Dataset):
    """Tokenized question/answer pairs for causal-LM fine-tuning.

    Each item of ``data`` must provide ``'question'`` and ``'answer'`` keys.
    The two are joined into a single ``"Question: ...\\nAnswer: ..."`` prompt,
    tokenized once in ``__init__`` (truncated/padded to ``max_length``) and
    kept in memory.

    Args:
        data: Iterable of mappings with ``'question'`` and ``'answer'``.
        tokenizer: Callable tokenizer returning a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors that carry a leading batch axis
            of size 1.
        max_length: Length every sample is truncated/padded to.
    """

    # Label value that Hugging Face causal-LM losses skip (the default
    # ``ignore_index`` of PyTorch's cross-entropy).
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.inputs = []
        self.attn_masks = []

        for item in data:
            combined_text = f"Question: {item['question']}\nAnswer: {item['answer']}"

            encodings_dict = tokenizer(
                combined_text,
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors="pt"
            )

            # squeeze(0) drops only the batch axis; a bare squeeze() would
            # also collapse a length-1 sequence into a 0-d tensor.
            self.inputs.append(encodings_dict['input_ids'].squeeze(0))
            self.attn_masks.append(encodings_dict['attention_mask'].squeeze(0))

    def __len__(self):
        """Return the number of tokenized samples."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one sample.

        ``labels`` is a copy of ``input_ids`` in which every padding position
        (``attention_mask == 0``) is replaced by ``IGNORE_INDEX`` so padding
        does not contribute to the loss.
        """
        input_ids = self.inputs[idx]
        attention_mask = self.attn_masks[idx]

        labels = input_ids.clone()
        # Without this mask the model is trained to predict the pad token at
        # every padded position, which dominates the loss for short samples.
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

In [None]:
def load_training_data(file_paths=None):
    """Load and concatenate training samples from JSON files.

    Loading is best-effort: a file that cannot be read, is not valid JSON, or
    does not contain a top-level list is reported with a printed message and
    skipped; the remaining files are still loaded.

    Args:
        file_paths: Optional iterable of paths to JSON files, each holding a
            list of samples. When omitted, the HenriAI dataset files on the
            mounted Google Drive are used.

    Returns:
        A single list with the samples of every file that loaded, in the
        order the files were given.
    """
    if file_paths is None:
        file_paths = [
            '/content/drive/MyDrive/HenriAI/Datasets/JSONFiles/commonsense_data.json',
            '/content/drive/MyDrive/HenriAI/Datasets/JSONFiles/henri_academic_essays.json',
            '/content/drive/MyDrive/HenriAI/Datasets/JSONFiles/henri_instructions.json',
            '/content/drive/MyDrive/HenriAI/Datasets/JSONFiles/henri_natural1.json',
            '/content/drive/MyDrive/HenriAI/Datasets/JSONFiles/henri_natural2.json',
            '/content/drive/MyDrive/HenriAI/Datasets/JSONFiles/henri_natural_Intro_Outro.json',
        ]

    all_data = []
    for path in file_paths:
        try:
            # Explicit encoding so non-ASCII text decodes the same on every host.
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {path}: {e}")
            continue

        # extend() on a dict would silently add its keys as "samples".
        if not isinstance(data, list):
            print(f"Error loading {path}: expected a JSON list of samples, "
                  f"got {type(data).__name__}")
            continue

        all_data.extend(data)
        print(f"Loaded {len(data)} samples from {path}")

    return all_data

In [None]:
def create_qlora_model():
    """Load GPT-J 6B with 4-bit quantization and wrap it with LoRA adapters.

    Returns:
        The PEFT-wrapped model; its trainable-parameter summary is printed
        before it is returned.
    """
    half = torch.float16

    # 4-bit NF4 quantization with double quantization, computing in fp16.
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=half,
        bnb_4bit_use_double_quant=True,
    )

    base_model = AutoModelForCausalLM.from_pretrained(
        "EleutherAI/gpt-j-6B",
        quantization_config=quant_cfg,
        device_map="auto",
        torch_dtype=half,
    )

    # Gradient checkpointing is switched on first, then the quantized model
    # is prepared for k-bit training.
    base_model.gradient_checkpointing_enable()
    base_model = prepare_model_for_kbit_training(base_model)

    # Module names that receive LoRA adapters.
    adapted_modules = ["q_proj", "v_proj", "k_proj", "out_proj", "fc_in", "fc_out"]
    adapter_cfg = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=adapted_modules,
    )

    peft_model = get_peft_model(base_model, adapter_cfg)
    peft_model.print_trainable_parameters()
    return peft_model

In [None]:
def train_model(model, train_dataset, device):
    """Fine-tune ``model`` on ``train_dataset`` and save adapters after each epoch.

    Runs ``NUM_EPOCHS`` epochs with AdamW and a cosine-annealed learning rate,
    accumulating gradients over ``GRAD_ACCUMULATION_STEPS`` batches per
    optimizer step. A batch that raises is reported and skipped, and the
    gradients accumulated so far in that window are discarded. Adapter
    weights are written to ``.../adapters/epoch_{epoch}`` (zero-based) at the
    end of every epoch.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels`` whose output exposes ``.loss``; must also provide
            ``save_pretrained``.
        train_dataset: Dataset yielding dicts of tensors with those keys.
        device: Device every batch tensor is moved to.

    Raises:
        Exception: Any error escaping the epoch loop is re-raised after an
            attempt to save the adapter to ``.../adapters/interrupted_training``.
    """
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=True
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    # The scheduler advances once per optimizer step, not once per batch, so
    # the cosine period must be counted in optimizer steps; otherwise the
    # learning rate never completes its decay when accumulation is > 1.
    steps_per_epoch = (len(train_loader) + GRAD_ACCUMULATION_STEPS - 1) // GRAD_ACCUMULATION_STEPS
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=max(1, steps_per_epoch * NUM_EPOCHS)
    )

    print(f"Starting training with {len(train_dataset)} samples")
    start_time = time.time()

    try:
        for epoch in range(NUM_EPOCHS):
            model.train()
            total_loss = 0.0
            completed_batches = 0
            # Backward passes accumulated since the last optimizer step.
            pending = 0

            for batch_idx, batch in enumerate(train_loader):
                try:
                    # Move batch to device
                    batch = {k: v.to(device) for k, v in batch.items()}

                    # Forward pass
                    outputs = model(
                        input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels']
                    )

                    # Keep the unscaled value for reporting; only the
                    # gradient is scaled for accumulation.
                    batch_loss = outputs.loss.item()
                    (outputs.loss / GRAD_ACCUMULATION_STEPS).backward()
                    pending += 1

                    total_loss += batch_loss
                    completed_batches += 1

                    # Update weights if gradient accumulation is complete
                    if (batch_idx + 1) % GRAD_ACCUMULATION_STEPS == 0:
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()
                        pending = 0
                        clear_memory()

                    if batch_idx % 10 == 0:
                        elapsed = time.time() - start_time
                        print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Batch {batch_idx}/{len(train_loader)}, "
                              f"Loss: {batch_loss:.4f}, Time: {elapsed:.2f}s")

                except Exception as e:
                    print(f"Error processing batch {batch_idx}: {str(e)}")
                    # A failure part-way through backward can leave partial
                    # gradients behind; drop the window rather than apply them.
                    optimizer.zero_grad()
                    pending = 0
                    continue

            # Apply gradients from a trailing, incomplete accumulation window
            # so they are neither lost nor carried into the next epoch.
            if pending:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                clear_memory()

            # Save adapter weights at end of epoch
            adapter_path = f"/content/drive/MyDrive/HenriAI/Models/Version 1/adapters/epoch_{epoch}"
            os.makedirs(adapter_path, exist_ok=True)
            model.save_pretrained(adapter_path)

            # Average over batches that actually contributed a loss; skipped
            # batches would otherwise drag the reported average down.
            if completed_batches:
                avg_loss = total_loss / completed_batches
            else:
                avg_loss = float("nan")
                print(f"Warning: no batch completed successfully in epoch {epoch+1}")
            print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Average Loss: {avg_loss:.4f}")

            # Clear memory at end of epoch
            clear_memory()

    except Exception as e:
        print(f"Training interrupted with error: {str(e)}")
        # Save adapter weights on error; a failure here is reported but must
        # not replace the original exception.
        try:
            model.save_pretrained("/content/drive/MyDrive/HenriAI/Models/Version 1/adapters/interrupted_training")
        except Exception as save_error:
            print(f"Could not save interrupted adapter weights: {save_error}")
        raise

In [None]:
def main():
    """Run the full fine-tuning pipeline: load data, build the model, train.

    The training data is loaded before the model so that a missing or empty
    dataset fails immediately instead of after the 6B-parameter model has
    been downloaded and quantized.

    Raises:
        RuntimeError: If no training samples could be loaded.
    """
    os.makedirs("/content/drive/MyDrive/HenriAI/Models/Version 1/adapters", exist_ok=True)

    print(f"Starting training at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load data first: it is cheap, and training cannot proceed without it.
    print("Loading training data...")
    data = load_training_data()
    if not data:
        raise RuntimeError(
            "No training samples were loaded; check the dataset paths on Google Drive."
        )

    # Clear GPU memory before loading model
    clear_memory()

    print("Loading tokenizer and model...")
    tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-j-6B')
    # Padding to max_length needs a pad token; the EOS token is reused for it.
    tokenizer.pad_token = tokenizer.eos_token

    # Create QLoRA model
    model = create_qlora_model()

    # Create dataset
    print("Creating dataset...")
    # Pass MAX_LENGTH explicitly so changing the hyperparameter takes effect.
    dataset = QADataset(data, tokenizer, max_length=MAX_LENGTH)
    print(f"Dataset size: {len(dataset)} samples")

    # Train model
    print("Starting training...")
    train_model(model, dataset, device)

    print(f"Training completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}!")

if __name__ == "__main__":
    # Print the failure reason, then let the exception propagate so the
    # full traceback is still shown.
    try:
        main()
    except Exception as err:
        print(f"Training failed with error: {err}")
        raise