# Jais Fine-Tuning for Hassaniya Dialect

This notebook fine-tunes the Jais Arabic LLM on Hassaniya dialect data using QLoRA.

**Requirements:**
- GPU with 10GB+ VRAM (A100, RTX 4090, A10)
- Hugging Face account with Jais model access

**Estimated Time:** 2-4 hours for Jais-13b

## 1. Setup Environment

In [None]:
# Install dependencies
# Core training stack: PyTorch, Transformers, Accelerate and Datasets (-q keeps pip output short).
!pip install -q torch transformers accelerate datasets
# peft provides the LoRA adapter classes imported later; bitsandbytes backs the 4-bit quantization config.
!pip install -q peft bitsandbytes
# Extra runtime packages; presumably required by the tokenizer / remote model code — confirm before removing any.
!pip install -q sentencepiece protobuf scipy einops

In [None]:
# Report whether CUDA is usable and, when it is, describe device 0.
import torch

has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    device_name = torch.cuda.get_device_name(0)
    print(f"GPU: {device_name}")
    # total_memory is reported in bytes; divide by 1e9 to show gigabytes.
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"VRAM: {total_bytes / 1e9:.1f} GB")

## 2. Clone Repository & Load Data

In [None]:
# Clone the repository
# NOTE(review): this cell is not idempotent — re-running it after a successful clone
# will hit the already-existing directory; run it once per fresh session.
!git clone https://github.com/lemneya/hassania-qwen-finetune.git
# Make the clone the working directory so the relative hdrp/... paths used in later cells resolve.
%cd hassania-qwen-finetune

In [None]:
# Check training data
# Count the lines of the JSONL training file (JSONL convention: one record per line).
!wc -l hdrp/data/processed/exports/jais/jais_train.jsonl
# Print the first line to eyeball the record format before loading it.
!head -1 hdrp/data/processed/exports/jais/jais_train.jsonl

## 3. Hugging Face Login

You need to:
1. Create a Hugging Face account at https://huggingface.co
2. Accept the Jais model license at https://huggingface.co/inceptionai/jais-13b-chat
3. Create an access token at https://huggingface.co/settings/tokens

In [None]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub before the model is downloaded.
# Option 1: Login with token (do not commit a real token in a shared notebook)
# login(token="your_token_here")

# Option 2: Interactive login
login()

## 4. Load Model with QLoRA

In [None]:
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset

# Configuration
# Hub id of the base model that is loaded and fine-tuned.
MODEL_ID = "inceptionai/jais-13b-chat"  # or "core42/jais-30b-chat-v3"
# Trainer output directory; the final model is saved under OUTPUT_DIR/final.
OUTPUT_DIR = "models/jais-hassaniya"
# Number of training epochs passed to TrainingArguments.
EPOCHS = 3
# Per-device batch size for both training and evaluation. Training also uses
# gradient_accumulation_steps=4, so each optimizer step sees 4x this many examples per device.
BATCH_SIZE = 2
# Learning rate handed to the Trainer (cosine schedule with warmup is configured there).
LEARNING_RATE = 2e-5
# Maximum sequence length in tokens; longer examples are truncated during tokenization.
MAX_LENGTH = 1024

In [None]:
# QLoRA configuration
# Weights are stored as 4-bit NF4 with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

print("Loading tokenizer...")
# trust_remote_code=True allows the custom code shipped in the model repository to run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Batching needs a pad token; fall back to the EOS token when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading model with 4-bit quantization...")
# device_map="auto" leaves device placement of the quantized weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

print("Model loaded successfully!")

In [None]:
# Prepare for training
# Ready the quantized base model for adapter training before LoRA layers are attached.
model = prepare_model_for_kbit_training(model)

# LoRA configuration
# Projection-layer names differ between architectures. Jais-13b's remote modeling
# code uses GPT-2 style names (c_attn / c_proj / c_fc / c_fc2), while LLaMA-style
# models use q_proj / k_proj / ... Hard-coding one family makes get_peft_model fail
# with "target modules not found" on the other, so pick whichever names actually
# exist as leaf modules in the loaded model.
lora_candidates = (
    "c_attn", "c_proj", "c_fc", "c_fc2",
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
)
leaf_module_names = {
    module_path.rsplit(".", 1)[-1]
    for module_path, module in model.named_modules()
    if next(module.children(), None) is None  # leaf layers only, skip containers
}
target_modules = [name for name in lora_candidates if name in leaf_module_names]
if not target_modules:
    raise ValueError(
        "None of the known projection layer names were found in the model; "
        "inspect model.named_modules() and set target_modules manually."
    )
print(f"LoRA target modules: {target_modules}")

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Wrap the base model with trainable LoRA adapters and report how many parameters train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 5. Prepare Dataset

In [None]:
# Read the Jais-formatted JSONL export as a single 'train' split.
train_file = 'hdrp/data/processed/exports/jais/jais_train.jsonl'
dataset = load_dataset('json', data_files=train_file, split='train')

# Quick sanity check: dataset size and the first 200 characters of the first record.
num_examples = len(dataset)
print(f"Loaded {num_examples} training examples")
first_text = dataset[0]['text']
print(f"Sample: {first_text[:200]}...")

In [None]:
# Tokenize
def tokenize_function(examples):
    """Tokenize a batch of raw examples for causal-LM training.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            its ``'text'`` entry holds the raw training strings.

    Returns:
        The tokenizer output for the batch, with each sequence truncated to
        at most ``MAX_LENGTH`` tokens and left unpadded.
    """
    # No padding here on purpose: the language-modeling data collator pads
    # each batch to its own longest sequence, so padding every example to
    # MAX_LENGTH up front only wastes memory and compute on pad tokens.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_LENGTH,
    )

# Tokenize the whole dataset in batches, dropping the raw columns so only
# tokenizer outputs remain.
raw_columns = dataset.column_names
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=raw_columns)

# Hold out 5% of the examples for evaluation; the fixed seed keeps the split reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_count = len(split_dataset['train'])
eval_count = len(split_dataset['test'])
print(f"Train: {train_count}, Eval: {eval_count}")

## 6. Train

In [None]:
# Training arguments
import inspect

# Mixed precision should match the bfloat16 compute dtype configured for the
# 4-bit layers; fall back to fp16 only on GPUs without bfloat16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# transformers renamed `evaluation_strategy` to `eval_strategy`; newer releases
# reject the old name and older ones do not know the new one. The install cell
# does not pin a version, so choose whichever keyword this release accepts.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=4,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=50,
    save_strategy="epoch",
    bf16=use_bf16,
    fp16=not use_bf16,
    optim="paged_adamw_8bit",
    report_to="none",
    save_total_limit=2,
    **{eval_strategy_key: "epoch"},  # evaluate once per epoch, alongside each save
)

# Data collator
# mlm=False selects causal-LM batching: labels are copied from input_ids and
# each batch is padded by the collator.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset['train'],
    eval_dataset=split_dataset['test'],
    data_collator=data_collator,
)

In [None]:
# Announce the run, then launch fine-tuning.
for banner in (
    "Starting training...",
    "This will take approximately 2-4 hours for Jais-13b",
):
    print(banner)
trainer.train()

In [None]:
# Write the trained model and the tokenizer side by side into OUTPUT_DIR/final.
final_dir = f"{OUTPUT_DIR}/final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
print(f"Model saved to {final_dir}")

## 7. Test the Model

In [None]:
# Test prompts
# Smoke-test prompts mixing Arabic and English requests (English glosses in comments).
test_prompts = [
    # "How do we say 'Hello, how are you?' in Hassaniya?"
    "كيف نقول 'مرحبا، كيف حالك؟' بالحسانية؟",
    "Translate 'I want to buy a car' to Hassaniya dialect.",
    # A bare "How are you?" greeting.
    "اشحالك؟",
    # "How do I ask about the price of something at the market in Hassaniya?"
    "كيف أسأل عن سعر شيء في السوق بالحسانية؟",
]

# System instruction placed at the top of every prompt:
# "Your name is Hassaniya Assistant, specialized in the Mauritanian Hassaniya dialect."
SYSTEM_PROMPT = "اسمك مساعد حسانية، متخصص في اللهجة الحسانية الموريتانية."

def format_prompt(user_msg):
    """Wrap a user message in the instruction/input/response prompt template.

    Args:
        user_msg: The user's question or request.

    Returns:
        A four-line prompt: the system instruction, a fixed Arabic line asking
        to complete the conversation between [|Human|] and [|AI|], the user
        message, and an open ``[|AI|]`` response marker for the model to continue.
    """
    prompt_lines = [
        f"### Instruction: {SYSTEM_PROMPT}",
        "أكمل المحادثة أدناه بين [|Human|] و [|AI|]:",
        f"### Input: [|Human|] {user_msg}",
        "### Response: [|AI|]",
    ]
    return "\n".join(prompt_lines)

# Switch to inference mode so dropout (including the LoRA dropout) is disabled.
model.eval()

for prompt in test_prompts:
    print(f"\nUser: {prompt}")
    formatted = format_prompt(prompt)
    inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
    # Remember how many tokens belong to the prompt so they can be cut off below.
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            # temperature / top_p only take effect when sampling is enabled;
            # without do_sample=True generation is greedy and they are ignored.
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens. Slicing by prompt length is more
    # robust than splitting the decoded text on the response marker, which
    # breaks if the marker is altered by decoding or repeated by the model.
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    print(f"Jais: {response}")
    print("-" * 50)

## 8. Download Model

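Run this to create a downloadable zip file of your fine-tuned model. The archive contains what was saved to `models/jais-hassaniya/final` in step 6 — the LoRA adapter weights and the tokenizer files — so the base Jais model is still required to load it.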

In [None]:
# Recursively archive the saved final/ directory into a single zip file for download.
!zip -r jais-hassaniya-model.zip models/jais-hassaniya/final
print("Model zipped! Download jais-hassaniya-model.zip")