# BitNet-Rust AI Model Finetuning Notebook

This notebook collects data as specified in the AI Model Training Research Guide for the BitNet-Rust project.
It uses Crawl4AI for web scraping, lets you upload a codebase, prepares a dataset, and finetunes a Gemma model using Unsloth (the model-loading cell below currently loads `google/gemma-2-2b-it`, not Gemma-3-4B).
After finetuning, the model is tested and can be exported for Ollama.

**Note:** This notebook is designed to run on Google Colab with GPU enabled.

## Install Dependencies

Installing all required packages. This may take a few minutes.

In [None]:
# Notebook setup cell: every line starting with "!" is run as a shell command,
# not as Python.
# Install core dependencies
!pip install crawl4ai gitpython transformers torch datasets huggingface_hub trl peft accelerate bitsandbytes PyPDF2 requests

# Install Unsloth for fast training
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Install git-lfs for large file support
!apt install git-lfs -y

# NOTE: this message is printed unconditionally; nothing here checks whether
# the install commands above actually succeeded.
print("✅ All dependencies installed successfully!")

## Import Libraries

In [None]:
import os
import git
from crawl4ai import AsyncWebCrawler
import asyncio
import json
import requests
import PyPDF2
from google.colab import files
from datasets import Dataset
from huggingface_hub import login
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

print("📚 All libraries imported successfully!")

## Configuration

Define the data sources to collect from.

In [None]:
# Repositories to clone: maps a local directory name to the Git URL it is
# cloned from
repos = {
    'bitnet_rust': 'https://github.com/ocentra/bitnet.rs',
    'mlx': 'https://github.com/ml-explore/mlx'
}

# URLs to crawl for documentation and tutorials
# NOTE(review): each entry is handed to a single crawler.arun() call, so
# presumably only the listed page itself is fetched, not the pages it links
# to — confirm this yields enough content.
urls_to_crawl = [
    'https://doc.rust-lang.org/book/',  # Rust Book
    'https://developer.apple.com/metal/',  # Metal Programming Guide
    'https://users.rust-lang.org/',  # Rust Users Forum
]

# Research papers to download (direct PDF URLs)
papers = [
    'https://arxiv.org/pdf/2310.11453'  # BitNet paper
]

print(f"📋 Configured {len(repos)} repos, {len(urls_to_crawl)} URLs, and {len(papers)} papers to process")

## Step 1: Clone Repositories and Extract Code

In [None]:
# Root directory under which every repository is cloned; created up front so
# re-running the cell is harmless (exist_ok=True).
data_dir = 'training_data'
os.makedirs(data_dir, exist_ok=True)

def clone_repo(url, name):
    """Clone a repository into ``data_dir`` and return its local path.

    Args:
        url: Git URL to clone from.
        name: Directory name (under ``data_dir``) to clone into.

    Returns:
        The path ``data_dir/name``. If that path already exists it is reused
        as-is and no clone is performed.
    """
    path = os.path.join(data_dir, name)
    if not os.path.exists(path):
        print(f"🔄 Cloning {name} from {url}...")
        # Only the checked-out files are read afterwards, so a shallow clone
        # (depth 1) avoids downloading the repository's full history.
        git.Repo.clone_from(url, path, depth=1)
    else:
        print(f"✅ {name} already exists, skipping clone")
    return path

def extract_code_files(path):
    """Collect the text of every relevant source/doc file under ``path``.

    Walks ``path`` recursively and reads files whose name ends with one of
    the supported extensions (.rs, .md, .toml, .py, .cpp, .h). Directories
    named exactly ``.git``, ``target``, ``__pycache__`` or ``node_modules``
    are not entered. Undecodable bytes are dropped (``errors='ignore'``).

    Args:
        path: Root directory to scan. A missing directory yields ``[]``.

    Returns:
        A list of file contents (one string per non-blank file), in a
        deterministic order: directories and files are visited sorted by name.
    """
    texts = []
    file_extensions = ('.rs', '.md', '.toml', '.py', '.cpp', '.h')
    skip_dirs = {'.git', 'target', '__pycache__', 'node_modules'}

    for root, dirs, filenames in os.walk(path):
        # Prune by exact directory name, in place, so os.walk never descends
        # into skipped trees. A substring test on the whole path would also
        # discard unrelated directories such as '.github' or 'targets'.
        # Sorting keeps the output order stable across runs and machines.
        dirs[:] = sorted(d for d in dirs if d not in skip_dirs)

        for filename in sorted(filenames):
            if not filename.endswith(file_extensions):
                continue
            file_path = os.path.join(root, filename)
            try:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
            except OSError as e:
                # Best effort: an unreadable file must not abort the scan.
                print(f"⚠️ Warning: Could not read {filename}: {e}")
                continue
            if content.strip():  # Only add non-empty files
                texts.append(content)
    return texts

# Clone each configured repository and pool the text of its files.
# A failure in one repository is reported and does not stop the others.
repo_texts = []
for name, url in repos.items():
    try:
        texts = extract_code_files(clone_repo(url, name))
    except Exception as e:
        print(f"❌ Error processing {name}: {e}")
    else:
        repo_texts += texts
        print(f"📁 Extracted {len(texts)} files from {name}")

print(f"\n✅ Total files extracted from repositories: {len(repo_texts)}")

## Step 2: Scrape Web Pages with Crawl4AI

In [None]:
async def crawl_url_safe(crawler, url):
    """Fetch one URL through ``crawler`` and return its markdown, or ``None``.

    ``None`` is returned when the crawl reports failure, produces empty
    markdown, or raises an ``Exception`` (which is printed, not propagated).
    """
    try:
        print(f"🕷️ Crawling {url}...")
        outcome = await crawler.arun(url=url)
        if not (outcome.success and outcome.markdown):
            print(f"⚠️ Warning: No content from {url}")
            return None
        print(f"✅ Successfully crawled {url}")
        return outcome.markdown
    except Exception as err:
        print(f"❌ Error crawling {url}: {err}")
        return None

async def crawl_all_urls():
    """Crawl every URL in ``urls_to_crawl`` and return the pages that had content.

    URLs are fetched one after another through a single shared crawler;
    URLs for which ``crawl_url_safe`` returns nothing are left out.
    """
    pages = []
    async with AsyncWebCrawler(verbose=False) as crawler:
        for target in urls_to_crawl:
            page = await crawl_url_safe(crawler, target)
            if page:
                pages.append(page)
    return pages

# Run the crawler
# Top-level `await` works here because notebook cells are run by IPython,
# which supports it; a plain Python script would need
# asyncio.run(crawl_all_urls()) instead.
web_texts = await crawl_all_urls()
print(f"\n✅ Successfully scraped {len(web_texts)} web pages")

## Step 3: Download and Extract Papers

In [None]:
def download_and_extract_pdf(url):
    """Download a PDF and return its extracted text, or ``None`` on failure.

    Args:
        url: Direct URL of the PDF to download.

    Returns:
        The text of all pages, each followed by a newline, or ``None`` when
        the download fails, the PDF cannot be parsed, or no text is found.
        Pages whose extraction fails are skipped with a warning.
    """
    import tempfile  # local import: not among the file-level imports

    try:
        print(f"📄 Downloading PDF from {url}...")

        # An anonymous temporary file is removed automatically, even when
        # parsing raises, and cannot collide with a file in the working
        # directory the way a fixed 'temp_paper.pdf' could.
        with tempfile.TemporaryFile() as pdf_file:
            # requests waits indefinitely without an explicit timeout; the
            # context manager releases the streamed connection.
            with requests.get(url, stream=True, timeout=60) as response:
                response.raise_for_status()
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)
            pdf_file.seek(0)

            # Extract text while the file is still open: pages are read
            # from the stream on demand.
            reader = PyPDF2.PdfReader(pdf_file)
            page_texts = []
            for page_num, page in enumerate(reader.pages):
                try:
                    # Guard against a page yielding no text object at all.
                    page_texts.append(page.extract_text() or '')
                except Exception as e:
                    print(f"⚠️ Warning: Could not extract page {page_num}: {e}")

        text = ''.join(f"{page_text}\n" for page_text in page_texts)

        if text.strip():
            print(f"✅ Successfully extracted text from PDF ({len(text)} characters)")
            return text
        else:
            print(f"⚠️ Warning: No text extracted from PDF")
            return None

    except Exception as e:
        # Deliberate best-effort boundary: one bad paper must not stop the
        # notebook, so every failure is reported and mapped to None.
        print(f"❌ Error processing PDF from {url}: {e}")
        return None

# Download every configured paper, keeping only those that yielded text
paper_texts = [
    content
    for content in map(download_and_extract_pdf, papers)
    if content
]

print(f"\n✅ Successfully processed {len(paper_texts)} papers")

## Step 4: Upload Your Codebase

Upload a zip file containing your codebase to include in the training data.

In [None]:
# Cell-local imports: these stdlib modules are not among the file-level imports
import io
import zipfile

print('📁 Please upload your codebase as a zip file:')
print('   - The zip should contain your source code files')
print('   - Supported file types: .rs, .md, .toml, .py, .cpp, .h')
print('   - Large files (>100MB) may take time to process')

# Stays empty when nothing is uploaded or when processing fails
codebase_texts = []

try:
    # files.upload() returns a dict mapping each uploaded file name to its bytes
    uploaded = files.upload()

    if uploaded:
        # Only the first uploaded file is used
        codebase_zip = next(iter(uploaded))
        print(f"📦 Processing uploaded file: {codebase_zip}")

        # Extract with zipfile instead of a shell `unzip`: a corrupt or
        # non-zip upload now raises and reaches the except below (a failing
        # shell command would not), and the user-supplied file name is never
        # interpolated into a shell command line.
        with zipfile.ZipFile(io.BytesIO(uploaded[codebase_zip])) as archive:
            archive.extractall('codebase')

        # Extract code files
        codebase_texts = extract_code_files('codebase')
        print(f"✅ Extracted {len(codebase_texts)} files from uploaded codebase")

    else:
        print("⚠️ No file uploaded, continuing without user codebase")

except Exception as e:
    # Best effort: the user codebase is optional, so report and carry on
    print(f"❌ Error processing upload: {e}")
    print("Continuing without user codebase...")
    codebase_texts = []

## Step 5: Prepare Training Dataset

In [None]:
# Pool the text gathered from every source into one list
all_texts = [*repo_texts, *web_texts, *paper_texts, *codebase_texts]

# Keep only samples with more than 50 characters once surrounding whitespace is trimmed
filtered_texts = [sample for sample in all_texts if len(sample.strip()) > 50]

print(f"📊 Dataset Statistics:")
print(f"   - Repository files: {len(repo_texts)}")
print(f"   - Web pages: {len(web_texts)}")
print(f"   - Research papers: {len(paper_texts)}")
print(f"   - User codebase: {len(codebase_texts)}")
print(f"   - Total samples: {len(filtered_texts)}")

# Nothing to train on: stop here rather than fail later inside the trainer
if not filtered_texts:
    raise ValueError("❌ No training data collected! Please check your data sources.")

# Build the dataset and shuffle it reproducibly
shuffled = Dataset.from_dict({'text': filtered_texts}).shuffle(seed=42)

# Hold out 10% for evaluation unless the dataset is too small to split
if len(filtered_texts) <= 10:
    # Too few samples: evaluate on the training data itself
    dataset = {'train': shuffled, 'test': shuffled}
    print(f"⚠️ Using same data for training and evaluation due to small dataset size")
else:
    dataset = shuffled.train_test_split(test_size=0.1, seed=42)
    print(f"📈 Split into {len(dataset['train'])} training and {len(dataset['test'])} evaluation samples")

print("✅ Dataset prepared successfully!")

## Step 6: Setup Model for Fine-tuning

**Important:** You need to log in to Hugging Face to download the model.

In [None]:
# Login to Hugging Face (interactive prompt for an access token)
print("🔐 Please log in to Hugging Face to download the model:")
login()

# Model configuration
# Single source of truth for the checkpoint, so the log line below always
# names the model that is actually loaded.
model_name = "google/gemma-2-2b-it"  # Using 2B model as it's more stable
max_seq_length = 2048
dtype = None  # Auto-detect based on GPU
load_in_4bit = True  # Use 4-bit quantization to save memory

print(f"🤖 Loading {model_name} model...")
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
    )
    print("✅ Model loaded successfully!")
except Exception as e:
    # Print troubleshooting hints, then re-raise: nothing below can run
    # without a loaded model.
    print(f"❌ Error loading model: {e}")
    print("This might be due to:")
    print("  - GPU memory limitations")
    print("  - Hugging Face authentication issues")
    print("  - Network connectivity problems")
    raise

# Setup LoRA adapter for efficient fine-tuning
print("⚙️ Setting up LoRA adapter...")
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    # Adapters are attached to the attention and MLP projection layers
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

print("✅ Model setup complete!")

## Step 7: Fine-tune the Model

This will take some time depending on your data size and GPU.

In [None]:
print("🚀 Starting fine-tuning process...")
print("This may take 10-60 minutes depending on data size and GPU speed.")

# Supervised fine-tuning on the raw "text" column of the prepared dataset
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # Effective train batch size per device: 2 * 4 accumulation steps = 8
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,  # match training to keep eval memory in check
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,  # Adjust based on dataset size
        learning_rate=2e-4,
        # Use bf16 where the GPU supports it, otherwise fall back to fp16
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        # eval_steps is ignored unless evaluation is enabled; without this
        # the eval_dataset above would never be used.
        eval_strategy="steps",
        eval_steps=25,
        save_steps=50,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="./training_output",
        report_to="none",  # Disable wandb logging
    ),
)

# Start training
trainer.train()
print("🎉 Fine-tuning completed!")

## Step 8: Test the Fine-tuned Model

In [None]:
print("🧪 Testing the fine-tuned model...")

# Prepare model for inference
FastLanguageModel.for_inference(model)

# Test prompts
test_prompts = [
    "What is BitNet and how does it work?",
    "Explain Rust memory management",
    "How do you implement a neural network in Rust?"
]

for i, prompt in enumerate(test_prompts, 1):
    print(f"\n🔤 Test {i}: {prompt}")
    print("=" * 50)

    try:
        inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. Slicing the decoded string
        # by len(prompt) assumes the decoded text reproduces the prompt
        # character for character, which tokenization does not guarantee.
        prompt_token_count = inputs["input_ids"].shape[1]
        response = tokenizer.decode(
            outputs[0][prompt_token_count:],
            skip_special_tokens=True,
        ).strip()
        print(f"📝 Response: {response}")

    except Exception as e:
        # Best effort: a failed prompt is reported and the next one still runs
        print(f"❌ Error generating response: {e}")

print("\n✅ Model testing completed!")

## Step 9: Save and Export Model

Save the fine-tuned model for later use or export to Ollama.

In [None]:
print("💾 Saving fine-tuned model...")

# Save the model (LoRA weights merged into the base model, 16-bit)
output_dir = "bitnet_rust_model"
try:
    model.save_pretrained_merged(
        output_dir,
        tokenizer,
        save_method="merged_16bit"
    )
    print(f"✅ Model saved to '{output_dir}' directory")

    # List saved files
    print("\n📂 Saved files:")
    for entry in os.listdir(output_dir):
        entry_path = os.path.join(output_dir, entry)
        size = os.path.getsize(entry_path) / (1024*1024)  # Size in MB
        print(f"   - {entry} ({size:.1f} MB)")

except Exception as e:
    # Reported only; the next-steps text below is still printed
    print(f"❌ Error saving model: {e}")

print("\n" + "="*60)
print("🎊 FINE-TUNING COMPLETE! 🎊")
print("="*60)

print("\n📋 Next Steps:")
print("\n1. 📥 Download your model:")
print("   - Go to Files tab in Colab")
print(f"   - Download the '{output_dir}' folder")

print("\n2. 🦙 For Ollama integration:")
print("   - Install llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
print("   - Convert to GGUF format:")
# llama.cpp replaced the old convert.py with convert_hf_to_gguf.py for
# Hugging Face checkpoints, so the instructions name the current script.
print(f"     python llama.cpp/convert_hf_to_gguf.py {output_dir} --outtype q8_0 --outfile bitnet_rust.gguf")
print("   - Create Ollama Modelfile and import")

print("\n3. 🚀 Use your model:")
print("   - Load in your applications")
print("   - Deploy to production")
print("   - Continue fine-tuning with more data")

print("\n✨ Happy coding with your BitNet-Rust AI assistant!")