# 🚀 Model Setup for Document Search System

This notebook will help you download and set up GGUF models for the document search system using Hugging Face Hub.

## Step 1: Install Required Libraries

In [1]:
!pip install huggingface_hub transformers

[0mCollecting huggingface_hub
  Using cached huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting transformers
  Using cached transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface_hub)
  Using cached hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
[31mERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/usr/local/lib/python3.10/dist-packages/numpy-1.26.3.dist-info/METADATA'
[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


## Step 2: Login to Hugging Face (Optional but recommended)

In [3]:
!pip install huggingface_hub 

Collecting huggingface_hub
  Using cached huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Using cached huggingface_hub-0.33.4-py3-none-any.whl (515 kB)
Installing collected packages: huggingface_hub
Successfully installed huggingface_hub-0.33.4
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [4]:
from huggingface_hub import login

# Log in to Hugging Face; in the notebook this renders an interactive widget.
# This step is optional (comment out the call below to skip it) but is
# recommended for better download speeds.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Step 3: Create Models Directory

In [6]:
import os
from pathlib import Path

# Folder (relative to the working directory) that holds the downloaded model files.
# exist_ok=True makes this cell safe to re-run when the folder is already there.
models_dir = Path("models")
models_dir.mkdir(exist_ok=True)

models_dir_abs = models_dir.absolute()
print(f"✅ Models directory created: {models_dir_abs}")

✅ Models directory created: /workspace/models


## Step 4: Choose and Download a Model

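Select one of the following options based on your needs. Run only one of them: each option saves its download as `models/llama-model.gguf` and replaces any model file already stored there.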

### Option 1: Phi-3 Mini (3.8B) - Recommended for speed

In [7]:
from huggingface_hub import hf_hub_download
import shutil

print("📥 Downloading Phi-3 Mini GGUF model...")

# Download the Phi-3 Mini Q4 GGUF file directly into the models directory.
# `local_dir_use_symlinks` is no longer passed: it is deprecated in current
# huggingface_hub releases and only produced the warning that points to the
# "download files to local folder" guide.
downloaded_file = hf_hub_download(
    repo_id="microsoft/Phi-3-mini-4k-instruct-gguf",
    filename="Phi-3-mini-4k-instruct-q4.gguf",
    local_dir=models_dir,
)

# Rename the download to the fixed file name that the verification cell checks.
target_file = models_dir / "llama-model.gguf"
if target_file.exists():
    target_file.unlink()  # Drop the previously installed model before replacing it

shutil.move(downloaded_file, target_file)
print(f"✅ Phi-3 Mini model ready at: {target_file}")
print(f"📊 Model size: {target_file.stat().st_size / 1024 / 1024 / 1024:.2f} GB")

📥 Downloading Phi-3 Mini GGUF model...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Phi-3-mini-4k-instruct-q4.gguf:   0%|          | 0.00/2.39G [00:00<?, ?B/s]

✅ Phi-3 Mini model ready at: models/llama-model.gguf
📊 Model size: 2.23 GB


### Option 2: Llama 3.1 8B - Better quality, larger model

In [8]:
from huggingface_hub import hf_hub_download
import shutil

print("📥 Downloading Llama 3.1 8B GGUF model...")

# Download the Llama 3.1 8B Q4_K_M GGUF file directly into the models directory.
# `local_dir_use_symlinks` is no longer passed: it is deprecated in current
# huggingface_hub releases and only produces a deprecation warning.
downloaded_file = hf_hub_download(
    repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
    filename="Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
    local_dir=models_dir,
)

# Rename the download to the fixed file name that the verification cell checks.
target_file = models_dir / "llama-model.gguf"
if target_file.exists():
    target_file.unlink()  # Drop the previously installed model before replacing it

shutil.move(downloaded_file, target_file)
print(f"✅ Llama 3.1 8B model ready at: {target_file}")
print(f"📊 Model size: {target_file.stat().st_size / 1024 / 1024 / 1024:.2f} GB")

📥 Downloading Llama 3.1 8B GGUF model...


Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

KeyboardInterrupt: 

### Option 3: Llama 3.2 1B - Fastest, smallest model

In [None]:
from huggingface_hub import hf_hub_download
import shutil

print("📥 Downloading Llama 3.2 1B GGUF model...")

# Download the Llama 3.2 1B Q4_K_M GGUF file directly into the models directory.
# `local_dir_use_symlinks` is no longer passed: it is deprecated in current
# huggingface_hub releases and only produces a deprecation warning.
downloaded_file = hf_hub_download(
    repo_id="bartowski/Llama-3.2-1B-Instruct-GGUF",
    filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    local_dir=models_dir,
)

# Rename the download to the fixed file name that the verification cell checks.
target_file = models_dir / "llama-model.gguf"
if target_file.exists():
    target_file.unlink()  # Drop the previously installed model before replacing it

shutil.move(downloaded_file, target_file)
print(f"✅ Llama 3.2 1B model ready at: {target_file}")
print(f"📊 Model size: {target_file.stat().st_size / 1024 / 1024 / 1024:.2f} GB")

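## Step 5: Verify Model Installation

The load test in this cell needs the project dependencies (including `llama-cpp-python`) that are installed in Step 6. Until they are installed, the cell only confirms that the model file exists and reports why the load test could not run.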

In [16]:
# Confirm the model file is in place, then attempt a small load-and-generate check.
model_path = models_dir / "llama-model.gguf"

if not model_path.exists():
    print("❌ Model file not found. Please run one of the download options above.")
else:
    file_size_gb = model_path.stat().st_size / 1024 / 1024 / 1024
    print("✅ Model successfully installed!")
    print(f"📁 Location: {model_path.absolute()}")
    print(f"📊 Size: {file_size_gb:.2f} GB")

    # Best-effort smoke test: any failure is reported below instead of being raised.
    try:
        print("\n🧪 Testing model loading...")

        # Make the project modules in the working directory importable.
        import sys
        sys.path.append('.')

        from config import Config
        from llm_service_cpp import LLMServiceCPP

        config = Config()
        llm_service = LLMServiceCPP(config)

        if not llm_service.check_model_availability():
            print("❌ Model loaded but not responding correctly")
        else:
            print("✅ Model loaded successfully and is working!")

            # Ask for a short completion to confirm generation works end to end.
            test_prompt = "<|system|>You are a helpful assistant.<|user|>Say hello!<|assistant|>"
            test_response = llm_service.llm(
                test_prompt,
                max_tokens=50,
                temperature=0.1
            )
            generated_text = test_response['choices'][0]['text'].strip()
            print(f"🤖 Test response: {generated_text}")

    except Exception as err:
        print(f"⚠️  Model file exists but couldn't test loading: {err}")
        print("This might be normal if dependencies aren't fully installed yet.")

✅ Model successfully installed!
📁 Location: /workspace/models/llama-model.gguf
📊 Size: 2.23 GB

🧪 Testing model loading...
⚠️  Model file exists but couldn't test loading: No module named 'llama_cpp'
This might be normal if dependencies aren't fully installed yet.


## Step 6: Install System Dependencies

In [17]:
# Install the main system requirements
print("📦 Installing system requirements...")
!pip install --upgrade pip
!pip install -r requirements.txt

print("\n✅ All dependencies installed!")
print("\n🎉 Setup Complete!")
print("\nNext steps:")
print("1. Build index: python run_search.py --build /path/to/your/documents")
print("2. Start web UI: python run_search.py --web")
print("3. Or search directly: python run_search.py --search 'your question'")

📦 Installing system requirements...
Collecting streamlit>=1.28.0 (from -r requirements.txt (line 6))
  Using cached streamlit-1.47.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain>=0.1.0 (from -r requirements.txt (line 7))
  Using cached langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting chromadb>=0.4.0 (from -r requirements.txt (line 8))
  Using cached chromadb-1.0.15-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting llama-cpp-python>=0.2.0 (from -r requirements.txt (line 9))
  Using cached llama_cpp_python-0.3.14-cp310-cp310-linux_x86_64.whl
Collecting altair<6,>=4.0 (from streamlit>=1.28.0->-r requirements.txt (line 6))
  Using cached altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting blinker<2,>=1.5.0 (from streamlit>=1.28.0->-r requirements.txt (line 6))
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit>=1.28.0->-r requirements.txt (line 6))
  Using

## Optional: Test the Complete System

In [14]:
!pip install sentence_transformers

Collecting sentence_transformers
  Using cached sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence_transformers)
  Using cached scikit_learn-1.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Using cached sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
Using cached scikit_learn-1.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
Installing collected packages: scikit-learn, sentence_transformers
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [sentence_transformers]ence_transformers]
[1A[2KSuccessfully installed scikit-learn-1.7.1 sentence_transformers-5.0.0
[0m

In [19]:
# Quick system test
try:
    print("🧪 Testing complete system...")
    
    # Test imports
    from config import Config
    from document_processor import DocumentProcessor
    from embedding_service import EmbeddingService
    from vector_database import VectorDatabase
    from llm_service_cpp import LLMServiceCPP
    
    print("✅ All modules imported successfully")
    
    # Test CUDA availability
    import torch
    if torch.cuda.is_available():
        print(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
        print(f"   CUDA version: {torch.version.cuda}")
        print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("⚠️  CUDA not available - will use CPU mode")
    
    # Test model loading
    config = Config()
    config.create_directories()
    
    print("✅ Configuration loaded")
    print("✅ System ready for document processing!")
    
except Exception as e:
    print(f"❌ System test failed: {e}")
    print("Please check the error and install missing dependencies")

🧪 Testing complete system...
❌ System test failed: No module named 'llama_cpp'
Please check the error and install missing dependencies


In [None]:
# Install llama-cpp-python with CUDA support for RTX 4090
print("📦 Installing llama-cpp-python with CUDA support...")
print("This may take a few minutes to compile...")

# Use the correct CUDA flag (GGML_CUDA instead of LLAMA_CUBLAS)
!CMAKE_ARGS="-DGGML_CUDA=on" pip install llama-cpp-python --upgrade --force-reinstall --no-cache-dir

print("✅ llama-cpp-python with CUDA support installed!")

📦 Installing llama-cpp-python with CUDA support...
This may take a few minutes to compile...
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.14.tar.gz (51.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 MB[0m [31m175.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Downloading typing_extensions-4.14.1-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-2.2.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting jinja2>=2.11.3 (from llama-cpp-python)
  

In [None]:
# Test the complete system with llama-cpp-python installed
try:
    print("🧪 Testing complete system with llama-cpp-python...")
    
    # Test imports
    from config import Config
    from document_processor import DocumentProcessor
    from embedding_service import EmbeddingService
    from vector_database import VectorDatabase
    from llm_service_cpp import LLMServiceCPP
    
    print("✅ All modules imported successfully")
    
    # Test CUDA availability
    import torch
    if torch.cuda.is_available():
        print(f"✅ CUDA available: {torch.cuda.get_device_name(0)}")
        print(f"   CUDA version: {torch.version.cuda}")
        print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
    else:
        print("⚠️  CUDA not available - will use CPU mode")
    
    # Test model loading
    config = Config()
    config.create_directories()
    
    print("✅ Configuration loaded")
    
    # Test LLM service
    print("🤖 Testing LLM service...")
    llm_service = LLMServiceCPP(config)
    
    if llm_service.check_model_availability():
        print("✅ LLM model loaded and working!")
        
        # Test a simple generation
        test_response = llm_service.llm(
            "<|system|>You are a helpful assistant.<|user|>Say 'Hello, I am working!'<|assistant|>",
            max_tokens=20,
            temperature=0.1
        )
        print(f"🤖 Test response: {test_response['choices'][0]['text'].strip()}")
        
    else:
        print("❌ LLM model not responding correctly")
    
    print("✅ System ready for document processing!")
    print("\n🎉 Complete setup successful!")
    print("\nNext steps:")
    print("1. Build index: python run_search.py --build /path/to/your/documents")
    print("2. Start web UI: python run_search.py --web")
    print("3. Or search directly: python run_search.py --search 'your question'")
    
except Exception as e:
    print(f"❌ System test failed: {e}")
    import traceback
    traceback.print_exc()
    print("\nTroubleshooting:")
    print("- Make sure all dependencies are installed")
    print("- Check that the model file exists at models/llama-model.gguf")
    print("- Verify CUDA installation if using GPU")