In [1]:
# 1. SETUP AND IMPORTS
# --------------------
import os
from pathlib import Path
from psycopg import connect
from dotenv import load_dotenv
import requests
import json
from tqdm import tqdm


In [2]:
# Load environment variables from the working directory first (original behaviour).
load_dotenv(".env")

# Set paths
# PROJECT_ROOT may be overridden through the environment (or the .env loaded
# above) so the notebook is not tied to one machine; the original location
# stays the default.
PROJECT_ROOT = Path(
    os.getenv("PROJECT_ROOT", r"C:\Users\chaym\Desktop\projet_Ai\Chatbot-RAG")
)

# The notebook may be started from a sub-folder that has no .env of its own,
# so also try the project-level file. load_dotenv does nothing when the file
# is missing and, by default, does not override variables that are already set.
load_dotenv(PROJECT_ROOT / ".env")

DATA_DIR = PROJECT_ROOT / "data" / "TRANS_TXT"

# Stop here with a clear message rather than failing later in os.listdir().
if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"DATA_DIR '{DATA_DIR}' not found.")


In [3]:
# 2. LOAD CORPUS

def read_text_file(filepath, encodings=("utf-8-sig", "cp1252", "latin-1")):
    """Read a text file, trying several encodings in order.

    Args:
        filepath: Path of the file to read.
        encodings: Codec names to try, in order. The default tries UTF-8
            (dropping a leading BOM if present), then cp1252, and finally
            latin-1, which accepts every byte sequence. latin-1 must come
            last: placed earlier it would make the later entries unreachable.

    Returns:
        The decoded file content with leading/trailing whitespace stripped.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the given encodings can decode the file.
    """
    last_error = None
    for enc in encodings:
        try:
            with open(filepath, 'r', encoding=enc) as f:
                return f.read().strip()
        except UnicodeDecodeError as exc:
            last_error = exc

    if last_error is None:
        raise ValueError("encodings must contain at least one codec name")

    # UnicodeDecodeError requires five constructor arguments; reuse the details
    # of the last failure and attach a message naming the file.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        f"Could not decode {filepath}",
    ) from last_error

# Collect the non-empty content of every .txt file in DATA_DIR, in sorted
# filename order so document indices are stable between runs.
corpus_list = []
txt_names = [name for name in sorted(os.listdir(DATA_DIR)) if name.endswith(".txt")]
for name in txt_names:
    content = read_text_file(DATA_DIR / name)
    if not content:
        # Files that are empty after stripping are skipped.
        continue
    corpus_list.append(content)

print(f" Loaded {len(corpus_list)} documents")

 Loaded 41 documents


In [4]:
from psycopg import connect
from dotenv import load_dotenv
import os

load_dotenv(".env")

# Fail early with a readable message when the connection string is missing,
# instead of handing an empty value to connect().
DB_URL = os.getenv("DATABASE_URL")
if not DB_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; define it in the environment or in a .env file."
    )

# Shared connection and cursor used by the rest of the notebook.
conn = connect(DB_URL)
cur = conn.cursor()


In [5]:
# Quick diagnostics: where the kernel runs and whether configuration was found.
working_dir = os.getcwd()
env_file_present = os.path.isfile(".env")
db_url_loaded = DB_URL is not None

print("Current working dir:", working_dir)
print(".env file exists:", env_file_present)
print("DATABASE_URL loaded:", db_url_loaded)

Current working dir: c:\Users\chaym\Desktop\projet_Ai\Chatbot-RAG\notebook
.env file exists: False
DATABASE_URL loaded: True


In [6]:
# Create table with correct vector dimension for Ollama (768)
# Clear any pending or aborted transaction left on the shared connection.
conn.rollback()

# The VECTOR column type comes from the pgvector extension; enable it first so
# this cell also works on a database where it has not been enabled yet.
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# WARNING: this drops the existing table, so all stored documents and
# embeddings are removed and must be re-inserted.
cur.execute("DROP TABLE IF EXISTS documents;")
cur.execute("""
CREATE TABLE documents (
    id SERIAL PRIMARY KEY,
    source TEXT,
    corpus TEXT,
    embedding VECTOR(768)
);
""")
conn.commit()
print(" Table 'documents' created")

 Table 'documents' created


In [7]:
# 4. EMBEDDING GENERATION (OLLAMA)

def generate_embedding_ollama(
    text: str,
    model: str = "nomic-embed-text",
    url: str = "http://localhost:11434/api/embeddings",
    timeout: float = 120.0,
):
    """
    Generate an embedding for ``text`` through Ollama's embeddings endpoint.

    With the default ``nomic-embed-text`` model the vector is expected to have
    768 values, matching the VECTOR(768) column of the ``documents`` table.

    Args:
        text: Text to embed; must contain at least one non-whitespace character.
        model: Name of the Ollama embedding model.
        url: Embeddings endpoint of the Ollama server.
        timeout: Seconds to wait for the server before giving up, so a stalled
            or unreachable server cannot hang the notebook forever.

    Returns:
        The embedding as returned in the ``"embedding"`` field of the response.

    Raises:
        ValueError: If ``text`` is empty or only whitespace.
        RuntimeError: If Ollama answers with a non-200 status or without an
            embedding.
        Exceptions raised by ``requests`` (connection errors, timeouts)
        propagate unchanged.
    """
    if not text or not text.strip():
        raise ValueError("Cannot generate an embedding for empty text")

    payload = {
        "model": model,
        "prompt": text
    }

    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Ollama error: {response.text}")

    embedding = response.json().get("embedding")
    if not embedding:
        # A missing or empty vector cannot be stored or compared; surface it here.
        raise RuntimeError(f"Ollama returned no embedding for model '{model}'")
    return embedding



In [9]:
# 5. INSERT DOCUMENTS WITH EMBEDDINGS
def insert_documents_with_embeddings():
    """
    Embed every document of ``corpus_list`` and store it in ``documents``.

    For each document:
    - Generate embedding (vector representation)
    - Store text + embedding for later retrieval

    Each INSERT runs inside its own ``conn.transaction()`` block. Without it,
    one failed statement would leave the PostgreSQL transaction aborted, every
    following INSERT would be rejected, and the final commit would persist
    nothing. With it, a failing document is rolled back on its own and the
    loop carries on.

    Errors are reported per document and do not stop the run; the summary
    line reports how many documents were actually inserted.
    """
    inserted = 0
    for idx, text in enumerate(tqdm(corpus_list, desc="Inserting documents")):
        try:
            # Embed before opening the transaction so the slow HTTP call does
            # not keep a database transaction open.
            embedding = generate_embedding_ollama(text)

            with conn.transaction():
                cur.execute("""
                    INSERT INTO documents (source, corpus, embedding)
                    VALUES (%s, %s, %s)
                """, (f"document_{idx}", text, embedding))
            inserted += 1

        except Exception as e:
            print(f" Error on document {idx}: {e}")
            continue

    conn.commit()
    print(f" Inserted {inserted} documents with embeddings")


In [11]:
# Run insertion. Every call appends all documents again with plain INSERTs, so
# re-running this cell without re-creating the table produces duplicate rows;
# comment it out after the first run to avoid duplicates.
insert_documents_with_embeddings()

Inserting documents: 100%|██████████| 41/41 [01:36<00:00,  2.36s/it]

 Inserted 41 documents with embeddings





In [12]:
# 6. SIMILARITY SEARCH
# --------------------
def search_similar_documents(question: str, top_k: int = 3):
    """
    Find documents most similar to the question using vector similarity.

    How it works:
    1. Convert question to embedding
    2. Compare with all document embeddings using cosine distance
    3. Return top_k most similar results

    The query runs inside ``conn.transaction()``: on success no transaction is
    left open on the shared connection, and on failure it is rolled back so a
    single bad query cannot leave the connection in an aborted state that
    breaks every later search.

    Args:
        question: Text to search for.
        top_k: Maximum number of rows to return.

    Returns:
        A list of ``(id, source, corpus, similarity)`` rows ordered from most
        to least similar, where ``similarity`` is ``1 - cosine distance``.
    """
    question_embedding = generate_embedding_ollama(question)

    with conn.transaction():
        cur.execute("""
            SELECT 
                id,
                source,
                corpus,
                1 - (embedding <=> %s::vector) AS similarity
            FROM documents
            ORDER BY embedding <=> %s::vector
            LIMIT %s
        """, (question_embedding, question_embedding, top_k))

        return cur.fetchall()


In [18]:
# 7. RAG RESPONSE GENERATION
# ---------------------------
def generate_rag_response(
    question: str,
    top_k: int = 3,
    model: str = "gemma3:1b",
    timeout: float = 300.0,
):
    """
    Complete RAG pipeline:

    RETRIEVAL: Find relevant documents
    AUGMENTATION: Build context from documents
    GENERATION: Generate answer using Ollama LLM

    Args:
        question: The user's question.
        top_k: Number of documents to retrieve as context.
        model: Name of the Ollama model used for generation.
        timeout: Seconds to wait for the generation request, so a stalled
            server cannot block the chat loop forever.

    Returns:
        A tuple ``(answer, similar_docs)`` where ``answer`` is the generated
        text and ``similar_docs`` are the rows returned by
        ``search_similar_documents``.

    Raises:
        RuntimeError: If Ollama answers with a non-200 status.
        Exceptions raised by ``requests`` or by the retrieval step propagate
        unchanged.
    """
    print(" Searching relevant documents...")
    similar_docs = search_similar_documents(question, top_k=top_k)

    # Build context from retrieved documents (index 2 is the corpus text).
    context = "\n\n---\n\n".join(doc[2] for doc in similar_docs)

    # Create prompt
    prompt = f"""Tu es un assistant qui répond aux questions basées UNIQUEMENT sur les documents fournis.

Documents de référence:
{context}

Question: {question}

Réponds de manière précise en te basant sur les documents. Si l'information n'y est pas, dis-le clairement."""

    # Generate response with Ollama
    print(" Generating response...")
    url = "http://localhost:11434/api/generate"

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }

    response = requests.post(url, json=payload, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"Ollama error: {response.text}")

    answer = response.json()["response"]
    return answer, similar_docs



In [19]:
# 8. TEST YOUR CHATBOT!

def chat():
    """Interactive chatbot interface.

    Reads questions in a loop until the user types 'quit', 'exit' or 'q',
    or until input ends (EOF) or is interrupted. Blank input is ignored.
    For each question the answer and the retrieved sources are printed.
    Any error raised while answering is printed and the loop continues.
    """
    print("\n" + "="*60)
    print(" RAG Chatbot Ready! (type 'quit' to exit)")
    print("="*60)

    while True:
        try:
            question = input("\n Your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input closed or the kernel was interrupted: leave cleanly
            # instead of surfacing a traceback.
            print(" Goodbye!")
            break

        if question.lower() in ('quit', 'exit', 'q'):
            print(" Goodbye!")
            break

        if not question:
            continue

        try:
            answer, sources = generate_rag_response(question)

            print("\n" + "-"*60)
            print(f" Answer:\n{answer}")
            print("\n Sources:")
            # Each source row is (id, source, corpus, similarity); only the
            # source label and the score are shown.
            for rank, (_doc_id, source, _text, similarity) in enumerate(sources, start=1):
                print(f"   {rank}. {source} (similarity: {similarity:.3f})")
            print("-"*60)

        except Exception as e:
            print(f" Error: {e}")

# Start chatting! This call blocks waiting for input until 'quit', 'exit' or
# 'q' is entered.
chat()


 RAG Chatbot Ready! (type 'quit' to exit)
 Searching relevant documents...
 Generating response...

------------------------------------------------------------
 Answer:
Bonjour ici Prénom Nom.

 Sources:
   1. document_40 (similarity: 0.703)
   2. document_24 (similarity: 0.668)
   3. document_23 (similarity: 0.642)
------------------------------------------------------------
 Searching relevant documents...
 Generating response...

------------------------------------------------------------
 Answer:
10h00


 Sources:
   1. document_11 (similarity: 0.670)
   2. document_19 (similarity: 0.600)
   3. document_10 (similarity: 0.597)
------------------------------------------------------------
 Goodbye!
