In [1]:
# 1. SETUP AND IMPORTS
# --------------------
import os
from pathlib import Path
from psycopg import connect
from dotenv import load_dotenv
import requests
import json
from tqdm import tqdm

# Load environment variables
load_dotenv()

True

In [None]:
# Set paths
PROJECT_ROOT = Path(r"C:\Users\chaym\Desktop\projet_Ai\Chatbot-RAG")
DATA_DIR = PROJECT_ROOT / "data" / "TRANS_TXT"

if not DATA_DIR.is_dir():
    raise FileNotFoundError(f"DATA_DIR '{DATA_DIR}' not found.")


# 2. LOAD CORPUS
# --------------
def read_text_file(filepath):
    """Read text file with multiple encoding fallbacks."""
    encodings = ['utf-8', 'latin-1', 'cp1252']
    for enc in encodings:
        try:
            with open(filepath, 'r', encoding=enc) as f:
                return f.read().strip()
        except UnicodeDecodeError:
            continue
    raise UnicodeDecodeError(f"Could not decode {filepath}")

corpus_list = []
for fname in sorted(os.listdir(DATA_DIR)):
    if fname.endswith(".txt"):
        filepath = DATA_DIR / fname
        text = read_text_file(filepath)
        if text:
            corpus_list.append((fname, text))  # Store filename too

print(f" Loaded {len(corpus_list)} documents")

✅ Loaded 41 documents


In [None]:
# 3. DATABASE CONNECTION
# -----------------------
DB_URL = os.getenv("DATABASE_URL")
if not DB_URL:
    raise ValueError("DATABASE_URL not found in .env file")

# Create fresh connection
conn = connect(DB_URL, autocommit=False)
cur = conn.cursor()

print(" Connected to database")

# Enable pgvector extension (only once)
try:
    cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    conn.commit()
    print(" pgvector extension enabled")
except Exception as e:
    conn.rollback()
    print(f"  Extension already exists: {e}")

# Create table (according to instructions: id, corpus, embedding only)
# Note: Added 'source' for tracking - remove if you want strict adherence
try:
    cur.execute("DROP TABLE IF EXISTS embeddings;")
    cur.execute("""
    CREATE TABLE embeddings (
        id SERIAL PRIMARY KEY,
        corpus TEXT NOT NULL,
        embedding VECTOR(768) NOT NULL
    );
    """)
    conn.commit()
    print(" Table 'embeddings' created successfully")
except Exception as e:
    conn.rollback()
    print(f" Error creating table: {e}")



 Connected to database
 pgvector extension enabled
 Table 'embeddings' created successfully


In [4]:
print("Current working dir:", os.getcwd())
print(".env file exists:", os.path.isfile(".env"))
print("DATABASE_URL loaded:", DB_URL is not None)

Current working dir: c:\Users\chaym\Desktop\projet_Ai\Chatbot-RAG\notebook
.env file exists: False
DATABASE_URL loaded: True


In [5]:
# 4. EMBEDDING GENERATION (OLLAMA)
# ---------------------------------
def generate_embedding_ollama(text: str):
    """
    Generate embeddings using Ollama's nomic-embed-text model.
    Returns a 768-dimensional vector representing the text meaning.
    """
    url = "http://localhost:11434/api/embeddings"
    payload = {
        "model": "nomic-embed-text",
        "prompt": text
    }
    
    response = requests.post(url, json=payload)
    
    if response.status_code == 200:
        return response.json()["embedding"]
    else:
        raise Exception(f"Ollama error: {response.text}")



In [6]:
# 4. EMBEDDING GENERATION (OLLAMA)

def generate_embedding_ollama(text: str):
    """
    Generate embeddings using Ollama's nomic-embed-text model.
    Returns a 768-dimensional vector representing the text meaning.
    """
    url = "http://localhost:11434/api/embeddings"
    payload = {
        "model": "nomic-embed-text",
        "prompt": text
    }
    
    response = requests.post(url, json=payload)
    
    if response.status_code == 200:
        return response.json()["embedding"]
    else:
        raise Exception(f"Ollama error: {response.text}")



In [7]:
# 5. INSERT DOCUMENTS WITH EMBEDDINGS
# ------------------------------------
def insert_documents_with_embeddings():
    """
    Process each document and store in database:
    - Generate embedding (vector representation)
    - Store text + embedding for later retrieval
    """
    # Check if data already inserted
    cur.execute("SELECT COUNT(*) FROM embeddings;")
    count = cur.fetchone()[0]
    
    if count > 0:
        print(f"  Table already contains {count} documents")
        response = input("Do you want to re-insert? (yes/no): ").strip().lower()
        if response != 'yes':
            print(" Skipping insertion")
            return
        
        # Clear table if re-inserting
        cur.execute("TRUNCATE TABLE embeddings RESTART IDENTITY;")
        conn.commit()
    
    print(f" Inserting {len(corpus_list)} documents...")
    
    for idx, (filename, text) in enumerate(tqdm(corpus_list, desc="Processing")):
        try:
            # Generate embedding
            embedding = generate_embedding_ollama(text)
            
            # Insert into database (only corpus and embedding as per instructions)
            cur.execute("""
                INSERT INTO embeddings (corpus, embedding)
                VALUES (%s, %s)
            """, (text, embedding))
            
            # Commit every 10 documents to avoid long transactions
            if (idx + 1) % 10 == 0:
                conn.commit()
            
        except Exception as e:
            print(f"\n Error on document {idx} ({filename}): {e}")
            conn.rollback()  # CRITICAL: Rollback on error
            continue
    
    # Final commit
    conn.commit()
    print(f"\n Successfully inserted {len(corpus_list)} documents with embeddings")

# Run insertion (comment out after first successful run)
insert_documents_with_embeddings()

 Inserting 41 documents...


Processing: 100%|██████████| 41/41 [01:37<00:00,  2.38s/it]


 Successfully inserted 41 documents with embeddings





In [None]:
# 6. SIMILARITY SEARCH
# --------------------
def search_similar_documents(question: str, top_k: int = 3):
    conn.rollback()
    """
    Find documents most similar to the question using vector similarity.
    
    How it works:
    1. Convert question to embedding
    2. Compare with all document embeddings using cosine distance
    3. Return top_k most similar results
    """
    try:
        # Ensure clean transaction state
        conn.rollback()
        
        # Generate question embedding
        question_embedding = generate_embedding_ollama(question)
        
        # Search using cosine distance (<=> operator)
        cur.execute("""
            SELECT 
                id,
                corpus,
                1 - (embedding <=> %s::vector) AS similarity
            FROM embeddings
            ORDER BY embedding <=> %s::vector
            LIMIT %s
        """, (question_embedding, question_embedding, top_k))
        
        results = cur.fetchall()
        conn.commit()
        
        return results
        
    except Exception as e:
        conn.rollback()
        raise Exception(f"Search error: {e}")


In [9]:
# 7. RAG RESPONSE GENERATION
# ---------------------------
def generate_rag_response(question: str):
    """
    Complete RAG pipeline:
    
    RETRIEVAL: Find relevant documents
    AUGMENTATION: Build context from documents
    GENERATION: Generate answer using Ollama LLM
    """
    print(f" Searching relevant documents...")
    similar_docs = search_similar_documents(question, top_k=3)
    
    # Build context from retrieved documents
    context = "\n\n---\n\n".join([doc[2] for doc in similar_docs])
    
    # Create prompt
    prompt = f"""Tu es un assistant qui répond aux questions basées UNIQUEMENT sur les documents fournis.

Documents de référence:
{context}

Question: {question}

Réponds de manière précise en te basant sur les documents. Si l'information n'y est pas, dis-le clairement."""

    # Generate response with Ollama
    print(f" Generating response...")
    url = "http://localhost:11434/api/generate"
    
    payload = {
        "model": "gemma3:1b",
        "prompt": prompt,
        "stream": False
    }
    
    response = requests.post(url, json=payload)
    
    if response.status_code == 200:
        answer = response.json()["response"]
        return answer, similar_docs
    else:
        raise Exception(f"Ollama error: {response.text}")



In [10]:
# 8. TEST YOUR CHATBOT!

def chat():
    """Interactive chatbot interface."""
    print("\n" + "="*60)
    print(" RAG Chatbot Ready! (type 'quit' to exit)")
    print("="*60)
    
    while True:
        question = input("\n Your question: ").strip()
        
        if question.lower() in ['quit', 'exit', 'q']:
            print(" Goodbye!")
            break
        
        if not question:
            continue
        
        try:
            answer, sources = generate_rag_response(question)
            
            print("\n" + "-"*60)
            print(f" Answer:\n{answer}")
            print("\n Sources:")
            for i, (id, source, text, similarity) in enumerate(sources):
                print(f"   {i+1}. {source} (similarity: {similarity:.3f})")
            print("-"*60)
            
        except Exception as e:
            print(f" Error: {e}")

# Start chatting!
chat()


 RAG Chatbot Ready! (type 'quit' to exit)
 Searching relevant documents...
 Error: sequence item 0: expected str instance, float found
 Goodbye!
