# Email Wizard Assistant - RAG Implementation Notebook
#
# This notebook details the development and testing of the core Retrieval-Augmented Generation (RAG) pipeline.

### 1. Setup and Imports
### Make sure you have activated your virtual environment and installed dependencies from `requirements.txt`.
### ```bash
### pip install -r requirements.txt
### ```
### For local execution involving the Gemini API, ensure the `GOOGLE_API_KEY` environment variable is set *before* starting Jupyter Lab/Notebook:
### ```bash
### export GOOGLE_API_KEY="YOUR_API_KEY" # Linux/macOS
### set GOOGLE_API_KEY=YOUR_API_KEY     # Windows CMD (type only the `set ...` part, without quotes: CMD stores everything after `=` verbatim)
### $env:GOOGLE_API_KEY="YOUR_API_KEY" # Windows PowerShell
### ```

In [1]:
# Core Libraries
import pandas as pd
import numpy as np
import json
import os
import time
from sklearn.metrics.pairwise import cosine_similarity

# Embedding Model Library
from sentence_transformers import SentenceTransformer

# LLM Library (Google Gemini)
from google import genai

  from .autonotebook import tqdm as notebook_tqdm


### 2. Configuration and Gemini Client Initialization

In [None]:
# --- Configuration ---
# Names of the models used by the pipeline and the on-disk locations of the data.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
# Any model id accepted by your google-genai setup works here (for example 'gemini-pro').
GEMINI_MODEL_NAME = 'gemini-2.5-flash-preview-04-17'
EMAIL_DATA_PATH = 'data/emails.json'
EMBEDDING_SAVE_PATH = 'data/email_embeddings.npy'

# --- Initialize Gemini Client ---
# GEMINI_CLIENT stays None whenever the key is missing or client construction fails,
# so later cells can test `GEMINI_CLIENT is None` before calling the API.
API_KEY = os.getenv("GOOGLE_API_KEY")
GEMINI_CLIENT = None

if API_KEY:
    try:
        # Client construction through the google-genai SDK.
        GEMINI_CLIENT = genai.Client(api_key=API_KEY)
    except Exception as init_error:
        GEMINI_CLIENT = None  # Never leave a half-initialized client behind
        print(f"ERROR: Failed to initialize Gemini client: {init_error}")
        print("Please ensure your API key is valid and the environment variable is set correctly.")
    else:
        print(f"Gemini client initialized successfully for model access (using configured key). Target model: {GEMINI_MODEL_NAME}")
else:
    print("WARNING: GOOGLE_API_KEY environment variable not set.")
    print("Gemini API calls will fail. Please set the environment variable and restart the kernel.")

Gemini client initialized successfully for model access (using configured key). Target model: gemini-2.5-flash-preview-04-17


### 3. Load and Prepare Email Dataset

In [None]:
def load_emails(path):
    """Read the email dataset and add the text column used for embedding.

    Args:
        path: Location of a JSON file readable by ``pd.read_json`` that
            provides ``subject`` and ``body`` columns.

    Returns:
        A DataFrame with an extra ``full_text`` column holding
        ``subject + "\\n\\n" + body`` for every row.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the ``subject`` or ``body`` column is missing, or the
            file cannot be parsed.
    """
    frame = pd.read_json(path)
    missing_columns = [name for name in ('subject', 'body') if name not in frame.columns]
    if missing_columns:
        raise ValueError(f"email data is missing required column(s): {missing_columns}")
    # A null subject/body would turn the concatenation into NaN and later hand a
    # non-string to the encoder, so treat missing fields as empty strings.
    subject_text = frame['subject'].fillna('').astype(str)
    body_text = frame['body'].fillna('').astype(str)
    # Combine subject and body for embedding
    frame['full_text'] = subject_text + "\n\n" + body_text
    return frame


# emails_df is only assigned once loading AND preparation both succeeded, so
# downstream cells never see a frame without the 'full_text' column.
emails_df = None
try:
    emails_df = load_emails(EMAIL_DATA_PATH)
    print(f"Successfully loaded {len(emails_df)} emails from {EMAIL_DATA_PATH}.")
    print(emails_df.head())
except FileNotFoundError:
    print(f"ERROR: Email data file not found at {EMAIL_DATA_PATH}")
except Exception as e:
    print(f"ERROR: Failed to load or process email data: {e}")

### 4. Load Embedding Model and Embed Emails

In [None]:
# --- Load Embedding Model ---
# sbert_model is None when the model could not be loaded; later cells check for that.
try:
    sbert_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
except Exception as load_error:
    sbert_model = None
    print(f"ERROR: Failed to load sentence transformer model: {load_error}")
else:
    print(f"Sentence transformer model '{EMBEDDING_MODEL_NAME}' loaded successfully.")

In [None]:
# --- Embed Emails (Load if exists, otherwise generate and save) ---
# Result: `email_embeddings` holds one embedding per row of `emails_df`, or stays an
# empty array when the model/data are unavailable or encoding failed.
email_embeddings = np.array([])  # Empty array doubles as the "nothing loaded yet" marker

if sbert_model is not None and emails_df is not None and not emails_df.empty:
    if os.path.exists(EMBEDDING_SAVE_PATH):
        try:
            print(f"Loading pre-computed embeddings from {EMBEDDING_SAVE_PATH}...")
            email_embeddings = np.load(EMBEDDING_SAVE_PATH)
            print(f"Loaded embeddings. Shape: {email_embeddings.shape}")
            # Only the row count is checked: a cache built from different emails with
            # the same count is NOT detected. Delete the file after editing the data.
            if email_embeddings.shape[0] != len(emails_df):
                print("WARNING: Number of embeddings does not match number of emails. Re-generating...")
                email_embeddings = np.array([])  # Force regeneration
        except Exception as e:
            print(f"ERROR loading embeddings: {e}. Will attempt to regenerate.")
            email_embeddings = np.array([])  # Force regeneration

    if email_embeddings.size == 0:  # No cache file, unusable cache, or size mismatch
        try:
            print(f"Generating embeddings for {len(emails_df)} email texts...")
            email_contents_to_embed = emails_df['full_text'].tolist()
            email_embeddings = sbert_model.encode(email_contents_to_embed)
            print(f"Embeddings generated. Shape: {email_embeddings.shape}")
        except Exception as e:
            print(f"ERROR: Failed to generate embeddings: {e}")
            email_embeddings = np.array([])  # Ensure it's empty on failure
        else:
            # Saving is best-effort: a failed write must not throw away embeddings
            # that were computed successfully and are usable in memory.
            try:
                save_dir = os.path.dirname(EMBEDDING_SAVE_PATH)
                if save_dir:  # dirname is '' for a bare filename; makedirs('') would raise
                    os.makedirs(save_dir, exist_ok=True)
                np.save(EMBEDDING_SAVE_PATH, email_embeddings)
                print(f"Embeddings saved to {EMBEDDING_SAVE_PATH}")
            except Exception as e:
                print(f"WARNING: Embeddings were generated but could not be saved to {EMBEDDING_SAVE_PATH}: {e}")
else:
    if sbert_model is None:
        print("Skipping embedding generation: SBERT model not loaded.")
    if emails_df is None or emails_df.empty:
        print("Skipping embedding generation: Email data not loaded.")