In [None]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_classic.chains import RetrievalQA
from langchain_classic.prompts import PromptTemplate
import mercury as mr
from langchain_community.document_loaders import PyPDFLoader, CSVLoader, JSONLoader
import pandas as pd
import json
import PyPDF2
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, message=".*Python version.*google.api_core.*")

# Google Calendar API / OAuth dependencies
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from datetime import datetime, timedelta
import pickle


In [None]:
# --- Storage locations -------------------------------------------------------
# Machine-specific base folder; every derived path below is built by plain
# string concatenation on top of it.
local_directory_data = (
    'C:/Users/maran/OneDrive/Documents/Git Profile/'
    'Data-Projects/Inference Group/rag_data/'
)
state_of_union_file = '2024_State_of_the_Union.txt'

# Sub-folder fragments (each one carries its own leading slash).
rag_data_dir = '/rag_data'
vector_store_faiss = '/faiss_index'
web_crawl_dir = '/web_crawled_data'
calendar_data_dir = '/calendar_data'

path_to_faiss_dir = f"{local_directory_data}{rag_data_dir}{vector_store_faiss}"
path_to_state_of_union = f"{local_directory_data}{state_of_union_file}"
path_to_web_crawl_dir = f"{local_directory_data}{rag_data_dir}{web_crawl_dir}"
path_to_calendar_dir = f"{local_directory_data}{rag_data_dir}{calendar_data_dir}"

# --- Prompting ---------------------------------------------------------------
# Template used for retrieval-backed answers; {context} and {question} are
# filled in by the QA chain.
prompt_template = (
    "Use the following pieces of context to answer the question at the end. \n"
    "If you don't know the answer, just say that you don't know, "
    "don't try to make up an answer.\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer: "
)

# The chat transcript starts with the opening prompt shown to the user.
opening_question = "What do you want to know?"
chat_messages = [opening_question]

# --- Working directories -----------------------------------------------------
# Make sure the output folders exist before any cell writes into them.
os.makedirs(path_to_web_crawl_dir, exist_ok=True)
os.makedirs(path_to_calendar_dir, exist_ok=True)

# OAuth scope requested from Google: read-only calendar access.
SCOPES = ['https://www.googleapis.com/auth/calendar.readonly']

In [None]:
# Mercury application shell: hides the notebook code and keeps the notebook
# interactive (static_notebook=False).
app = mr.App(
    title="Simple RAG Query System", 
    description="Ask questions about your documents",
    show_code=False,
    static_notebook=False 
)

# NOTE(review): widgets presumably appear in the UI in the order they are
# created here - confirm against the Mercury docs before reordering.

# Free-text question box, labelled with the opening chat prompt.
question = mr.Text(value=None, label=opening_question)

# Retrieval controls: how many chunks to retrieve (used as the retriever's
# "k") and whether the answer should list its source chunks.
search_result_choices = mr.Slider(value=1, min=1, max=5, label="Number of results")
show_sources = mr.Checkbox(value=False, label="Show Sources")
file_upload = mr.File(label="File upload", max_file_size="10MB")

# Web-crawl controls: start URL, link depth and page budget.
url_input = mr.Text(value=None, label="Enter website URL to crawl")
crawl_depth = mr.Slider(value=1, min=1, max=3, label="Crawl depth")
max_pages = mr.Slider(value=10, min=1, max=50, label="Max pages to crawl")

# Google Calendar controls. `calendar_status` is a text widget whose value is
# overwritten later in this file to report the connection state.
calendar_connect = mr.Checkbox(value=False, label="Connect to Google Calendar")
days_to_fetch = mr.Slider(value=7, min=1, max=30, label="Days of events to fetch")
calendar_credentials = mr.File(label="Upload credentials.json", max_file_size="1MB")
calendar_status = mr.Text(value="Calendar not connected", label="Status")

In [None]:
# Web crawling helpers
def crawl_website(start_url, max_depth=1, max_pages=10):
    """
    Crawl a website and extract text content from pages.

    Starting at ``start_url`` (depth 0), pages are fetched depth-first and
    ``<a href>`` links that stay on the start URL's host are followed until
    either limit is reached. Crawling is best-effort: a page that cannot be
    fetched or parsed is reported with ``print`` and skipped.

    Args:
        start_url: Absolute URL where the crawl begins.
        max_depth: Deepest link level to fetch (0 = only the start page).
        max_pages: Maximum number of pages to fetch successfully.

    Returns:
        A list of dicts, one per page that yielded text, with the keys
        ``'url'`` (fragment removed), ``'content'`` (whitespace-normalised
        text) and ``'depth'``.
    """
    # Imported locally because the file-level imports only bring in
    # urljoin and urlparse.
    from urllib.parse import urldefrag

    visited_urls = set()    # pages fetched and parsed successfully
    attempted_urls = set()  # every URL requested so far, including failures
    pages_content = []
    start_netloc = urlparse(start_url).netloc  # parsed once, not per link

    def extract_page_content(url, depth):
        # "page#a" and "page#b" are the same document; drop the fragment so
        # it is only downloaded once.
        url = urldefrag(url)[0]

        if (len(visited_urls) >= max_pages or
            url in attempted_urls or
            depth > max_depth):
            return

        # Record the attempt *before* fetching so that a failing URL is not
        # requested again each time another page links to it.
        attempted_urls.add(url)

        try:
            print(f"Crawling: {url} (depth {depth})")
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements so only visible text remains
            for script in soup(["script", "style"]):
                script.decompose()

            # Get text content
            text = soup.get_text()

            # Collapse the text: strip every line, split on double spaces
            # and re-join the non-empty pieces with single spaces.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = ' '.join(chunk for chunk in chunks if chunk)

            if text:
                pages_content.append({
                    'url': url,
                    'content': text,
                    'depth': depth
                })

            visited_urls.add(url)

            # Extract and follow links if within depth limit
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    if len(visited_urls) >= max_pages:
                        break

                    next_url = urldefrag(urljoin(url, link['href']))[0]

                    # Only follow links from same domain
                    if (urlparse(next_url).netloc == start_netloc and
                        next_url not in attempted_urls):
                        extract_page_content(next_url, depth + 1)

        except Exception as e:
            # Best-effort crawl: report the failure and carry on.
            print(f"Error crawling {url}: {str(e)}")

    extract_page_content(start_url, 0)
    return pages_content

def save_crawled_data(pages_content, filename_prefix="crawled", output_dir=None):
    """
    Save crawled content to a timestamped text file.

    Each page is written as a header line
    ``--- Page <n>: <url> (Depth <d>) ---`` followed by the page text and a
    blank line.

    Args:
        pages_content: Iterable of dicts with the keys ``'url'``,
            ``'content'`` and ``'depth'`` (as returned by ``crawl_website``).
        filename_prefix: Prefix of the generated file name.
        output_dir: Directory to write into. Defaults to the module-level
            ``path_to_web_crawl_dir``.

    Returns:
        A ``(filepath, all_content)`` tuple: the path of the written file and
        the exact text that was written.
    """
    if output_dir is None:
        output_dir = path_to_web_crawl_dir
    os.makedirs(output_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"{filename_prefix}_{timestamp}.txt"
    filepath = os.path.join(output_dir, filename)

    # Real newlines are required here: an escaped "\\n" would put a literal
    # backslash-n into the text and leave everything on one line.
    all_content = "".join(
        f"--- Page {i+1}: {page['url']} (Depth {page['depth']}) ---\n"
        f"{page['content']}\n\n"
        for i, page in enumerate(pages_content)
    )

    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(all_content)

    return filepath, all_content

In [None]:
# Load environment variables (including OPENAI_API_KEY) from a .env file.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# `client` stays None when the key is missing or the connectivity check fails.
client = None

if not api_key:
    # Without this guard the OpenAI constructor raises before any friendly
    # message can be printed.
    print("❌ API Error: OPENAI_API_KEY is not set (check your environment or .env file)")
else:
    try:
        # Client construction is inside the try so that configuration errors
        # are reported the same way as request errors.
        client = OpenAI(api_key=api_key)

        # Tiny round-trip request that verifies the key and network access.
        client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": "Say 'API is working!'"}],
            max_tokens=10
        )
    except Exception as e:
        client = None
        print("❌ API Error:", e)

In [None]:
# Ingest the uploaded file (if any): extract its text, split it into chunks,
# embed the chunks into a FAISS index and expose `retriever_object`.
if file_upload.filepath is not None:
    file_extension = os.path.splitext(file_upload.filepath)[1].lower()
    file_content = ""

    try:
        if file_extension == '.csv':
            df = pd.read_csv(file_upload.filepath)
            file_content = df.to_string(index=False)

        elif file_extension in ['.xlsx', '.xls']:
            df = pd.read_excel(file_upload.filepath)
            file_content = df.to_string(index=False)

        elif file_extension == '.pdf':
            with open(file_upload.filepath, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # extract_text() may yield None/empty for pages without a
                # text layer (e.g. scanned images); treat those as empty.
                file_content = "".join(
                    (page.extract_text() or "") + "\n"
                    for page in pdf_reader.pages
                )

        elif file_extension == '.json':
            with open(file_upload.filepath, 'r', encoding='utf-8') as json_file:
                json_data = json.load(json_file)
                file_content = json.dumps(json_data, indent=2)

        else:
            # '.txt' and every other extension are read as UTF-8 text.
            with open(file_upload.filepath, 'r', encoding='utf-8') as text_file:
                file_content = text_file.read()

        # Fail with a clear message instead of an opaque error from the
        # vector store when nothing could be extracted.
        if not file_content.strip():
            raise ValueError(
                f"No text could be extracted from the uploaded file ({file_extension or 'no extension'})"
            )

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=100
        )

        example_texts = text_splitter.split_text(file_content)
        documents = [Document(page_content=text) for text in example_texts]

        # Create embeddings and vector store
        embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

        vector_store = FAISS.from_documents(documents, embeddings)
        vector_store.save_local(path_to_faiss_dir)

        retriever_object = vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": search_result_choices.value}
        )

    except Exception as e:
        # Report the problem, then re-raise so that later cells do not run
        # against a missing retriever.
        print(f"Error processing file: {str(e)}")
        raise

In [None]:
# Web ingestion: crawl the requested URL, save the text to disk and, unless an
# uploaded file takes precedence, index it for retrieval.
crawled_content = ""
crawled_file_path = ""

if url_input.value and url_input.value.strip():
    try:
        pages_content = crawl_website(
            start_url=url_input.value.strip(),
            max_depth=crawl_depth.value,
            max_pages=max_pages.value
        )

        if pages_content:
            crawled_file_path, crawled_content = save_crawled_data(pages_content)

            # An uploaded file has priority over web data when the QA chain
            # is built, and both sources share `retriever_object`. Building
            # the web index here would silently replace the file's retriever,
            # so it is only done when no file was uploaded.
            if file_upload.filepath is None:
                # Process the crawled content for RAG
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100
                )

                crawled_texts = text_splitter.split_text(crawled_content)
                crawled_documents = [Document(page_content=text) for text in crawled_texts]

                # Create embeddings and vector store for crawled content
                embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
                vector_store = FAISS.from_documents(crawled_documents, embeddings)
                vector_store.save_local(path_to_faiss_dir)

                retriever_object = vector_store.as_retriever(
                    search_type="similarity",
                    search_kwargs={"k": search_result_choices.value}
                )

    except Exception as e:
        # Clear the content so later cells do not assume a web retriever
        # exists when indexing failed part-way through.
        crawled_content = ""
        print(f"❌ Error crawling website: {str(e)}")

In [None]:
# Google Calendar ingestion: authenticate with the uploaded OAuth client
# secrets, fetch upcoming events and render them as plain text for RAG.
from datetime import timezone  # needed for a timezone-aware "now"

calendar_content = ""
calendar_connected = False

if calendar_connect.value and calendar_credentials.filepath is not None:
    calendar_status.value = "Authenticating with Google Calendar..."

    try:
        # Keep a copy of the uploaded client secrets next to the token cache.
        creds_path = os.path.join(path_to_calendar_dir, 'credentials.json')
        with open(calendar_credentials.filepath, 'rb') as src_file, \
                open(creds_path, 'wb') as dst_file:
            dst_file.write(src_file.read())

        creds = None
        token_path = os.path.join(path_to_calendar_dir, 'token.pickle')

        if os.path.exists(token_path):
            # SECURITY NOTE: pickle.load executes arbitrary code from the
            # file. This is only acceptable while token.pickle is written
            # exclusively by this notebook; never load a token file from an
            # untrusted source.
            with open(token_path, 'rb') as token:
                creds = pickle.load(token)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(creds_path, SCOPES)
                creds = flow.run_local_server(port=0)

            # Cache the (new or refreshed) credentials for the next run.
            with open(token_path, 'wb') as token:
                pickle.dump(creds, token)

        service = build('calendar', 'v3', credentials=creds)

        # Take a single timezone-aware timestamp so both ends of the window
        # derive from the same instant; the API expects RFC 3339 ("...Z").
        window_start = datetime.now(timezone.utc)
        window_end = window_start + timedelta(days=days_to_fetch.value)
        now = window_start.isoformat().replace('+00:00', 'Z')
        end_date = window_end.isoformat().replace('+00:00', 'Z')

        events_result = service.events().list(
            calendarId='primary',
            timeMin=now,
            timeMax=end_date,
            maxResults=50,
            singleEvents=True,
            orderBy='startTime'
        ).execute()

        events = events_result.get('items', [])

        if not events:
            calendar_content = "No upcoming events found."
        else:
            event_lines = [f"UPCOMING CALENDAR EVENTS (Next {days_to_fetch.value} days):\n\n"]
            for event in events:
                # Timed events carry 'dateTime'; all-day events only 'date'.
                start = event['start'].get('dateTime', event['start'].get('date'))
                end = event['end'].get('dateTime', event['end'].get('date'))
                summary = event.get('summary', 'No title')
                description = event.get('description', 'No description')

                event_lines.append(f"Event: {summary}\n")
                event_lines.append(f"Time: {start} to {end}\n")
                event_lines.append(f"Description: {description}\n")
                event_lines.append("─" * 50 + "\n")
            calendar_content = "".join(event_lines)

        calendar_connected = True
        calendar_status.value = f"✅ Connected - {len(events)} events loaded"

    except Exception as e:
        # Surface the failure in the status widget instead of leaving it at
        # "Authenticating..." and aborting the notebook run.
        calendar_content = ""
        calendar_connected = False
        calendar_status.value = f"❌ Calendar error: {e}"
        print(f"❌ Error loading calendar: {e}")

elif calendar_connect.value:
    # The checkbox is ticked but no credentials file has been uploaded yet.
    calendar_status.value = "❌ Please upload credentials.json file"

In [None]:
# Build the question-answering chain from whichever data source is available.
llm_object = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

# Source selection, in priority order: uploaded file, calendar, crawled web
# pages. The defaults below describe the "no context" case.
current_retriever = None
context_source = "general knowledge"

if file_upload.filepath is not None:
    # The retriever was already built when the upload was processed.
    current_retriever, context_source = retriever_object, "uploaded file"

elif calendar_connected and calendar_content:
    # Index the rendered calendar text on the fly.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    calendar_texts = text_splitter.split_text(calendar_content)
    calendar_documents = [
        Document(page_content=chunk_text) for chunk_text in calendar_texts
    ]

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    vector_store = FAISS.from_documents(calendar_documents, embeddings)
    vector_store.save_local(path_to_faiss_dir)

    current_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={"k": search_result_choices.value},
    )
    context_source = "calendar data"

elif url_input.value and url_input.value.strip() and crawled_content:
    # The retriever was already built when the crawl was processed.
    current_retriever, context_source = retriever_object, "web data"

# With a retriever: "stuff" retrieval chain using the custom context prompt.
# Without one: pipe a bare question prompt straight into the model.
if current_retriever is not None:
    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"],
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm_object,
        chain_type="stuff",
        retriever=current_retriever,
        chain_type_kwargs={"prompt": PROMPT},
        return_source_documents=show_sources.value,
    )
else:
    normal_prompt = PromptTemplate(
        input_variables=["question"],
        template="Answer the following question: {question}",
    )
    qa_chain = normal_prompt | llm_object

In [None]:
# Render the chat: announce the extra data sources that were loaded, answer
# the user's question (if one was entered) and display the transcript.
if url_input.value and url_input.value.strip() and crawled_content:
    chat_messages.append(f"🌐 Using content crawled from: {url_input.value}")
    # Count the crawled pages directly; splitting the text on the page
    # header always yields one element too many.
    chat_messages.append(f"📄 Crawled {len(pages_content)} pages")

if calendar_connected and calendar_content:
    # Use the fetched event list rather than counting "Event:" substrings,
    # which miscounts when an event's own text contains that marker.
    chat_messages.append(f"📅 Using calendar data with {len(events)} events")

# Only call the model when a question was actually entered; the text widget
# starts out empty (None or '').
if question.value and question.value.strip():
    if current_retriever is None:
        # Prompt-only chain: expects "question" and returns a chat message.
        normal_no_context_result = qa_chain.invoke({"question": question.value})
        llm_result = {'result': normal_no_context_result.content}
    else:
        # Retrieval chain: expects "query" and returns a dict with 'result'
        # (plus 'source_documents' when sources were requested).
        llm_result = qa_chain.invoke({"query": question.value})

    if show_sources.value:
        source_documents = llm_result.get('source_documents', [])
        source_parts = [f"\n📚 SOURCES ({len(source_documents)} documents):"]
        for i, doc in enumerate(source_documents):
            source_parts.append(f"\n\nSource {i+1}:")
            source_parts.append(f"\nContent: {doc.page_content}...")
            if hasattr(doc, 'metadata') and doc.metadata:
                source_parts.append(f"\nMetadata: {doc.metadata}")
        sources_result_string = "".join(source_parts)
    else:
        sources_result_string = ''

    chat_messages.append(question.value)
    chat_messages.append(llm_result['result'] + sources_result_string)

mr.Chat(chat_messages)