In [13]:
## Needed only when run locally --- update using the path to your project folder
import os
import sys
from pathlib import Path

# Update path HERE - don't forget the r at the beginning (raw string keeps the backslashes literal).
# The hard-coded value is only a default: set the BOE_PROJECT_PATH environment variable
# to point at the project folder on another machine without editing this cell.
project_path = Path(
    os.environ.get(
        "BOE_PROJECT_PATH",
        r"C:\Users\genna\Documents\Cam-Course\BoE_RAGS\Bank_of_England_NLP",
    )
)

# Relative data paths used in later cells are resolved against this directory.
# os.chdir raises if the folder does not exist, which is the desired loud failure.
os.chdir(project_path)

# Re-running the cell must not stack duplicate entries on sys.path.
if str(project_path) not in sys.path:
    sys.path.insert(0, str(project_path))

print(f"Current directory: {os.getcwd()}")

Current directory: C:\Users\genna\Documents\Cam-Course\BoE_RAGS\Bank_of_England_NLP


In [14]:
# Cell 1: Imports & Basic Setup
import json
import numpy as np
import faiss
from typing import List, Dict, Any
import openai
from dotenv import load_dotenv

In [15]:

# CELL 2: Set up OpenAI credentials and caching

# Load environment variables (python-dotenv reads a .env file into the process environment)
load_dotenv()

# Initialize OpenAI client
# NOTE(review): `client` is not referenced by the other cells visible in this notebook;
# the embedding/chat calls below go through the module-level `openai.*` helpers instead.
client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Caching Embeddings for Stability
# NOTE(review): this in-memory dict is not read or written by the visible cells;
# get_embedding() caches on disk under CACHE_DIR instead.
embedding_cache = {}

# NOTE(review): `os` and `openai` are already imported in earlier cells; these
# mid-cell imports would normally live in the notebook's single import cell.
import hashlib
import os
import pickle
import openai

# ✅ Caching directory
# Relative path: the folder is created under the current working directory.
CACHE_DIR = "embedding_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

def get_embedding(text: str) -> np.ndarray:
    """
    Return the unit-normalised embedding of `text`, using an on-disk cache.

    The cache key is the SHA-256 hex digest of the UTF-8 encoded text, and each
    entry is stored as ``<CACHE_DIR>/<digest>.pkl``. On a cache miss the text is
    embedded with OpenAI's ``text-embedding-ada-002`` model, L2-normalised and
    written to the cache.

    Args:
        text: The string to embed.

    Returns:
        A 1-D ``float32`` NumPy array (normalised to unit length).

    Raises:
        Any exception raised by the OpenAI client on a cache miss, and OSError
        if the cache file cannot be written.
    """
    # Convert text into a hashable key using SHA256
    text_key = hashlib.sha256(text.encode()).hexdigest()
    cache_path = os.path.join(CACHE_DIR, f"{text_key}.pkl")

    # Check if embedding exists in cache
    if os.path.exists(cache_path):
        try:
            # SECURITY NOTE: pickle.load can execute arbitrary code. This is only
            # acceptable while CACHE_DIR contains files written by this function.
            with open(cache_path, "rb") as f:
                return pickle.load(f)
        except (EOFError, pickle.UnpicklingError):
            # A truncated/corrupt entry (e.g. an interrupted earlier write) would
            # otherwise break every later call for this text; treat it as a miss.
            print(f"Ignoring unreadable cache entry: {cache_path}")

    # Updated OpenAI API call (v1.0+ format)
    response = openai.embeddings.create(
        model="text-embedding-ada-002",
        input=[text]  # Must be a list of strings
    )

    # Convert to NumPy array and normalize
    embedding = np.array(response.data[0].embedding, dtype=np.float32)
    norm = np.linalg.norm(embedding)
    if norm > 0:  # avoid turning a degenerate all-zero vector into NaNs
        embedding = embedding / norm

    # Save to cache atomically: write a temp file, then rename it into place so
    # a crash mid-write never leaves a half-written .pkl behind.
    tmp_path = f"{cache_path}.{os.getpid()}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(embedding, f)
    os.replace(tmp_path, cache_path)

    return embedding




In [16]:
# Cell 3: Load Q&A Documents
def load_qna_documents(folder_path: str) -> List[Dict[str, Any]]:
    """
    Load every ``*.json`` file in `folder_path` and flatten its Q&A items.

    Each file is expected to contain a top-level list whose first element is the
    list of Q&A objects. Files are processed in sorted filename order so that
    the position of each document (and therefore its id in a downstream index)
    is reproducible across runs and platforms.

    Args:
        folder_path: Directory containing the JSON files.

    Returns:
        A list of dicts, one per Q&A item, with keys:
            'text': the question/answer text ("" when missing)
            'metadata': dict with Type, Person, Role, Affiliation, Page,
                        Filename, Topics (list of topic names) and
                        Sentiment (first sentiment label, or None)
    """
    all_docs = []

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    json_filenames = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".json")
    )

    for filename in json_filenames:
        full_path = os.path.join(folder_path, filename)
        with open(full_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # An empty top-level container has no Q&A list to read.
        if not data:
            print(f"Skipping {filename}: no Q&A entries found")
            continue

        first_list = data[0]

        for qa_obj in first_list:
            txt = qa_obj.get("Text", "")

            # Each topic is a single-key dict; keep the key and ignore empty dicts.
            # `or []` also tolerates an explicit null in the JSON.
            topics_list = [
                next(iter(topic)) for topic in (qa_obj.get("Topics") or []) if topic
            ]

            # Sentiment is a dict keyed by label; take the first label if any.
            sentiment = qa_obj.get("Sentiment") or {}
            sentiment_key = next(iter(sentiment)) if sentiment else None

            meta = {
                "Type": qa_obj.get("Type", ""),
                "Person": qa_obj.get("Person", ""),
                "Role": qa_obj.get("Role", ""),
                "Affiliation": qa_obj.get("Affiliation", ""),
                "Page": qa_obj.get("Page", []),
                "Filename": filename,
                "Topics": topics_list,
                "Sentiment": sentiment_key
            }

            all_docs.append({"text": txt, "metadata": meta})

    print(f"Loaded {len(all_docs)} Q/A segments from: {folder_path}")
    return all_docs

# Set folder path
# Relative path: it is resolved against the current working directory, which the
# first cell sets with os.chdir(project_path).
folder_path = "1_data_and_preprocess/1.0_raw/processed_QnA_correct_pages"
# Flattened list of {'text', 'metadata'} dicts, one per Q/A segment.
docs = load_qna_documents(folder_path)


Loaded 212 Q/A segments from: 1_data_and_preprocess/1.0_raw/processed_QnA_correct_pages


In [17]:
# Cell 4: FAISS Index Class

class FaissIndex:
    """
    Pairs a flat (exact-search) FAISS L2 index with the documents it indexes.

    Row ``i`` of the index corresponds to ``self.documents[i]``; every method
    keeps the two in step. Values returned by ``search`` are FAISS L2 distances
    (IndexFlatL2 reports squared Euclidean distance), so SMALLER means closer.
    """

    def __init__(self, embedding_dim=1536):
        """
        Initializes an empty FAISS index.

        Args:
            embedding_dim: Dimensionality of the vectors that will be added.
        """
        self.embedding_dim = embedding_dim
        self.index = faiss.IndexFlatL2(self.embedding_dim)  # exact L2 search
        self.documents = []  # Store document references (aligned with index rows)
        self.embeddings = None  # All added vectors, kept so the index can be rebuilt

    def add_embeddings(self, embeddings: np.ndarray, docs: List[Dict]):
        """
        Add document embeddings + references to FAISS.

        Args:
            embeddings: Array of shape (n_docs, embedding_dim); a single 1-D
                vector is accepted and treated as one row.
            docs: The ``n_docs`` documents, in the same order as the rows.

        Raises:
            ValueError: If the index is untrained, the number of rows does not
                match ``len(docs)``, or the vector size is not ``embedding_dim``.
        """
        if not self.index.is_trained:
            raise ValueError("Index is not trained. IndexFlatL2 should be trained by default.")

        # FAISS requires C-contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        if embeddings.ndim == 1:
            embeddings = embeddings.reshape(1, -1)

        # A row/document mismatch would silently misalign search results.
        if embeddings.shape[0] != len(docs):
            raise ValueError(
                f"Got {embeddings.shape[0]} embeddings for {len(docs)} documents."
            )
        if embeddings.shape[1] != self.embedding_dim:
            raise ValueError(
                f"Expected embeddings of dimension {self.embedding_dim} but got {embeddings.shape[1]}."
            )

        self.index.add(embeddings)
        self.documents.extend(docs)

        if self.embeddings is None:
            self.embeddings = embeddings
        else:
            self.embeddings = np.vstack([self.embeddings, embeddings])

    def rebuild_index(self):
        """
        Rebuild FAISS index from the stored embeddings to ensure consistency after updates.
        """
        self.index = faiss.IndexFlatL2(self.embedding_dim)
        # Nothing to re-add when no embeddings were ever stored.
        if self.embeddings is not None:
            self.index.add(self.embeddings)

    def search(self, query_embedding: np.ndarray, top_k=3):
        """
        Return up to `top_k` nearest documents for a single query vector.

        Args:
            query_embedding: 1-D array of length ``embedding_dim``.
            top_k: Maximum number of neighbours to return.

        Returns:
            A list of ``(doc, distance)`` tuples ordered as FAISS returns them
            (nearest first). The list is shorter than `top_k` when the index
            holds fewer vectors.

        Raises:
            TypeError: If `query_embedding` is not a NumPy array.
        """
        if not isinstance(query_embedding, np.ndarray):
            raise TypeError(f"Expected numpy.ndarray but got {type(query_embedding)}")

        query_embedding_2d = np.ascontiguousarray(
            np.expand_dims(query_embedding, axis=0), dtype=np.float32
        )
        scores, indices = self.index.search(query_embedding_2d, top_k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            # FAISS pads missing neighbours with label -1; indexing the document
            # list with it would wrongly return the LAST document.
            if idx < 0:
                continue
            results.append((self.documents[int(idx)], float(score)))

        return results



In [18]:
# Cell 5: RAG Pipeline with dynamic top_k

class RAGPipeline:
    """
    Minimal retrieval-augmented generation pipeline.

    Documents are embedded (metadata + text) with OpenAI embeddings, stored in a
    FaissIndex, retrieved by nearest-neighbour search for a query, and passed as
    context to an OpenAI chat model.
    """

    def __init__(self, openai_api_key: str, embedding_dim=1536, top_k=3):
        """
        openai_api_key: Your OpenAI API key
        embedding_dim: Must match the embedding model size (1536 for ada-002)
        top_k: Number of top results to retrieve (default = 3)
        """
        # Sets the key on the `openai` module, which the module-level API calls below use.
        openai.api_key = openai_api_key
        self.embedding_dim = embedding_dim
        self.top_k = top_k  # ✅ Store top_k as an instance variable
        self.index = FaissIndex(embedding_dim=embedding_dim)

    @staticmethod
    def _meta_or_default(metadata: Dict[str, Any], key: str, default: str):
        """Return metadata[key], or `default` when the key is missing, None or ''."""
        value = metadata.get(key)
        return default if value is None or value == "" else value

    def preprocess_document(self, doc):
        """
        Combine metadata and text into a single searchable string.
        This allows FAISS to retrieve results based on metadata and text together.

        Missing, None or empty metadata values fall back to 'Unknown'
        ('0.0' for Sentiment), so the embedded text never contains
        placeholders such as "Sentiment: None".
        """
        metadata = doc['metadata']
        topics = metadata.get('Topics') or []

        metadata_text = f"Person: {self._meta_or_default(metadata, 'Person', 'Unknown')}. " \
                        f"Role: {self._meta_or_default(metadata, 'Role', 'Unknown')}. " \
                        f"Affiliation: {self._meta_or_default(metadata, 'Affiliation', 'Unknown')}. " \
                        f"Topics: {', '.join(str(topic) for topic in topics)}. " \
                        f"Sentiment: {self._meta_or_default(metadata, 'Sentiment', '0.0')}."

        return f"{metadata_text} {doc['text']}"  # Append metadata to text

    def embed_text(self, text: str) -> np.ndarray:
        """
        Get 1536-dim embedding using OpenAI's latest embedding API.

        Returns:
            A float32 vector. On ANY API error the error is printed and an
            all-zero vector of length ``embedding_dim`` is returned instead;
            callers treat an all-zero vector as "embedding failed".
        """
        try:
            response = openai.embeddings.create(
                model="text-embedding-ada-002",
                input=[text]  # Input must be a list of strings
            )
            embedding = response.data[0].embedding  # Access the embedding array
            return np.array(embedding, dtype=np.float32)
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return np.zeros(self.embedding_dim, dtype=np.float32)  # Return zero-vector in case of error

    def build_index(self, all_docs: List[Dict[str, Any]]):
        """
        1) Embed each doc (including metadata)
        2) Store embeddings + docs in FAISS

        The input dicts are not modified: each indexed document is a shallow
        copy with an extra 'processed_text' key. Documents whose embedding
        failed (all-zero vector) are reported and left out of the index.
        """
        embeddings_list = []
        processed_docs = []  # Store processed documents

        for doc in all_docs:
            processed_text = self.preprocess_document(doc)
            emb = self.embed_text(processed_text)  # Create embedding for processed text

            # embed_text signals failure with an all-zero vector; indexing it
            # would add a meaningless point that can still be "retrieved".
            if not np.any(emb):
                print("Skipping document whose embedding could not be generated.")
                continue

            embeddings_list.append(emb)
            # Copy instead of mutating the caller's dict; keep the processed
            # text alongside for debugging or retrieval.
            processed_docs.append({**doc, "processed_text": processed_text})

        # np.vstack raises on an empty list, so only add when there is something to add.
        if embeddings_list:
            embeddings_array = np.vstack(embeddings_list)
            self.index.add_embeddings(embeddings_array, processed_docs)
        print(f"Index built with {len(processed_docs)} documents.")

    def retrieve_topk(self, query: str, top_k=None):
        """
        1) Embed the user query
        2) Find top_k docs

        Returns:
            List of ``(doc, distance)`` tuples from the index, or an empty list
            when the query could not be embedded.
        """
        if top_k is None:
            top_k = self.top_k  # Use default top_k if not provided

        # ✅ Ensure query is embedded before search
        query_emb = self.embed_text(query)

        if not isinstance(query_emb, np.ndarray):
            raise TypeError(f"Expected numpy.ndarray for query embedding but got {type(query_emb)}")

        # An all-zero vector means embed_text failed; searching with it would
        # return arbitrary documents presented as relevant.
        if not np.any(query_emb):
            print("Query embedding failed; returning no results.")
            return []

        results = self.index.search(query_emb, top_k=top_k)
        return results

    def _chat_completion(self, system_prompt: str, user_prompt: str) -> str:
        """
        Helper to call OpenAI ChatCompletion.

        Returns the stripped message text, or a fixed apology string if the
        call (or reading its result) raises.
        """
        try:
            response = openai.chat.completions.create(
                model="gpt-4o-mini",  # or "gpt-4o"
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.0,
                max_tokens=4000
            )
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error in ChatCompletion: {e}")
            return "I'm sorry, I couldn't process your request."

    def answer_query(self, query: str, top_k=None) -> Dict[str, Any]:
        """
        1) Retrieve top_k doc chunks
        2) Feed them into ChatCompletion
        3) Return the final answer

        Returns:
            Dict with 'query', 'top_docs' (the retrieved (doc, distance) pairs),
            'combined_context' (the text sent as context) and 'answer'.
        """
        retrieved = self.retrieve_topk(query, top_k=top_k)

        # Build a context from top_k docs
        context_snippets = []
        for (doc, score) in retrieved:
            snippet = f"Text: {doc['text']}\nMetadata: {doc['metadata']}\nScore: {score}\n---\n"
            context_snippets.append(snippet)

        combined_context = "\n".join(context_snippets)

        # Construct final prompt. Adjacent string literals are concatenated
        # verbatim, so each sentence needs its own trailing space.
        system_prompt = (
            "You are a financial analyst helping the PRA, Bank of England. "
            "Your role is to provide accurate, concise, and well-supported insights. "
            "If the context does not contain enough information, state that explicitly."
        )
        user_prompt = (
            f"CONTEXT:\n{combined_context}\n\n"
            f"USER QUESTION: {query}\n\n"
            "Please give me a concise answer using the context above:"
        )

        final_answer = self._chat_completion(system_prompt, user_prompt)

        return {
            "query": query,
            "top_docs": retrieved,
            "combined_context": combined_context,
            "answer": final_answer
        }



In [19]:
# Putting It All Together (Demo)

# 1) Your folder path containing JSON QnA
# (same relative path as the loading cell above; resolved against the working directory)
folder_path = "1_data_and_preprocess/1.0_raw/processed_QnA_correct_pages"

# 2) Load data
# NOTE(review): this repeats the load already done in the loading cell above.
docs = load_qna_documents(folder_path)
print(f"Loaded {len(docs)} documents.")

# 3) Create pipeline (uses OpenAI key)
# top_k=5 is the pipeline default; the query cells below pass their own top_k.
rag = RAGPipeline(openai_api_key=os.getenv("OPENAI_API_KEY"), embedding_dim=1536, top_k=5)

# 4) Build FAISS index
# Embeds metadata + text for every document through the OpenAI API (one call per document).
rag.build_index(docs)

# Initialize FAISS Index
# NOTE(review): this is a second, standalone index, separate from rag.index.
# The query cells visible below use `rag`, not `faiss_index`.
faiss_index = FaissIndex()

# Generate and add embeddings to FAISS
# Uses get_embedding(), i.e. the raw text only (no metadata), normalised and cached on disk.
document_embeddings = np.array([get_embedding(doc["text"]) for doc in docs])
faiss_index.add_embeddings(document_embeddings, docs)




Loaded 212 Q/A segments from: 1_data_and_preprocess/1.0_raw/processed_QnA_correct_pages
Loaded 212 documents.
Index built with 212 documents.


In [21]:
import pandas as pd
from io import StringIO
import IPython.display as display

# Define user query
user_query = """
Identify discussions by bank executives related to financial stability, capital adequacy, or liquidity. Specifically, look for mentions of key risk terms, including: 'liquidity crunch,' 'funding stress,' 'capital shortfall,' 'Common Equity Tier 1 (CET1) capital,' 'regulatory capital buffer,' 'countercyclical capital buffer,' 'credit risk exposure,' 'loan loss provisions,' 'sovereign risk,' 'interest rate risk,' 'economic downturn,' 'recession impact,' 'market volatility,' 'wholesale funding risk,' 'counterparty risk,' 'systemic risk,' 'stress test results,' or 'macroprudential risk.' 

Return results in **CSV format** with the following columns:
1. **Bank Name** (Extracted from the filename)
2. **Time Period** (Extracted from the filename)
3. **Page Numbers** (Comma-separated if multiple)
4. **Specific Risk Terms Mentioned** (Comma-separated)
5. **Text** (Extracted discussion)
6. **Summary of the Discussion** (Concise summary of risk discussion)

Ensure that the response **strictly follows CSV formatting** with a header row and values properly enclosed in double quotes if necessary.
"""

# Get response from RAG model
response = rag.answer_query(user_query, top_k=10)

# Extract the CSV content
file_content = response["answer"]

# Preprocess CSV: remove Markdown code-fence artifacts.
# The model may wrap the CSV in ```csv ... ```, use a bare ``` fence, or add a
# sentence before/after the fence. Keep only what lies between the opening
# fence line and the last closing fence; unfenced answers are left untouched.
fence_start = file_content.find("```")
if fence_start != -1:
    fenced_body = file_content[fence_start:].partition("\n")[2]  # drop the opening fence line
    file_content = fenced_body.rsplit("```", 1)[0].strip()       # drop the closing fence and any trailer

# Convert CSV string to Pandas DataFrame
df = pd.read_csv(StringIO(file_content))

# Display DataFrame nicely in Jupyter Notebook
display.display(df)


Unnamed: 0,Bank Name,Time Period,Page Numbers,Specific Risk Terms Mentioned,Text,Summary of the Discussion
0,JPMorgan Chase & Co.,Q2 2024,"5, 6","stress capital buffer, capital management","Yeah. So, I'm not going to comment about any c...",Discussion on the volatility and challenges of...
1,Deutsche Bank AG,Q3 2024,"19, 20","capital plan, operational risk","Thanks, Piers. No, not much in the way of new ...",Discussion on capital estimates and operationa...
2,JPMorgan Chase,Q4 2024,11,"credit quality, economic risks",I would just point the biggest driver of credi...,Discussion on the impact of unemployment on cr...


In [23]:
import pandas as pd
from io import StringIO
import IPython.display as display

# Define user query
user_query = """
Identify discussions by bank executives that would be most interesting to the PRA, Bank of England. 

Return results in **CSV format** with the following columns:
1. **Bank Name** (Extracted from the filename)
2. **Time Period** (Extracted from the filename)
3. **Page Numbers** (Comma-separated if multiple)
4. **Specific Risk Terms Mentioned** (Comma-separated)
5. **Text** (Extracted discussion)
6. **Summary of the Discussion** (Concise summary of risk discussion)

Ensure that the response **strictly follows CSV formatting** with a header row and values properly enclosed in double quotes if necessary.
"""

# Get response from RAG model
response = rag.answer_query(user_query, top_k=10)

# Extract the CSV content
file_content = response["answer"]

# Preprocess CSV: remove Markdown code-fence artifacts.
# The model may wrap the CSV in ```csv ... ```, use a bare ``` fence, or add a
# sentence before/after the fence. Keep only what lies between the opening
# fence line and the last closing fence; unfenced answers are left untouched.
fence_start = file_content.find("```")
if fence_start != -1:
    fenced_body = file_content[fence_start:].partition("\n")[2]  # drop the opening fence line
    file_content = fenced_body.rsplit("```", 1)[0].strip()       # drop the closing fence and any trailer

# Convert CSV string to Pandas DataFrame
df = pd.read_csv(StringIO(file_content))

# Display DataFrame nicely in Jupyter Notebook
display.display(df)


Unnamed: 0,Bank Name,Time Period,Page Numbers,Specific Risk Terms Mentioned,Text,Summary of the Discussion
0,Deutsche Bank AG,Q3-2024,"19, 20","Basel IV Guidance, Leverage Finance Reviews","Thanks, Piers. No, not much in the way of new ...",Discussion on Basel IV guidance and leverage f...
1,Deutsche Bank AG,Q3-2024,20,Leverage Finance Reviews,"Well, we should only say that we are still wai...",Discussion on the status of leverage finance r...
2,JPMorgan Chase & Co.,Q3-2024,11,"Regulatory Justification, Economic Impact",We just want the numbers to be done right and ...,Emphasis on the need for careful regulatory ad...
