## `Imports & Setup`

#### External Libraries

In [12]:
import requests
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import json
import torch

#### Config Variables

In [13]:
# "full", "first_half", "second_half"
row_range = "full"

# Number of best-matching paragraphs handed to the model for each policy
find_top_k = 4

# Device name kept as a plain string: "cuda" when a GPU is usable, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

### Functions

In [18]:
def split_text_into_paragraphs(text, chunk_size=3, merge_headings=True):
    """Split policy text into chunks of up to ``chunk_size`` sentences.

    The text is first repaired (glued words, stray replacement characters),
    then cut into sections at heading-like lines, and each section is split
    into sentences that are grouped ``chunk_size`` at a time.

    Line breaks are preserved until all structure-dependent steps (heading
    split, bullet conversion) have run; whitespace is only flattened in the
    chunks that are returned. Flattening everything up front would remove the
    newlines those steps look for and turn them into no-ops.

    Args:
        text: Plain text of the policy document.
        chunk_size: Maximum number of sentences per returned chunk.
        merge_headings: When true, a section that yields at most one sentence
            is appended to the previous chunk instead of forming its own.

    Returns:
        A list of single-line strings, each holding up to ``chunk_size``
        sentences.
    """

    def _flatten(fragment):
        # Collapse every whitespace run (including newlines) to one space.
        return re.sub(r'\s+', ' ', fragment).strip()

    # Fix encoding and glued terms (e.g. BenchmarkRate → Benchmark Rate)
    text = text.replace("�", " ").replace("•", "*")
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)   # aB → a B
    text = re.sub(r'(?<=\d)(?=[A-Z])', ' ', text)      # 25Years → 25 Years
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)   # abc123 → abc 123
    # The former "inferred periods" substitution was dropped: it needed a
    # lowercase letter directly followed by an uppercase one, and the first
    # substitution above has already separated every such pair.

    # Normalize spacing while keeping single line breaks for the steps below
    text = re.sub(r'[^\S\n]+', ' ', text)         # collapse spaces/tabs, not newlines
    text = re.sub(r' ?\n ?', '\n', text)          # trim blanks around line breaks
    text = re.sub(r'\n{2,}', '\n', text).strip()  # collapse repeated line breaks

    # Split at likely section headings: a line break followed by a short line
    # (4-61 characters) that starts with a capital letter
    sections = re.split(r'\n(?=[A-Z][^\n]{3,60}\n)', text)

    tokenizer = PunktSentenceTokenizer()
    chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # Convert bullet-style lines into full sentences
        section = re.sub(r"\n\s*\*\s*", ". ", section)
        section = re.sub(r"\*\s*", "", section)

        # Break before heading-like phrases
        section = re.sub(r'(?<=\. )([A-Z][^\n]{3,60})(?= )', r'\n\1', section)

        section = section.strip()
        if not section:
            # Nothing but bullet markers: avoid appending an empty fragment.
            continue

        sentences = [_flatten(sentence) for sentence in tokenizer.tokenize(section)]
        sentences = [sentence for sentence in sentences if sentence]

        # Attach short heading-only lines to previous chunk
        if merge_headings and len(sentences) <= 1 and chunks:
            chunks[-1] += " " + _flatten(section)
            continue

        # Group into N-sentence chunks
        for start in range(0, len(sentences), chunk_size):
            chunks.append(" ".join(sentences[start:start + chunk_size]))

    return chunks
# Tokenizer/model pairs already loaded onto the GPU, keyed by Hugging Face model name.
_HF_ENCODER_CACHE = {}


def _load_hf_encoder(name="intfloat/e5-base"):
    """Return the (tokenizer, model) pair for ``name``, loading it only once."""
    if name not in _HF_ENCODER_CACHE:
        hf_tokenizer = AutoTokenizer.from_pretrained(name)
        hf_model = AutoModel.from_pretrained(name).to("cuda")
        _HF_ENCODER_CACHE[name] = (hf_tokenizer, hf_model)
    return _HF_ENCODER_CACHE[name]


def get_embedding(text, model="nomic-embed-text"):
    """Return one embedding vector (a list of floats) for ``text``.

    When the module-level ``device`` is "cuda", the text is encoded locally
    with the "intfloat/e5-base" model and the hidden state of the first token
    is returned; ``model`` is not used on this path. Otherwise the text is
    sent to the Ollama embed endpoint on localhost using ``model``.

    Args:
        text: A string, or a list of strings. Only the embedding of the first
            entry is returned in either case.
        model: Name of the Ollama embedding model (HTTP path only).

    Returns:
        The embedding as a list of floats.

    Raises:
        requests.RequestException: If the HTTP request fails, times out or
            returns an error status (HTTP path only).
    """
    if device == "cuda":
        # Loading the weights on every call was the dominant cost of this
        # function; the pair is now loaded once and reused.
        hf_tokenizer, hf_model = _load_hf_encoder()

        batch = [text] if isinstance(text, str) else text

        inputs = hf_tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
        with torch.no_grad():
            outputs = hf_model(**inputs)

        # First-token pooling, as before.
        # NOTE(review): the e5 model card appears to recommend average pooling
        # with "query: " / "passage: " prefixes — confirm before changing.
        first_token_vectors = outputs.last_hidden_state[:, 0, :]
        return first_token_vectors[0].cpu().tolist()

    url = "http://localhost:11434/api/embed"
    payload = {
        "model": model,
        "input": text
    }
    # A timeout keeps a stalled server from hanging the notebook forever, and
    # raise_for_status surfaces HTTP errors instead of a later KeyError.
    response = requests.post(url, json=payload, timeout=120)
    response.raise_for_status()
    return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (any sequence of numbers or a NumPy array).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the two norms. If either
        vector has zero norm the similarity is undefined; 0.0 is returned so
        that a NaN does not end up in the scores that are later sorted.
    """
    first = np.asarray(vec1, dtype=float)
    second = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(first) * np.linalg.norm(second)
    if norm_product == 0:
        return 0.0
    return np.dot(first, second) / norm_product
def _search_terms_for_policy(policy_of_interest):
    """Look up the search-term synonyms that belong to one policy name.

    Reads the module-level ``policy_of_interests`` and ``search_terms_synonyms``
    lists, which are index-aligned. The first entry with a matching name is
    used; an empty string is returned when no entry can be found or the value
    is missing (NaN).
    """
    try:
        terms = search_terms_synonyms
        if isinstance(terms, str):
            # Already a single value rather than the per-policy list.
            return terms
        matched = terms[policy_of_interests.index(policy_of_interest)]
    except (NameError, ValueError, IndexError):
        return ""
    # NaN is the only float that differs from itself.
    if isinstance(matched, float) and matched != matched:
        return ""
    return matched


def generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings, search_terms=None):
    """Build the extraction prompt for one policy.

    Every paragraph embedding is scored against the policy embedding with
    ``cosine_similarity``; the ``find_top_k`` highest-scoring paragraphs (taken
    from the module-level ``paragraphs`` list, index-aligned with
    ``paragraph_embeddings``) are embedded in the prompt as the document
    content.

    Args:
        policy_of_interest: Name of the policy to look for.
        policy_of_interest_embedding: Embedding vector of that policy name.
        paragraph_embeddings: One embedding per entry of ``paragraphs``.
        search_terms: Synonyms for this policy. When omitted they are looked
            up by policy name, so that the prompt carries only this policy's
            synonyms rather than the list for every policy.

    Returns:
        The prompt text.
    """
    if search_terms is None:
        search_terms = _search_terms_for_policy(policy_of_interest)

    scored_paragraphs = [
        (paragraphs[index], cosine_similarity(policy_of_interest_embedding, embedding))
        for index, embedding in enumerate(paragraph_embeddings)
    ]
    # Highest similarity first; the sort is stable, so ties keep document order.
    scored_paragraphs.sort(key=lambda pair: pair[1], reverse=True)
    combined_paragraphs = "\n".join(
        paragraph for paragraph, _score in scored_paragraphs[:find_top_k]
    )

    prompt = f"""
        You are to extract form-related policies from the attached 'Policies_Meridian.docx'. Focus on identifying both **explicit mentions** and **indirect references** (e.g., policies embedded in documentation procedures or described without using exact policy names).

        The 'Policies_Meridian.docx' file content is provided below:

        {combined_paragraphs}

        The policy of interest is "{policy_of_interest}" with Search Terms Synonyms "{search_terms}" for targeting the location of that policy in the doc. 
        Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures. 
        This is a very important policy detail document, and many people's decisions will depend on the accuracy of your response.

        Output the result in the following format:

        {{
            "Y/N/M": "Y",
            "POLICY DETAILS": "Meridian requires signed consent from all applicants prior to underwriting, but does not provide consent forms to brokers."
        }}

        For each item, say:
        - "Y" if it is clearly mentioned, which means the policy is explicitly stated in the text without any ambiguity.
        - "M" if it is mentioned indirectly, which means the policy is implied or embedded in procedures, but not explicitly stated.
        - "N" if not found, which means the policy is not mentioned at all.

        - "POLICY DETAILS" should contain the specific details of the policy as mentioned in the text. If the policy is not mentioned, leave it blank.
    """.strip()

    return prompt
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    """Ask the local Ollama server to extract one policy and return it as JSON text.

    The prompt is sent to the generate endpoint with a JSON schema that
    requires the keys "Y/N/M" (one of "Y", "M", "N") and "POLICY DETAILS".
    The model's answer is parsed, tagged with "POLICY NAME" and serialised
    again.

    Args:
        prompt: Full prompt text for the model.
        policy_of_interest: Policy name stored under "POLICY NAME" in the result.

    Returns:
        A JSON string with the keys "Y/N/M", "POLICY DETAILS" and "POLICY NAME".

    Raises:
        requests.RequestException: If the request fails, times out or returns
            an error status.
        KeyError: If the server reply has no "response" field.
        json.JSONDecodeError: If the model output is not valid JSON.
        TypeError: If the model output is valid JSON but not an object.
    """
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "model": "deepseek-r1:8b",
        "prompt": prompt,
        "stream": False,
        "format": {
            "type": "object",
            "properties": {
                "Y/N/M": {
                    "type": "string",
                    "enum": ["Y", "M", "N"]
                },
                "POLICY DETAILS": {
                    "type": "string"
                }
            },
            "required": ["Y/N/M", "POLICY DETAILS"]
        }
    }

    # (connect, read) timeouts: generation can be slow, so the read limit is
    # generous, but a dead server no longer blocks the notebook indefinitely.
    response = requests.post(url, headers=headers, json=data, timeout=(10, 900))
    response.raise_for_status()
    reply = response.json()

    if "response" not in reply:
        # Include whatever the server sent (e.g. its "error" text) in the message.
        raise KeyError(f"Ollama reply has no 'response' field: {reply}")

    extracted = json.loads(reply["response"])
    if not isinstance(extracted, dict):
        raise TypeError(f"Expected a JSON object from the model, got {type(extracted).__name__}")

    extracted["POLICY NAME"] = policy_of_interest
    return json.dumps(extracted)

## `Load & Preprocess`

#### Load the policy document text and embed its chunks as vectors

In [19]:
# Read the plain-text policy document in one go. Joining readlines() with "\n"
# doubled every line break, because each line already ends with its own "\n".
with open("data/preprocessed_data/policies_meridian_plaintext.txt", "r", encoding="utf-8", errors="replace") as f:
    policies_meridian_plaintext = f.read()

In [20]:
# Chunk the policy text, then embed each chunk; both lists share the same order.
paragraphs = split_text_into_paragraphs(policies_meridian_plaintext)
paragraph_embeddings = [get_embedding(paragraph) for paragraph in paragraphs]

#### Load the policies and embed them as vectors

In [22]:
# Load the policy table, then drop the rows labelled 0 and 1 and the three
# columns that this notebook fills in itself.
policies_table = pd.read_csv("./data/preprocessed_data/policies_to_update.csv")
df = (
    policies_table
    .drop(index=[0, 1])
    .drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])
)

# Keep only the slice selected by `row_range`.
midpoint = len(df) // 2
if row_range == "first_half":
    df = df.iloc[:midpoint]
elif row_range == "second_half":
    df = df.iloc[midpoint:]
else:
    # "full" and any unrecognised value both keep every row.
    df = df.iloc[:]

# Policy names, their synonyms, and one embedding per policy name (same order).
policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = [get_embedding(name) for name in policy_of_interests]



## `Generate Table`

#### Loop through each policy of interest, generate its prompt, and query the model

In [None]:
results = []         # one JSON string per policy, in the order of policy_of_interests
prompts = []         # one prompt per policy, same order
parsed_results = []  # the same records as dictionaries
n_count = 0          # how many policies came back as "N" (not found)

# Build every prompt first so they can be inspected before any model call.
for i, policy_of_interest in enumerate(policy_of_interests):
    policy_of_interest_embedding = policy_of_interest_embeddings[i]
    prompt = generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings)
    prompts.append(prompt)

for i, policy_of_interest in enumerate(policy_of_interests):
    # The request itself sits inside the try: it is the step that can fail
    # (connection problems, malformed model output). A failure for one policy
    # is recorded as an "ERROR" row instead of aborting the whole run.
    try:
        extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompts[i], policy_of_interest)
        detail_dict = json.loads(extracted_policy_detail)
    except (requests.RequestException, KeyError, TypeError, ValueError) as e:
        detail_dict = {
            "POLICY NAME": policy_of_interest,
            "Y/N/M": "ERROR",
            "POLICY DETAILS": f"Failed to parse: {str(e)}"
        }
        # Keep `results` aligned with the policies even when a request fails.
        extracted_policy_detail = json.dumps(detail_dict)
        status = "Parse Error"
    else:
        if detail_dict.get("Y/N/M", "").strip() == "N":
            n_count += 1
            status = "N"
        else:
            status = "Y/M"

    results.append(extracted_policy_detail)
    parsed_results.append(detail_dict)

    print(f"Processed {i+1}/{len(policy_of_interests)}: {policy_of_interest} → {status}")

print(f"\n Total 'N' results: {n_count} / {len(policy_of_interests)}")

Processed 1/248: BFS (CMHC Program) → Y/M
Processed 2/248: BFS ALT-A → Y/M
Processed 3/248: BFS Stated Income (Bank Statements) → Y/M
Processed 4/248: BFS Stated Income (Conventional) → Y/M
Processed 5/248: BFS Stated Income (Sagen & CG Program) → Y/M
Processed 6/248: Cash Back Mortgages → Y/M
Processed 7/248: Collateral Switch/Transfer → Y/M
Processed 8/248: Construction → Y/M
Processed 9/248: Cottage/Recreational Properties → Y/M
Processed 10/248: Equity Program → Y/M
Processed 11/248: Flex Down Payments (ie; Credit Cards) → Y/M
Processed 12/248: Foreign Borrowers (Non-Residents) → Y/M
Processed 13/248: Limited Feature Mortgages → Y/M
Processed 14/248: Medical Professionals Program → Y/M
Processed 15/248: Mortgage & HELOC Combinations → Y/M
Processed 16/248: Net Worth Program → Y/M
Processed 17/248: New to Canada → Y/M
Processed 18/248: New to Canada (Rental) → Y/M
Processed 19/248: No-fee Alt Lender → Y/M
Processed 20/248: Non-Permanent/Temporary Residents → Y/M
Processed 21/248: Op

In [80]:
# Write the raw model answers (one JSON string per policy) to a CSV file.
# NOTE(review): `results` holds JSON strings, so the file has a single column;
# `parsed_results` holds the same records as dictionaries and would give one
# column per field — confirm which layout downstream steps expect.
results_df = pd.DataFrame(results)

# The file name follows `row_range`, so the label on disk always matches the
# slice that was actually processed ("full", "first_half" or "second_half").
results_df.to_csv(f"./extracted_policy_details_updated({row_range}).csv", index=False)