## `Imports & Setup`

#### External Libraries

In [43]:
import json
import os
import re
import sys
import time

import nltk
import numpy as np
import pandas as pd
import requests
import torch
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

#### >>> Config Variables <<<

In [None]:
# "full", "first_half", "second_half", "random_10"
row_range = "full"

# How many of the best-scoring paragraphs are pasted into each prompt.
find_top_k = 4

# Ranking score of a paragraph for a given policy:
#     policy_weight       * cosine_similarity(policy name, paragraph)
#   + (1 - policy_weight) * cosine_similarity(search-term synonyms, paragraph)
policy_weight = 0.6

# Device type as a plain string: "cuda" when CUDA is available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"

### Functions

In [None]:
def print_progress_bar(iteration, total, prefix='', length=40, start_time=None, line_width=256):
    """Redraw a single-line progress bar in place on stdout.

    Args:
        iteration: Number of items completed so far.
        total: Total number of items. A total of 0 renders an empty bar at
            0.0% instead of raising ZeroDivisionError.
        prefix: Free text shown after the ETA.
        length: Width of the bar itself, in characters.
        start_time: ``time.time()`` value taken when the work started, used to
            estimate the remaining time. When omitted the ETA shows 0m 0s.
        line_width: The line is right-padded with spaces to this width so that
            a shorter line fully overwrites a longer one drawn earlier.
    """
    elapsed = time.time() - start_time if start_time else 0
    avg_time = elapsed / iteration if iteration > 0 else 0
    eta = avg_time * (total - iteration)

    # Guard the two divisions by `total` so an empty workload does not crash.
    if total:
        fraction = iteration / float(total)
        filled_length = int(length * iteration // total)
    else:
        fraction = 0.0
        filled_length = 0

    percent = f"{100 * fraction:.1f}"
    bar_color = '\033[32m'  # ANSI green; '\033[0m' below resets the colour
    bar = bar_color + '█' * filled_length + '-' * (length - filled_length) + '\033[0m'

    eta_min = int(eta // 60)
    eta_sec = int(eta % 60)

    line = f"|{bar}| {percent}% Complete | ETA: {eta_min}m {eta_sec}s | {prefix}"
    padded_line = line.ljust(line_width)

    # '\r' returns to the start of the line so the next call overwrites this one.
    sys.stdout.write('\r' + padded_line)
    sys.stdout.flush()
def split_text_into_paragraphs(text, chunk_size=3, merge_headings=True):
    """Clean up raw policy text and split it into chunks of up to `chunk_size` sentences.

    Args:
        text: Raw document text.
        chunk_size: Number of consecutive sentences joined into one chunk
            (the last chunk of a section may hold fewer).
        merge_headings: If true, a section that tokenizes to at most one
            sentence is appended to the previous chunk instead of being
            emitted on its own.

    Returns:
        List of chunk strings in document order.
    """

    # Fix encoding and glued terms (e.g. BenchmarkRate → Benchmark Rate)
    # "�" is the replacement character; bullets are normalised to "*".
    text = text.replace("�", " ").replace("•", "*")
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)        # aB → a B
    text = re.sub(r'(?<=\d)(?=[A-Z])', ' ', text)           # 25Years → 25 Years
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)        # abc123 → abc 123
    # NOTE(review): the "aB → a B" substitution above already puts a space
    # between every lowercase/uppercase pair, so this pattern appears to have
    # nothing left to match — confirm whether it was meant to run first.
    text = re.sub(r'(?<=[a-z])(?=[A-Z][a-z])', '. ', text)  # add inferred periods

    # Normalize spacing
    text = re.sub(r'\n{2,}', '\n', text)      # collapse double line breaks
    # \s also matches "\n", so after this line the text contains no newlines.
    text = re.sub(r'\s+', ' ', text).strip()  # remove excess whitespace

    # Split at likely section headings
    # NOTE(review): the pattern needs "\n", but the whitespace collapse above
    # has already removed every newline, so this looks like it always yields a
    # single section — TODO confirm the intended order of these steps.
    sections = re.split(r'\n(?=[A-Z][^\n]{3,60}\n)', text)

    tokenizer = PunktSentenceTokenizer()
    chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # Convert bullet-style lines into full sentences
        # (the first pattern also requires a newline — see the note above;
        # the second one drops any remaining "*" markers)
        section = re.sub(r"\n\s*\*\s*", ". ", section)
        section = re.sub(r"\*\s*", "", section)

        # Break before heading-like phrases
        # i.e. insert "\n" before a capitalised run of 4-61 characters that
        # follows ". " and is itself followed by a space.
        section = re.sub(r'(?<=\. )([A-Z][^\n]{3,60})(?= )', r'\n\1', section)

        sentences = tokenizer.tokenize(section)

        # Attach short heading-only lines to previous chunk
        # (only possible once at least one chunk exists)
        if merge_headings and len(sentences) <= 1 and chunks:
            chunks[-1] += " " + section
            continue

        # Group into N-sentence chunks
        for i in range(0, len(sentences), chunk_size):
            chunk = " ".join(sentences[i:i + chunk_size])
            chunks.append(chunk.strip())

    return chunks
_E5_MODEL_NAME = "intfloat/e5-base"
_e5_cache = {}


def _load_e5():
    """Return the (tokenizer, encoder) pair for e5-base, loading it on first use only."""
    if "pair" not in _e5_cache:
        tokenizer = AutoTokenizer.from_pretrained(_E5_MODEL_NAME)
        encoder = AutoModel.from_pretrained(_E5_MODEL_NAME).to("cuda")
        _e5_cache["pair"] = (tokenizer, encoder)
    return _e5_cache["pair"]


def get_embedding(text, model="nomic-embed-text"):
    """Embed `text` and return the vector as a plain list of floats.

    On CUDA machines the text is encoded locally with ``intfloat/e5-base``;
    otherwise the request is sent to the local Ollama embed endpoint.

    Args:
        text: A string, or a list of strings. Only the embedding of the first
            item is returned in either case.
        model: Ollama model name. Used on the non-CUDA path only.

    Returns:
        The embedding of the first input as a list of floats.

    Raises:
        requests.HTTPError: If the Ollama endpoint answers with an error status.
    """
    if device == "cuda":
        # The tokenizer/encoder pair is cached: reloading it from disk for every
        # paragraph dominated the run time of the embedding loops.
        tokenizer, encoder = _load_e5()

        batch = [text] if isinstance(text, str) else text

        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
        with torch.no_grad():
            outputs = encoder(**inputs)

        # The hidden state of the first token is used as the embedding.
        # NOTE(review): the e5 model card pools by attention-masked averaging and
        # prefixes inputs with "query: " / "passage: " — confirm whether
        # first-token pooling without prefixes is intended here.
        embeddings = outputs.last_hidden_state[:, 0, :]

        return embeddings.cpu().detach().numpy().tolist()[0]

    url = "http://localhost:11434/api/embed"
    payload = {
        "model": model,
        "input": text
    }
    response = requests.post(url, json=payload)
    # Surface HTTP failures directly instead of a KeyError on the missing field.
    response.raise_for_status()
    return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length vectors.

    Args:
        vec1: Sequence of numbers (list, tuple or array).
        vec2: Sequence of numbers of the same length as `vec1`.

    Returns:
        The cosine of the angle between the vectors, or 0.0 when either vector
        has zero length (the ratio is undefined there and would otherwise be
        NaN, which breaks sorting by score).
    """
    vec1, vec2 = np.asarray(vec1, dtype=float), np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator
def generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings, search_terms_synonyms_embedding, paragraph_texts=None, search_terms=None):
    """Build the extraction prompt for one policy from its best-matching paragraphs.

    Each paragraph is scored as
    ``policy_weight * cos(policy, paragraph) + (1 - policy_weight) * cos(search terms, paragraph)``
    and the `find_top_k` highest-scoring paragraphs are pasted into the prompt.

    Args:
        policy_of_interest: Policy name shown in the prompt.
        policy_of_interest_embedding: Embedding of the policy name.
        paragraph_embeddings: One embedding per paragraph.
        search_terms_synonyms_embedding: Embedding of this policy's search terms.
        paragraph_texts: Paragraph strings aligned with `paragraph_embeddings`.
            Defaults to the notebook-level `paragraphs` list.
        search_terms: This policy's search-term synonyms, shown in the prompt.
            When omitted, the entry is looked up by policy name in the
            notebook-level `policy_of_interests` / `search_terms_synonyms` lists.

    Returns:
        The prompt string.

    Raises:
        ValueError: If the number of paragraph texts and embeddings differ.
    """
    if paragraph_texts is None:
        # Legacy call path: use the notebook-level paragraph list.
        paragraph_texts = paragraphs

    if search_terms is None:
        # Legacy call path. The notebook-level `search_terms_synonyms` holds the
        # synonyms of *every* policy, so only this policy's own entry is taken.
        # (For duplicated policy names the first entry wins.)
        try:
            search_terms = search_terms_synonyms[policy_of_interests.index(policy_of_interest)]
        except (ValueError, IndexError):
            search_terms = search_terms_synonyms

    if len(paragraph_texts) != len(paragraph_embeddings):
        # Typically stale notebook state: texts were re-split without re-embedding.
        raise ValueError(
            f"{len(paragraph_texts)} paragraphs but {len(paragraph_embeddings)} paragraph embeddings"
        )

    scored_paragraphs = []
    for paragraph_text, paragraph_embedding in zip(paragraph_texts, paragraph_embeddings):
        policy_score = cosine_similarity(policy_of_interest_embedding, paragraph_embedding)
        terms_score = cosine_similarity(search_terms_synonyms_embedding, paragraph_embedding)
        score = policy_weight * policy_score + (1 - policy_weight) * terms_score
        scored_paragraphs.append((paragraph_text, score))

    # Highest score first; the sort is stable, so ties keep document order.
    most_related_paragraphs = sorted(scored_paragraphs, key=lambda pair: pair[1], reverse=True)[:find_top_k]
    combined_paragraphs = "\n".join([p[0] for p in most_related_paragraphs])

    prompt = f"""
        You are an expert in policy analysis. Your task is to extract **form-related policies** from the document titled *Policies_Meridian.docx*. These policies may be:

        1. **Explicitly stated** - directly mentioned in the text.
        2. **Implicitly referenced** - embedded within procedures, documentation processes, or described indirectly without using the exact policy names.

        ---
        
        ### Document Content:
        Below is the full content of *Policies_Meridian.docx*:
        
        {combined_paragraphs}
        
        ---

        ### Target Policy:
        - Policy of Interest: **"{policy_of_interest}"**
        - Search Term Synonyms: **"{search_terms}"**
        
        Use these keywords and any related concepts to locate relevant policies. 
        Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.
        This document is used for important decision-making. Ensure no relevant information is overlooked, whether it's directly stated or subtly implied.
        
        ---
        
        ### Output Instructions:
        
        For **each policy instance** found, provide the following:

        1. **Status**:
        - `"Y"` - Clearly mentioned (explicitly and unambiguously stated).
        - `"M"` - Mentioned indirectly (implied, inferred, or part of a procedure).
        - `"N"` - Not found (no relevant mention in the document).

        2. **POLICY DETAILS**:
        - Include the exact wording, numbers, or phrases that support your judgment.
        - If no policy is found, leave this field empty, returning `""`.
        - If the Response is `"N"`, the field should be empty, returning `""`.

        ---
    """.strip()

    return prompt
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    """Send `prompt` to the local Ollama generate endpoint and return the answer as JSON text.

    The request asks for structured output with a "Status" field ("Y", "M" or
    "N") and a "POLICY DETAILS" string. The policy name is added to the parsed
    answer under "POLICY NAME" before it is serialised again.

    Args:
        prompt: Full prompt text.
        policy_of_interest: Policy name stored in the result.

    Returns:
        A JSON string with the keys "Status", "POLICY DETAILS" and "POLICY NAME".

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        RuntimeError: If the reply carries no "response" field.
        json.JSONDecodeError: If the model output is not valid JSON.
    """
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "model": "deepseek-r1:8b",
        "prompt": prompt,
        "stream": False,
        # JSON schema the model output has to follow.
        "format": {
            "type": "object",
            "properties": {
                "Status": {
                    "type": "string",
                    "enum": ["Y", "M", "N"]
                },
                "POLICY DETAILS": {
                    "type": "string"
                }
            },
            "required": ["Status", "POLICY DETAILS"]
        }
    }

    response = requests.post(url, headers=headers, json=data)
    # Fail with the HTTP error itself rather than a KeyError further down.
    response.raise_for_status()
    body = response.json()
    if "response" not in body:
        raise RuntimeError(f"Ollama reply has no 'response' field: {body.get('error', body)}")

    response_data = json.loads(body["response"])
    response_data["POLICY NAME"] = policy_of_interest
    return json.dumps(response_data)

## `Load & Preprocess`

#### Load the DOCX text and embed it into vectors

In [90]:
# Read the file in one go. Lines returned by readlines() keep their trailing
# newline, so joining them with "\n" would double every line break.
# errors="replace" turns undecodable bytes into the "�" replacement character.
with open("data/preprocessed_data/policies_meridian_plaintext.txt", "r", encoding="utf-8", errors="replace") as f:
    policies_meridian_plaintext = f.read()

In [93]:
# Split the document into sentence chunks and embed each chunk, in order.
paragraphs = split_text_into_paragraphs(policies_meridian_plaintext)
paragraph_embeddings = []
paragraph_total = len(paragraphs)

start_time = time.time()
for done_count, paragraph_text in enumerate(paragraphs, start=1):
    paragraph_embeddings.append(get_embedding(paragraph_text))
    print_progress_bar(done_count, paragraph_total, prefix="Embeddings", start_time=start_time, line_width=128)

|[34m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

#### Load policies & search-term synonyms and embed them into vectors

In [95]:
# Load the policy sheet, dropping the first two rows and the columns that
# this notebook regenerates.
df = pd.read_csv("./data/preprocessed_data/policies_to_update.csv")
df = df.drop(index=[0, 1])
df = df.drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])

# Select the rows to process. "full" and any unrecognised value keep all rows.
half = len(df) // 2
if row_range == "first_half":
    df = df.iloc[:half]
elif row_range == "second_half":
    df = df.iloc[half:]
elif row_range == "random_10":
    # sample() raises when asked for more rows than exist, so cap at the sheet size.
    df = df.sample(n=min(10, len(df)), random_state=42)

policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = []
search_terms_synonyms_embeddings = []

# Embed every policy name, in sheet order.
start_time = time.time()
for p in policy_of_interests:
    policy_of_interest_embeddings.append(get_embedding(p))
    print_progress_bar(len(policy_of_interest_embeddings), len(policy_of_interests), prefix="Embeddings", start_time=start_time, line_width=128)



|[34m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

In [96]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every embedding.
search_terms_synonyms_embeddings = []

start_time = time.time()
for s in search_terms_synonyms:
    # str() because a cell may not hold a string — presumably missing values
    # in the sheet; verify against the CSV.
    search_terms_synonyms_embeddings.append(get_embedding(str(s)))
    print_progress_bar(len(search_terms_synonyms_embeddings), len(search_terms_synonyms), prefix="Embeddings", start_time=start_time, line_width=128)

|[34m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

## `Generate Table`

#### Loop through each policy of interest and generate the prompt

In [97]:
results = []          # raw JSON string per policy (None when extraction failed)
prompts = []          # one prompt per policy
parsed_results = []   # one record (dict) per policy, used to build the table
n_count = 0           # number of policies answered with status "N"
start_time = time.time()

# Pass 1: build the prompt for every policy.
for i, policy_of_interest in enumerate(policy_of_interests):
    policy_of_interest_embedding = policy_of_interest_embeddings[i]
    search_terms_synonyms_embedding = search_terms_synonyms_embeddings[i]
    prompt = generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings, search_terms_synonyms_embedding)
    prompts.append(prompt)

# Pass 2: query the model and collect one record per policy.
for i, policy_of_interest in enumerate(policy_of_interests):
    try:
        # The request sits inside the try because that is where the model
        # output is actually parsed; one bad answer should not abort the run.
        extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompts[i], policy_of_interest)
        detail_dict = json.loads(extracted_policy_detail)
    except Exception as e:
        results.append(None)
        # Same "Status" key as successful records, so the table ends up with a
        # single status column.
        parsed_results.append({
            "POLICY NAME": policy_of_interest,
            "Status": "ERROR",
            "POLICY DETAILS": f"Failed to parse: {str(e)}"
        })
        status = "Parse Error"
    else:
        results.append(extracted_policy_detail)
        parsed_results.append(detail_dict)

        if detail_dict.get("Status", "").strip() == "N":
            n_count += 1
            status = "N"
        else:
            status = "Y/M"

    print_progress_bar(i + 1, len(policy_of_interests),
                       prefix=f"{n_count } N's | {policy_of_interest} → {status}",
                       start_time=start_time)

|[34m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | 0 N's | NICHES → Y/M                                                                                                                                                         

#### Reformat the response into a table

In [98]:
# Turn the per-policy records into a table: the model's "Status" field becomes
# the "Y/N/M" column, and the three columns are put in sheet order.
column_order = ["Y/N/M", "POLICY NAME", "POLICY DETAILS"]
results_df = pd.DataFrame(parsed_results).rename(columns={"Status": "Y/N/M"})[column_order]

#### Writing to CSV

In [99]:
# to_csv does not create missing folders, so make sure the output folder exists.
output_dir = "./extracted_policy_details"
os.makedirs(output_dir, exist_ok=True)

# File name records the run settings: row range, top-k and the number of "N" answers.
results_df.to_csv(f"{output_dir}/{row_range}_top_{find_top_k}_{n_count}Ns.csv", index=False)

In [100]:
# Preview the first ten rows of the result table.
results_df.head(n=10)

Unnamed: 0,Y/N/M,POLICY NAME,POLICY DETAILS
0,Y,BFS (CMHC Program),The document explicitly states that Tarion war...
1,Y,BFS ALT-A,The document explicitly mentions 'Tarion warra...
2,Y,BFS Stated Income (Bank Statements),The lender will conduct a credit check to asse...
3,Y,BFS Stated Income (Conventional),Tarion warranty required
4,Y,BFS Stated Income (Sagen & CG Program),The maximum number of properties a client can ...
5,Y,Cash Back Mortgages,Maximum number of properties a client can have...
6,Y,Collateral Switch/Transfer,The document explicitly states 'consent to rel...
7,Y,Construction,The use of DocuSign for electronic signatures ...
8,Y,Cottage/Recreational Properties,Tarion warranty required for new homes. Tarion...
9,Y,Equity Program,The document explicitly mentions Tarion warran...
