## `Imports & Setup`

#### External Libraries

In [1]:
import requests
import os
import re
import sys
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import json
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### >>> Config Variables <<<

In [2]:
# "full", "first_half", "second_half", "random_10"
row_range = "full"

# { find_top_k } number of paragraphs that have the highest similarity score with the policy and search terms will be used to generate the prompt.
find_top_k = 4

# score of similarity = { policy_weight } * cosine_similarity between policy and paragraph + { 1 - policy_weight } * cosine_similarity between policy and search terms
policy_weight = 0.95

# { extracting_model } will be used to extract the policy details by using the policy name and the search terms.
extracting_model = "deepseek-r1:8b"

# { embedding_model } will be used to generate the embeddings for the policy and the paragraphs.
embedding_model = {"gpu": "intfloat/e5-base", "cpu": "nomic-embed-text"}

# { prompt_version } is the version of the prompt template to be used.
prompt_version = "v3"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = device.type

### Functions

In [3]:
def print_progress_bar(iteration, total, prefix='', length=40, start_time=None, line_width=256):
    elapsed = time.time() - start_time if start_time else 0
    avg_time = elapsed / iteration if iteration > 0 else 0
    eta = avg_time * (total - iteration)

    percent = f"{100 * (iteration / float(total)):.1f}"
    filled_length = int(length * iteration // total)
    bar_color = '\033[31m'
    bar = bar_color + '█' * filled_length + '-' * (length - filled_length) + '\033[0m'

    eta_min = int(eta // 60)
    eta_sec = int(eta % 60)

    line = f"|{bar}| {percent}% Complete | ETA: {eta_min}m {eta_sec}s | {prefix}"
    padded_line = line.ljust(line_width)

    sys.stdout.write('\r' + padded_line)
    sys.stdout.flush()
def split_text_into_paragraphs(text, chunk_size=3, merge_headings=True):
    """Split policy text into N-sentence chunks."""

    # Fix encoding and glued terms (e.g. BenchmarkRate → Benchmark Rate)
    text = text.replace("�", " ").replace("•", "*")
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)        # aB → a B
    text = re.sub(r'(?<=\d)(?=[A-Z])', ' ', text)           # 25Years → 25 Years
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)        # abc123 → abc 123
    text = re.sub(r'(?<=[a-z])(?=[A-Z][a-z])', '. ', text)  # add inferred periods

    # Normalize spacing
    text = re.sub(r'\n{2,}', '\n', text)      # collapse double line breaks
    text = re.sub(r'\s+', ' ', text).strip()  # remove excess whitespace

    # Split at likely section headings
    sections = re.split(r'\n(?=[A-Z][^\n]{3,60}\n)', text)

    tokenizer = PunktSentenceTokenizer()
    chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # Convert bullet-style lines into full sentences
        section = re.sub(r"\n\s*\*\s*", ". ", section)
        section = re.sub(r"\*\s*", "", section)

        # Break before heading-like phrases
        section = re.sub(r'(?<=\. )([A-Z][^\n]{3,60})(?= )', r'\n\1', section)

        sentences = tokenizer.tokenize(section)

        # Attach short heading-only lines to previous chunk
        if merge_headings and len(sentences) <= 1 and chunks:
            chunks[-1] += " " + section
            continue

        # Group into N-sentence chunks
        for i in range(0, len(sentences), chunk_size):
            chunk = " ".join(sentences[i:i + chunk_size])
            chunks.append(chunk.strip())

    return chunks
def get_embedding(text):
    if device == "cuda":
        tokenizer = AutoTokenizer.from_pretrained(embedding_model["gpu"])
        model = AutoModel.from_pretrained(embedding_model["gpu"]).to("cuda")
        
        if isinstance(text, str):
            text = [text]
        
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :]
        
        return embeddings.cpu().detach().numpy().tolist()[0]
    else:
        model = embedding_model["cpu"]
        
        url = "http://localhost:11434/api/embed"
        payload = {
            "model": model,
            "input": text
        }
        response = requests.post(url, json=payload)
        return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
def generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings, search_terms_synonyms_embedding):
    most_related_paragraphs = []

    for index in range(len(paragraph_embeddings)):
        score = policy_weight * cosine_similarity(policy_of_interest_embedding, paragraph_embeddings[index]) + (1 - policy_weight) * cosine_similarity(search_terms_synonyms_embedding, paragraph_embeddings[index])
        most_related_paragraphs.append((paragraphs[index], score))
        
    most_related_paragraphs.sort(key=lambda x: x[1], reverse=True)
    most_related_paragraphs = most_related_paragraphs[:find_top_k]
    combined_paragraphs = "\n".join([p[0] for p in most_related_paragraphs])
    
    prompt_v3 = f"""
        You are an expert in policy analysis. Your task is to extract **form-related policies** from the document titled *Policies_Meridian.docx*.
        
        1. **Explicitly stated** - directly and clearly mentioned in the text.
        2. **Implicitly referenced** - indirectly indicated, embedded within procedures, or inferred from documentation context.

        ---
        
        ### Document Content:
        Below is a selection of paragraphs extracted from *Policies_Meridian.docx*:
        
        {combined_paragraphs}
        
        ---

        ### Target Policy:
        - Policy of Interest: **"{policy_of_interest}"**
        - Search Term Synonyms: **"{search_terms_synonyms}"**
        
        Use these keywords and any related concepts to locate relevant policies. 
        Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.
        Do **not** guess or make assumptions. Only mark a policy as found if there is **clear textual evidence**.
        
        ---
        
        ### Output Instructions:
        
        For **each policy instance** found, provide the following:

        1. **Y/N/M**:
        - `"Y"` - Clearly mentioned (explicitly and unambiguously stated).
        - `"M"` - Mentioned indirectly (implied, inferred, or part of a procedure).
        - `"N"` - The policy does not appear in the document in any clear or inferable form.
        ⛔ Do not guess. If unsure, default to `"N"`.

        2. **POLICY DETAILS**:
        - Provide the specific content from the document that explains what the policy is about, including any wording, numbers, or requirements mentioned.

        ---
    """.strip()
    prompt_v2 = f"""
        You are an expert in policy analysis. Your task is to extract **form-related policies** from the document titled *Policies_Meridian.docx*.
    
        ---
        
        ### Document Content:
        Below is a selection of paragraphs extracted from *Policies_Meridian.docx*:
        
        {combined_paragraphs}
        
        ---

        ### Target Policy:
        - Policy of Interest: **"{policy_of_interest}"**
        - Search Term Synonyms: **"{search_terms_synonyms}"**
        
        Use these keywords and any related concepts to locate relevant policies. 
        Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.
        Do **not** guess or make assumptions. Only mark a policy as found if there is **clear textual evidence**.
        
        ---
        
        ### Output Instructions:
        
        For **each policy instance** found, provide the following:

        1. **Y/N/M**:
        - `"Y"` - Clearly mentioned (explicitly and unambiguously stated).
        - `"M"` - Mentioned indirectly (implied, inferred, or part of a procedure).
        - `"N"` - The policy does not appear in the document in any clear or inferable form.

        2. **POLICY DETAILS**:
        - Provide the specific content from the document that explains what the policy is about, including any wording, numbers, or requirements mentioned.

        ---
    """.strip()
    prompt_v1 = f"""
        You are an expert in policy analysis. Your task is to extract **form-related policies** from the document titled *Policies_Meridian.docx*. These policies may be:

        1. **Explicitly stated** - directly mentioned in the text.
        2. **Implicitly referenced** - embedded within procedures, documentation processes, or described indirectly without using the exact policy names.

        ---
        
        ### Document Content:
        Below is the full content of *Policies_Meridian.docx*:
        
        {combined_paragraphs}
        
        ---

        ### Target Policy:
        - Policy of Interest: **"{policy_of_interest}"**
        - Search Term Synonyms: **"{search_terms_synonyms}"**
        
        Use these keywords and any related concepts to locate relevant policies. 
        Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.
        This document is used for important decision-making. Ensure no relevant information is overlooked, whether it's directly stated or subtly implied.
        
        ---
        
        ### Output Instructions:
        
        For **each policy instance** found, provide the following:

        1. **Status**:
        - "Y" - Clearly mentioned (explicitly and unambiguously stated).
        - "M" - Mentioned indirectly (implied, inferred, or part of a procedure).
        - "N" - Not found (no relevant mention in the document).

        2. **POLICY DETAILS**:
        - Provide the specific content from the document that explains what the policy is about, including any wording, numbers, or requirements mentioned.
        - If no policy is found, leave this field empty, returning "".
        - If the Response is "N", the field should be empty, returning "".

        ---
    """.strip()
    
    if prompt_version == "v1":
        prompt = prompt_v1
    elif prompt_version == "v2":
        prompt = prompt_v2
    elif prompt_version == "v3":
        prompt = prompt_v3
    else:
        raise ValueError("Invalid prompt version. Use 'v1' or 'v2'.")

    return prompt
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "model": extracting_model,
        "prompt": prompt,
        "stream": False,
        "format": {
            "type": "object",
            "properties": {
                "Y/N/M": {
                    "type": "string",
                    "enum": ["Y", "M", "N"]
                },
                "POLICY DETAILS": {
                    "type": "string"
                }
            },
            "required": ["Y/N/M", "POLICY DETAILS"]
        }
    }

    response = requests.post(url, headers=headers, json=data)
    response = response.json()
    response_data = response['response']
    response_data = json.loads(response_data)
    response_data["POLICY NAME"] = policy_of_interest
    response_data = json.dumps(response_data)
    return response_data

## `Load & Preprocess`

#### Load DOCX and embedding them into vectors

In [4]:
with open("data/preprocessed_data/policies_meridian_plaintext.txt", "r", encoding="utf-8", errors="replace") as f:
    policies_meridian_plaintext = "\n".join(f.readlines())

In [5]:
paragraphs = split_text_into_paragraphs(policies_meridian_plaintext)
paragraph_embeddings = []

start_time = time.time()
for p in paragraphs:
   paragraph_embeddings.append(get_embedding(p))
   print_progress_bar(len(paragraph_embeddings), len(paragraphs), prefix="Embeddings", start_time=start_time, line_width=128)



|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

#### Load Policies & Search Terms Synonyms and embedding them into vectors

In [None]:
df = pd.read_csv("./data/preprocessed_data/policies_to_update.csv")
df = df.drop(index=[0,1])
df = df.drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])
if row_range == "full":
    df = df.iloc[0:len(df)]
elif row_range == "first_half":
    df = df.iloc[0:len(df)//2]
elif row_range == "second_half":
    df = df.iloc[len(df)//2:len(df)]
elif row_range == "random_10":
    df = df.sample(n=10, random_state=42)
else:
    df = df.iloc[0:len(df)]
 
policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = []
search_terms_synonyms_embeddings = []

start_time = time.time()
for p in policy_of_interests:
    policy_of_interest_embeddings.append(get_embedding(p))
    print_progress_bar(len(policy_of_interest_embeddings), len(policy_of_interests), prefix="Embeddings", start_time=start_time, line_width=128)



|[31m███-------------------------------------[0m| 8.1% Complete | ETA: 2m 2s | Embeddings                                     

In [None]:
start_time = time.time()
for s in search_terms_synonyms:
    search_terms_synonyms_embeddings.append(get_embedding(str(s)))
    print_progress_bar(len(search_terms_synonyms_embeddings), len(search_terms_synonyms), prefix="Embeddings", start_time=start_time, line_width=128)

|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

## `Generate Table`

#### Loop through each policy of interest and generate the prompt

In [None]:
results = []
prompts = []
parsed_results = []
n_count = 0
start_time = time.time()

for i, policy_of_interest in enumerate(policy_of_interests):
    policy_of_interest_embedding = policy_of_interest_embeddings[i]
    search_terms_synonyms_embedding = search_terms_synonyms_embeddings[i]
    prompt = generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings, search_terms_synonyms_embedding)
    prompts.append(prompt)

for i, policy_of_interest in enumerate(policy_of_interests):
    extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompts[i], policy_of_interest)
    results.append(extracted_policy_detail)

    try:
        detail_dict = json.loads(extracted_policy_detail)
        parsed_results.append(detail_dict)
        
        if detail_dict.get("Y/N/M", "").strip() == "N":
            n_count += 1
            status = "N"
        elif detail_dict.get("Y/N/M", "").strip() == "Y":
            status = "Y"
        elif detail_dict.get("Y/N/M", "").strip() == "M":
            status = "M"
        else:
            status = "ERROR"

    except Exception as e:
        parsed_results.append({
            "POLICY NAME": policy_of_interest,
            "Y/N/M": "ERROR",
            "POLICY DETAILS": f"Failed to parse: {str(e)}"
        })
        status = "Parse Error"

    print_progress_bar(i + 1, len(policy_of_interests), 
                       prefix=f"{n_count } N's | {policy_of_interest} → {status}",
                       start_time=start_time)

|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | 79 N's | NICHES → M                                                                                                                                                          

#### Reformat the response into a table

In [None]:
results_df = pd.DataFrame(parsed_results)
results_df = results_df[["Y/N/M", "POLICY NAME", "POLICY DETAILS"]]

#### Writing to CSV

In [None]:
current_date_time = time.strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(current_date_time, exist_ok=True)
results_df.to_csv(f"./{current_date_time}/extracted_policy_details.csv", index=False)
with open(f"./{current_date_time}/config_variables.csv", "w", encoding="utf-8") as f:
    f.write(f"find_top_k: {find_top_k}\n")
    f.write(f"policy_weight: {policy_weight}\n")
    f.write(f"extracting_model: {extracting_model}\n")
    f.write(f"embedding_model: {embedding_model}\n")
    f.write(f"prompt_version: {prompt_version}\n")

In [None]:
results_df.head(10)

Unnamed: 0,Y/N/M,POLICY NAME,POLICY DETAILS
0,Y,BFS (CMHC Program),The maximum number of properties a client can ...
1,Y,BFS ALT-A,The policy states that 'lending areas' are res...
2,Y,BFS Stated Income (Bank Statements),"The property must be a laneway house, coach ho..."
3,Y,BFS Stated Income (Conventional),The maximum number of properties a borrower ca...
4,Y,BFS Stated Income (Sagen & CG Program),The property must be on land that is no larger...
5,Y,Cash Back Mortgages,The policy includes a 'Tarion warranty require...
6,Y,Collateral Switch/Transfer,This policy outlines the requirements for usin...
7,Y,Construction,The lender requires a minimum population requi...
8,Y,Cottage/Recreational Properties,The document explicitly states that 'a maximum...
9,Y,Equity Program,The policy explicitly states that properties w...
