### Importing Libraries and Defining Constants

In [1]:
import requests
import re
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
# from sentence_transformers import SentenceTransformer
import json
import torch

In [2]:
# "full", "first_half", "second_half"
row_range = "full"

# picking only the top k related paragraphs
find_top_k = 4

In [2]:
import torch
print(torch.version.cuda)         # PyTorch 编译时用的 CUDA 版本
print(torch.cuda.is_available())  # 检查是否能用 GPU
print(torch.cuda.get_device_name())  # 显示当前 GPU 名称


12.1
True
NVIDIA GeForce RTX 3080


### Read and Preprocess the Local Data

#### Defining all necessary functions

In [3]:
def split_text_into_paragraphs(text, chunk_size=3, merge_headings=True):
    """Split policy text into N-sentence chunks."""

    # Fix encoding and glued terms (e.g. BenchmarkRate → Benchmark Rate)
    text = text.replace("�", " ").replace("•", "*")
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)        # aB → a B
    text = re.sub(r'(?<=\d)(?=[A-Z])', ' ', text)           # 25Years → 25 Years
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)        # abc123 → abc 123
    text = re.sub(r'(?<=[a-z])(?=[A-Z][a-z])', '. ', text)  # add inferred periods

    # Normalize spacing
    text = re.sub(r'\n{2,}', '\n', text)      # collapse double line breaks
    text = re.sub(r'\s+', ' ', text).strip()  # remove excess whitespace

    # Split at likely section headings
    sections = re.split(r'\n(?=[A-Z][^\n]{3,60}\n)', text)

    tokenizer = PunktSentenceTokenizer()
    chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # Convert bullet-style lines into full sentences
        section = re.sub(r"\n\s*\*\s*", ". ", section)
        section = re.sub(r"\*\s*", "", section)

        # Break before heading-like phrases
        section = re.sub(r'(?<=\. )([A-Z][^\n]{3,60})(?= )', r'\n\1', section)

        sentences = tokenizer.tokenize(section)

        # Attach short heading-only lines to previous chunk
        if merge_headings and len(sentences) <= 1 and chunks:
            chunks[-1] += " " + section
            continue

        # Group into N-sentence chunks
        for i in range(0, len(sentences), chunk_size):
            chunk = " ".join(sentences[i:i + chunk_size])
            chunks.append(chunk.strip())

    return chunks
def get_embedding(text, model="nomic-embed-text"):
    url = "http://localhost:11434/api/embed"
    payload = {
        "model": model,
        "input": text
    }
    response = requests.post(url, json=payload)
    return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

#### Local policies and embadding into vectors

In [8]:
with open("../../data/preprocessed_data/policies_meridian_plaintext.txt", "r", encoding="utf-8", errors="replace") as f:
    policies_meridian_plaintext = "\n".join(f.readlines())

In [15]:
paragraphs = split_text_into_paragraphs(policies_meridian_plaintext)
paragraph_embeddings = []
for p in paragraphs:
   paragraph_embeddings.append(get_embedding(p))

KeyboardInterrupt: 

In [11]:
for i, p in enumerate(paragraphs):
    print(f"Paragraph {i}: {p}")
    print(f"Embedding: {paragraph_embeddings[i]}\n")

Paragraph 0: Qualifying rate policy High Ratio Fixed rate mortgages For all terms, applicants must qualify using the greater of 5.25% Benchmark or Contract Rate + 2% Conventional Mortgages: Purchases or Refinances Uninsurable purchases and refinances offer a 30 year amortization, with the exception of 2nd position mortgages behind an existing Meridian 1st mortgage which have a maximum amortization of 25 years. Insurable Mortgages must qualify at the greater of 5.25% Benchmark or Contract Rate plus 2%, over 25 years Flex Line All Flex Line arrangements must be qualified at the greater of the 5.25% Benchmark rate or the mortgage contract rate plus 2%. Example: Mortgage amount = $200 K; HELOC initial limit = $100 K --- Qualifying rate for each application is the mortgage rate plus 2% OR the 5.25% Benchmark rate (whichever is greater) G.D.S and T.D.S ratios Ratios are not to exceed 39% and 44% respectively (regardless of high ratio insurer guidelines) Monthly payment for credit cards & lin

#### Load all policies and embedding them into vectors

In [None]:
df = pd.read_csv("../../data/preprocessed_data/policies_to_update.csv")
df = df.drop(index=[0,1])
df = df.drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])
if row_range == "full":
    df = df.iloc[0:len(df)]
elif row_range == "first_half":
    df = df.iloc[0:len(df)//2]
elif row_range == "second_half":
    df = df.iloc[len(df)//2:len(df)]
else:
    df = df.iloc[0:len(df)]
 
policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = []

for p in policy_of_interests:
    policy_of_interest_embeddings.append(get_embedding(p, model="deepseek-r1:8b"))

KeyboardInterrupt: 

### Sending the Request

#### Defining all necessary functions

In [None]:
def generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings):
    most_related_paragraphs = []

    for index in range(len(paragraph_embeddings)):
        score = cosine_similarity(policy_of_interest_embedding, paragraph_embeddings[index])
        most_related_paragraphs.append((paragraphs[index], score))
        
    most_related_paragraphs.sort(key=lambda x: x[1], reverse=True)
    most_related_paragraphs = most_related_paragraphs[:find_top_k]
    combined_paragraphs = "\n".join([p[0] for p in most_related_paragraphs])
    
    prompt = f"""
You are to extract form-related policies from the attached 'Policies_Meridian.docx'. Focus on identifying both **explicit mentions** and **indirect references** (e.g., policies embedded in documentation procedures or described without using exact policy names).

The 'Policies_Meridian.docx' file content is provided below:

{combined_paragraphs}

The policy of interest is "{policy_of_interest}" with Search Terms Synonyms "{search_terms_synonyms}" for targeting the location of that policy in the doc. 
Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures. 
This is a very important policy detail document, and many people's decisions will depend on the accuracy of your response.

Output the result in the following format:

{{
    "Y/N/M": "Y",
    "POLICY DETAILS": "Meridian requires signed consent from all applicants prior to underwriting, but does not provide consent forms to brokers."
}}

For each item, say:
- "Y" if it is clearly mentioned, which means the policy is explicitly stated in the text without any ambiguity.
- "M" if it is mentioned indirectly, which means the policy is implied or embedded in procedures, but not explicitly stated.
- "N" if not found, which means the policy is not mentioned at all.

- "POLICY DETAILS" should contain the specific details of the policy as mentioned in the text. If the policy is not mentioned, leave it blank.
""".strip()

    return prompt
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "model": "deepseek-r1:8b",
        "prompt": prompt,
        "stream": False,
        "format": {
            "type": "object",
            "properties": {
                "Y/N/M": {
                    "type": "string",
                    "enum": ["Y", "M", "N"]
                },
                "POLICY DETAILS": {
                    "type": "string"
                }
            },
            "required": ["Y/N/M", "POLICY DETAILS"]
        }
    }

    response = requests.post(url, headers=headers, json=data)
    response = response.json()
    response_data = response['response']
    response_data = json.loads(response_data)
    response_data["POLICY NAME"] = policy_of_interest
    response_data = json.dumps(response_data)
    return response_data

#### Loop through each policy of interest and generate the prompt

In [79]:
results = []
prompts = []
parsed_results = []
n_count = 0

for i, policy_of_interest in enumerate(policy_of_interests):
    policy_of_interest_embedding = policy_of_interest_embeddings[i]
    prompt = generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings)
    prompts.append(prompt)

for i, policy_of_interest in enumerate(policy_of_interests):
    extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompts[i], policy_of_interest)
    results.append(extracted_policy_detail)

    try:
        detail_dict = json.loads(extracted_policy_detail)
        parsed_results.append(detail_dict)

        if detail_dict.get("Y/N/M", "").strip() == "N":
            n_count += 1
            status = "N"
        else:
            status = "Y/M"

    except Exception as e:
        parsed_results.append({
            "POLICY NAME": policy_of_interest,
            "Y/N/M": "ERROR",
            "POLICY DETAILS": f"Failed to parse: {str(e)}"
        })
        status = "Parse Error"

    print(f"Processed {i+1}/{len(policy_of_interests)}: {policy_of_interest} → {status}")

print(f"\n Total 'N' results: {n_count} / {len(policy_of_interests)}")

Processed 1/124: Investment Income / RRIF → N
Processed 2/124: Long Term Disability → Y/M
Processed 3/124: Maximum Number of Applicants → Y/M
Processed 4/124: Non-Taxable/Tips → Y/M
Processed 5/124: OAS - CPP - RPP (Pension Income) → Y/M
Processed 6/124: On-Indian Reserve (Income) → Y/M
Processed 7/124: Ontario Disability Support Program Income → Y/M
Processed 8/124: Parental/Maternity Leave → Y/M
Processed 9/124: Reduce Rental Expenses and Add to Gross Income → Y/M
Processed 10/124: Rental (Income): Non-Subject Property → Y/M
Processed 11/124: Rental (Income): Owner-Occupied + Separate Unit → Y/M
Processed 12/124: Rental (Income): Subject Property → Y/M
Processed 13/124: Rental (Income): Worksheet → Y/M
Processed 14/124: Rental Income at Market Rents → Y/M
Processed 15/124: Rental Pools → N
Processed 16/124: Rental Surplus/Shortfall Calculation → N
Processed 17/124: Short Term Disability → Y/M
Processed 18/124: Taxable Other → Y/M
Processed 19/124: Trust (Income) → Y/M
Processed 20/12

In [80]:
results_df = pd.DataFrame(results)
# df_results.to_csv("../../data/preprocessed_data/extracted_policy_details.csv", index=False)
# df_results.to_csv("../../data/preprocessed_data/extracted_policy_details_updated.csv", index=False)
results_df.to_csv("./extracted_policy_details_updated(first_half).csv", index=False)
# df_results.to_csv("../../data/preprocessed_data/extracted_policy_details_updated(second_half).csv", index=False)
# df_results.to_csv("../../data/preprocessed_data/extracted_policy_details_updated(full).csv", index=False)