### Importing Libraries

In [1]:
import requests
import re
import numpy as np
import pandas as pd

In [None]:
second_half = True

### Read and Preprocess the Local Data

#### Defining all necessary functions

In [2]:
def split_text_into_paragraphs(text):
    paragraphs = re.split(r'\n{2,}', text)
    return [p.strip() for p in paragraphs if p.strip()]
def get_embedding(text, model="nomic-embed-text"):
    url = "http://localhost:11434/api/embed"
    payload = {
        "model": model,
        "input": text
    }
    response = requests.post(url, json=payload)
    return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    vec1, vec2 = np.array(vec1), np.array(vec2)
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

#### Local policies and embadding into vectors

In [3]:
with open("../../data/preprocessed_data/policies_meridian_plaintext.txt", "r", encoding="utf-8", errors="replace") as f:
    policies_meridian_plaintext = "\n".join(f.readlines())

In [4]:
paragraphs = split_text_into_paragraphs(policies_meridian_plaintext)
paragraph_embeddings = []
for p in paragraphs:
    paragraph_embeddings.append(get_embedding(p))

In [12]:
for i, p in enumerate(paragraphs):
    print(f"Paragraph {i}: {p}")
    print(f"Embedding: {paragraph_embeddings[i]}\n")

Paragraph 0: Qualifying rate policy
Embedding: [0.059897747, 0.047818825, -0.16621745, -0.00979373, 0.045258608, -0.010991456, 0.014890423, 0.029830359, 0.043086495, -0.017570192, 0.008325352, -0.004369245, -0.010503042, -0.023361107, 0.004653472, -0.03615721, 0.027586576, -0.050505236, -0.016529562, 0.06321013, -0.010592704, -0.040891446, -0.028630473, -0.039426845, 0.108853534, -0.010721792, 0.0021710133, -0.058586404, -0.0054721297, 0.0030152295, -0.012481042, 0.025697978, 0.04492967, -0.031245545, -0.012599323, -0.01635831, 0.075756654, 0.005425954, -0.06413092, -0.09117378, 0.032067474, -0.044313576, 0.040641848, -0.013542132, 0.013190332, 0.01396001, 0.084150046, 0.0769693, 0.011220564, 0.021858195, 0.017757483, 0.049700562, -0.02043432, -0.021034112, 0.030360088, 0.034459826, -0.09307978, 0.06667721, -0.036118295, -0.009520237, 0.05771817, 0.05765334, -0.04969706, 0.051757067, 0.048444763, -0.022885107, 0.03847996, 0.062376637, -0.014391205, 0.0131915845, -0.05368668, -0.0181091

#### Load all policies and embedding them into vectors

In [None]:
df = pd.read_csv("../../data/preprocessed_data/policies_to_update.csv")
df = df.drop(index=[0,1])
df = df.drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])
if second_half:
    df = df.iloc[0:len(df)//2]
else:
    df = df.iloc[len(df)//2:len(df)]
 
policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = []

for p in policy_of_interests:
    policy_of_interest_embeddings.append(get_embedding(p))

### Sending the Request

#### Defining all necessary functions

In [6]:
def generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings):
    find_top_k = 6
    most_related_paragraphs = []

    for index in range(len(paragraph_embeddings)):
        score = cosine_similarity(policy_of_interest_embedding, paragraph_embeddings[index])
        most_related_paragraphs.append((paragraphs[index], score))
        
    most_related_paragraphs.sort(key=lambda x: x[1], reverse=True)
    most_related_paragraphs = most_related_paragraphs[:find_top_k]
    combined_paragraphs = "\n".join([p[0] for p in most_related_paragraphs])
    
    prompt = f"""
You are to extract form-related policies from the attached 'Policies_Meridian.docx'. Focus on identifying both **explicit mentions** and **indirect references** (e.g., policies embedded in documentation procedures or described without using exact policy names).

The 'Policies_Meridian.docx' file content is provided below:

{combined_paragraphs}

The policy of interest is "{policy_of_interest}" with Search Terms Synonyms "{search_terms_synonyms}" for targeting the location of that policy in the doc. 
Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.

Output the result in the following format:

{{
    "POLICY NAME": "Consent To Release Client Information",
    "Y/N/M": "Y",
    "POLICY DETAILS": "Meridian requires signed consent from all applicants prior to underwriting, but does not provide consent forms to brokers."
}}

For each item, say:
- "Y" if it is clearly mentioned,
- "M" if it is mentioned indirectly,
- "N" if not found.
""".strip()

    return prompt
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    data = {
        "model": "deepseek-r1:8b",
        "prompt": prompt,
        "stream": False,
        "format": {
            "type": "object",
            "properties": {
                "POLICY NAME": {
                    "type": "string",
                    "enum": [policy_of_interest]
                },
                "Y/N/M": {
                    "type": "string",
                    "enum": ["Y", "M", "N"]
                },
                "POLICY DETAILS": {
                    "type": "string"
                }
            },
            "required": ["POLICY NAME", "Y/N/M", "POLICY DETAILS"]
        }
    }

    response = requests.post(url, headers=headers, json=data)
    response = response.json()
    response_data = response['response']
    return response_data

#### Loop through each policy of interest and generate the prompt

In [9]:
results = []
prompts = []

for i, policy_of_interest in enumerate(policy_of_interests):
    policy_of_interest_embedding = policy_of_interest_embeddings[i]
    prompt = generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings)
    prompts.append(prompt)
    
for i, policy_of_interest in enumerate(policy_of_interests):
    extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompts[i], policy_of_interest) 
    results.append(extracted_policy_detail)
    print(f"Processed {i+1}/{len(policy_of_interests)}: {policy_of_interest}")
    
    
df_results = pd.DataFrame(results)

Processed 1/248: BFS (CMHC Program)
Processed 2/248: BFS ALT-A
Processed 3/248: BFS Stated Income (Bank Statements)
Processed 4/248: BFS Stated Income (Conventional)
Processed 5/248: BFS Stated Income (Sagen & CG Program)
Processed 6/248: Cash Back Mortgages
Processed 7/248: Collateral Switch/Transfer
Processed 8/248: Construction
Processed 9/248: Cottage/Recreational Properties
Processed 10/248: Equity Program
Processed 11/248: Flex Down Payments (ie; Credit Cards)
Processed 12/248: Foreign Borrowers (Non-Residents)
Processed 13/248: Limited Feature Mortgages
Processed 14/248: Medical Professionals Program
Processed 15/248: Mortgage & HELOC Combinations
Processed 16/248: Net Worth Program
Processed 17/248: New to Canada
Processed 18/248: New to Canada (Rental)
Processed 19/248: No-fee Alt Lender
Processed 20/248: Non-Permanent/Temporary Residents
Processed 21/248: Open Mortgages
Processed 22/248: Pre-Approval Programs
Processed 23/248: Pre-Approval Programs With No Premium
Processed 2

In [10]:
df_results.to_csv("../../data/preprocessed_data/extracted_policy_details.csv", index=False)