### Importing Libraries

In [1]:
import json
import re

import numpy as np
import pandas as pd
import requests

### Read and Preprocess the Local Data

#### Defining all necessary functions

In [2]:
def split_text_into_paragraphs(text):
    """Split ``text`` into paragraphs separated by one or more blank lines.

    A paragraph boundary is any run of two or more consecutive newline
    characters. Each piece is stripped of surrounding whitespace, and pieces
    that are empty after stripping are discarded.

    Args:
        text: The full document as a single string.

    Returns:
        A list of non-empty, stripped paragraph strings in document order.
    """
    stripped_chunks = (chunk.strip() for chunk in re.split(r'\n{2,}', text))
    return [chunk for chunk in stripped_chunks if chunk]
def get_embedding(text, model="nomic-embed-text", timeout=None):
    """Request an embedding vector for ``text`` from the local Ollama server.

    Sends a POST to ``http://localhost:11434/api/embed`` and returns the first
    entry of the ``"embeddings"`` list in the JSON reply.

    Args:
        text: The input to embed; sent as the ``"input"`` field of the payload.
        model: Name of the Ollama embedding model to use.
        timeout: Optional timeout in seconds forwarded to ``requests.post``.
            ``None`` (the default) waits indefinitely.

    Returns:
        The first embedding in the server's reply.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (for example, when the model is not available).
    """
    url = "http://localhost:11434/api/embed"
    payload = {
        "model": model,
        "input": text
    }
    response = requests.post(url, json=payload, timeout=timeout)
    # Fail with the real HTTP error instead of a confusing KeyError on
    # "embeddings" when the server reports a failure.
    response.raise_for_status()
    return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1: First vector (any sequence convertible to a NumPy array).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. When either vector has
        zero norm the similarity is undefined; ``0.0`` is returned instead of
        NaN so that callers sorting by score get a well-defined ordering.
    """
    vec1, vec2 = np.asarray(vec1, dtype=float), np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # Dividing here would yield NaN (plus a RuntimeWarning), and NaN
        # compares unpredictably when the scores are sorted.
        return 0.0
    return np.dot(vec1, vec2) / norm_product

#### Load the local policies and embed them into vectors

In [3]:
# Read the plain-text policy document; undecodable bytes are replaced rather
# than raising.
# NOTE(review): each line yielded by the file keeps its trailing newline, so
# joining with "\n" inserts an extra newline between consecutive lines.
# Confirm that this doubling is intended.
policies_plaintext_path = "../../data/preprocessed_data/policies_meridian_plaintext.txt"
with open(policies_plaintext_path, mode="r", encoding="utf-8", errors="replace") as policy_file:
    policy_lines = list(policy_file)
policies_meridian_plaintext = "\n".join(policy_lines)

In [4]:
# Break the document into paragraphs, then embed each one. The two lists stay
# index-aligned: paragraph_embeddings[i] is the embedding of paragraphs[i].
paragraphs = split_text_into_paragraphs(policies_meridian_plaintext)
paragraph_embeddings = [get_embedding(paragraph) for paragraph in paragraphs]

#### Load all policies and embed them into vectors

In [5]:
# Load the policy table, drop the rows labelled 0 and 1 and the columns that
# are not used below, then keep only the last 10 rows.
df = (
    pd.read_csv("../../data/preprocessed_data/policies_to_update.csv")
    .drop(index=[0, 1])
    .drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])
    .tail(10)
)

# Parallel lists: entry i of each list refers to the same row of `df`.
policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = [
    get_embedding(policy_name) for policy_name in policy_of_interests
]

### Sending the Request

#### Defining all necessary functions

In [6]:
def generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings,
                    policy_search_terms=None, find_top_k=6):
    """Build the extraction prompt for one policy from its most similar paragraphs.

    Every paragraph embedding is scored against the policy embedding with
    ``cosine_similarity``. The texts of the ``find_top_k`` best-scoring
    paragraphs are joined with newlines and embedded in the prompt as the
    document content.

    Reads the notebook-level ``paragraphs`` list, which must be index-aligned
    with ``paragraph_embeddings``. When ``policy_search_terms`` is not given,
    it also reads ``policy_of_interests`` and ``search_terms_synonyms`` to look
    up the synonyms that belong to ``policy_of_interest``.

    Args:
        policy_of_interest: Name of the policy to look for.
        policy_of_interest_embedding: Embedding vector of the policy name.
        paragraph_embeddings: Embedding vectors, one per entry of ``paragraphs``.
        policy_search_terms: Search-term synonyms for this policy. If ``None``,
            they are looked up by policy name; an empty string is used when no
            entry is found or the entry is missing (NaN).
        find_top_k: Number of most similar paragraphs to include.

    Returns:
        The prompt string, stripped of leading and trailing whitespace.
    """
    if policy_search_terms is None:
        # Only this policy's synonyms belong in the prompt; interpolating the
        # whole `search_terms_synonyms` list would feed the model search terms
        # for unrelated policies.
        try:
            policy_search_terms = search_terms_synonyms[policy_of_interests.index(policy_of_interest)]
        except (ValueError, IndexError):
            policy_search_terms = ""
    if isinstance(policy_search_terms, float) and np.isnan(policy_search_terms):
        # An empty CSV cell is loaded as NaN; do not print "nan" in the prompt.
        policy_search_terms = ""

    scored_paragraphs = [
        (paragraphs[index], cosine_similarity(policy_of_interest_embedding, paragraph_embedding))
        for index, paragraph_embedding in enumerate(paragraph_embeddings)
    ]
    # Highest similarity first; the sort is stable, so ties keep document order.
    scored_paragraphs.sort(key=lambda scored: scored[1], reverse=True)
    combined_paragraphs = "\n".join(text for text, _ in scored_paragraphs[:find_top_k])

    prompt = f"""
You are to extract form-related policies from the attached 'Policies_Meridian.docx'. Focus on identifying both **explicit mentions** and **indirect references** (e.g., policies embedded in documentation procedures or described without using exact policy names).

The 'Policies_Meridian.docx' file content is provided below:

{combined_paragraphs}

The policy of interest is "{policy_of_interest}" with Search Terms Synonyms "{policy_search_terms}" for targeting the location of that policy in the doc. 
Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.

Output the result in the following format:

{{
    "POLICY NAME": "Consent To Release Client Information",
    "Y/N/M": "Y",
    "POLICY DETAILS": "Meridian requires signed consent from all applicants prior to underwriting, but does not provide consent forms to brokers."
}}

For each item, say:
- "Y" if it is clearly mentioned,
- "M" if it is mentioned indirectly,
- "N" if not found.
""".strip()

    return prompt
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    """Send ``prompt`` to the local Ollama server and return its answer.

    Posts a non-streaming request for model ``deepseek-r1:8b`` to
    ``http://localhost:11434/api/generate``. The ``format`` field carries a
    JSON schema that requires the keys ``"POLICY NAME"`` (restricted to
    ``policy_of_interest``), ``"Y/N/M"`` (one of ``"Y"``, ``"M"``, ``"N"``)
    and ``"POLICY DETAILS"``.

    Args:
        prompt: The full prompt text to send.
        policy_of_interest: Policy name; the only value allowed for the
            ``"POLICY NAME"`` field of the answer.

    Returns:
        The value of the ``"response"`` field of the server's JSON reply.
        Presumably this is the generated answer as JSON-encoded text; verify
        against the Ollama API documentation.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    answer_schema = {
        "type": "object",
        "properties": {
            "POLICY NAME": {
                "type": "string",
                "enum": [policy_of_interest]
            },
            "Y/N/M": {
                "type": "string",
                "enum": ["Y", "M", "N"]
            },
            "POLICY DETAILS": {
                "type": "string"
            }
        },
        "required": ["POLICY NAME", "Y/N/M", "POLICY DETAILS"]
    }

    data = {
        "model": "deepseek-r1:8b",
        "prompt": prompt,
        "stream": False,
        "format": answer_schema
    }

    response = requests.post(url, headers=headers, json=data)
    # Report the real HTTP failure instead of a KeyError on 'response' below.
    response.raise_for_status()
    reply = response.json()
    return reply['response']

#### Loop through each policy of interest and generate the prompt

In [7]:
# Build one record per policy of interest. The request helper returns the
# model's answer as text (presumably JSON matching the requested schema), so
# it is parsed into a dict here. Appending the raw text would give a
# DataFrame with a single unnamed column of strings instead of the
# "POLICY NAME" / "Y/N/M" / "POLICY DETAILS" columns.
results = []

for policy_of_interest, policy_of_interest_embedding in zip(policy_of_interests, policy_of_interest_embeddings):
    prompt = generate_prompt(policy_of_interest, policy_of_interest_embedding, paragraph_embeddings)
    extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompt, policy_of_interest)

    if isinstance(extracted_policy_detail, dict):
        record = extracted_policy_detail
    else:
        try:
            record = json.loads(extracted_policy_detail)
        except (TypeError, ValueError):
            record = None
    if not isinstance(record, dict):
        # Keep unparseable answers visible in the output instead of aborting
        # the whole run or silently dropping the policy.
        record = {
            "POLICY NAME": policy_of_interest,
            "Y/N/M": None,
            "POLICY DETAILS": extracted_policy_detail,
        }
    results.append(record)

df_results = pd.DataFrame(results)

In [8]:
# Write the extraction results to CSV without the DataFrame index column.
extracted_policy_details_path = "../../data/preprocessed_data/extracted_policy_details.csv"
df_results.to_csv(extracted_policy_details_path, index=False)