## `Imports & Setup`

#### External Libraries

In [30]:
import requests
import mammoth
import os
import re
import sys
import time
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import json
import torch

# Backend name as a plain string ("cuda" or "cpu"); later cells compare
# against the literal "cuda" to decide how embeddings are computed.
device = "cuda" if torch.cuda.is_available() else "cpu"

#### >>> Config Variables <<<

##### Row Range

- `full`

- `first_half`

- `second_half`

- `random_X`: replace X with the number of policies to be randomly selected

- `the_name_of_the_policy`: replace with the name of the policy

- `from_X_to_Y` / `from_start_to_Y` / `from_X_to_end` : replace X and Y with the range of policies to be selected

In [31]:
# Selects which rows of the policies CSV are processed; the policy-loading
# cell parses this string using the formats listed above.
row_range = "from_240_to_end"

##### Find Top K

The `find_top_k` paragraphs with the highest similarity score against the policy name and its search terms are inserted into the prompt.

In [32]:
# Number of top-scoring paragraphs that generate_prompt keeps for each policy.
find_top_k = 3

##### Policy Weight

`score_of_similarity`

$ =\ $`policy_weight` $\times$ `cosine_similarity` between policy and paragraph $ + ( 1 - $`policy_weight` $) \times$ `cosine_similarity` between search terms and paragraph

In [33]:
# Weight of the policy-name similarity in generate_prompt's score; the
# search-terms similarity is weighted by (1 - policy_weight).
policy_weight = 0.65

##### Extracting Model

`extracting_model` will be used to extract the policy details by using the policy name and the search terms.

- `deepseek-r1:8b`

- `deepseek-r1:14b`

In [34]:
# Model name sent in the "model" field of the Ollama /api/generate request.
extracting_model = "deepseek-r1:8b"

##### Embedding Model

`embedding_model` will be used to generate the embeddings for the policy and the paragraphs.

In [35]:
# Embedding model per backend: the "gpu" entry is loaded through transformers
# when device == "cuda"; the "cpu" entry is requested from the local Ollama
# /api/embed endpoint otherwise.
embedding_model = {"gpu": "intfloat/e5-base", "cpu": "nomic-embed-text"}

##### Prompt Template

`prompt_template` is the template that will be used to generate the prompt for the LLM.

In [36]:
# Prompt sent to the extracting model. generate_prompt fills the three
# ${...}$ placeholders (combined_paragraphs, policy_of_interest,
# search_terms_synonym) with plain str.replace, so the placeholder spelling
# here must match the strings used there exactly.
prompt_template = """
    You are an expert in policy analysis. Your task is to extract **form-related policies** from the document titled *Policies_Meridian.docx*.
    
    1. **Explicitly stated** — The policy is clearly and directly described in the text, with no ambiguity or need for interpretation.
    2. **Implicitly referenced** — The policy is not stated directly, but its presence can be **reasonably and confidently inferred** from specific procedures, requirements, or descriptions that **clearly align with the policy's core intent**.

    ❗ Do **not** infer based on vague, generic, or loosely related content.  
    ❗ If the policy is not clearly present or the reference is too indirect or speculative, treat it as not included.

    ---
    
    ### Document Content:
    Below is a selection of paragraphs extracted from *Policies_Meridian.docx*:
    
    ${combined_paragraphs}$
    
    ---

    ### Target Policy:
    - Policy of Interest: **"${policy_of_interest}$"**
    - Search Term Synonyms: **"${search_terms_synonym}$"**
    
    Use these keywords and any related concepts to locate relevant policies. 
    Remember to extract not just explicit mentions but also policies that are implied or embedded in procedures.
    Do **not** guess or make assumptions. Only mark a policy as found if there is **clear textual evidence**.
    
    ---
    
    ### Output Instructions:
    
    For **each policy instance** found, provide the following:

    1. **Y/N/M**:
    - `"Y"` - Clearly mentioned (explicitly and unambiguously stated).
    - `"M"` - Mentioned indirectly (implied, inferred, or part of a procedure).
    - `"N"` - The policy does not appear in the document in any clear or inferable form.
    ⛔ Do not guess. If unsure, default to `"N"`.

    2. **POLICY DETAILS**:
    - Copy the **exact original sentence(s)** that describe or imply the policy. If it’s implied, use only specific and logically tied text — no rewording. 
    - This is a highly sensitive policy detail extraction. **Do not paraphrase. Use the original sentence(s) from the document as much as possible**. 
    - If multiple sentences support the policy, return a **verbatim combination** of those sentences.

    ---
"""

### Functions

In [37]:
def print_progress_bar(iteration, total, prefix='', length=40, start_time=None, line_width=256):
    """Redraw a single-line red progress bar with an ETA on stdout.

    Args:
        iteration: Number of items completed so far.
        total: Total number of items. A value <= 0 is rendered as 100%
            complete instead of raising ZeroDivisionError.
        prefix: Free-form status text shown after the ETA.
        length: Width of the bar in characters.
        start_time: `time.time()` value captured when the work began. When
            omitted (or falsy) no elapsed time is assumed and the ETA is 0.
        line_width: The line is right-padded with spaces to this width so a
            shorter redraw fully overwrites a longer previous one.
    """
    elapsed = time.time() - start_time if start_time else 0
    avg_time = elapsed / iteration if iteration > 0 else 0
    eta = avg_time * (total - iteration)

    if total > 0:
        fraction = iteration / float(total)
        filled_length = int(length * iteration // total)
    else:
        # Nothing to do counts as finished; avoids dividing by zero.
        fraction = 1.0
        filled_length = length

    percent = f"{100 * fraction:.1f}"
    bar_color = '\033[31m'  # ANSI red; reset with \033[0m after the bar
    bar = bar_color + '█' * filled_length + '-' * (length - filled_length) + '\033[0m'

    eta_min = int(eta // 60)
    eta_sec = int(eta % 60)

    line = f"|{bar}| {percent}% Complete | ETA: {eta_min}m {eta_sec}s | {prefix}"
    padded_line = line.ljust(line_width)

    # The carriage return moves the cursor to the line start so each call
    # overwrites the previous bar instead of printing a new line.
    sys.stdout.write('\r' + padded_line)
    sys.stdout.flush()
def load_docx_to_markdown(docx_path):
    """Read the .docx file at `docx_path` and return mammoth's Markdown conversion of it."""
    with open(docx_path, "rb") as source:
        return mammoth.convert_to_markdown(source).value
def split_text_into_paragraphs(text, chunk_size=3, merge_headings=True, sentence_tokenizer=None):
    """Split policy text into chunks of `chunk_size` sentences.

    The text is cleaned (glued terms separated, whitespace normalised), cut
    into sections at heading-like lines, bullet lines are turned into
    sentences, and every section is tokenised into sentences that are grouped
    `chunk_size` at a time.

    Line breaks are kept until each section has been processed. Previously
    all whitespace, including newlines, was collapsed to single spaces first,
    so the newline-anchored heading split and bullet conversion could never
    match.

    Args:
        text: Raw document text (Markdown as produced from the .docx).
        chunk_size: Number of sentences per chunk; must be >= 1.
        merge_headings: When True, a section with at most one sentence (a
            heading with no body) is appended to the previous chunk instead
            of becoming its own chunk.
        sentence_tokenizer: Optional object exposing `tokenize(str) -> list`.
            Defaults to a fresh `PunktSentenceTokenizer`.

    Returns:
        list[str]: Single-line chunks in document order.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Fix encoding and glued terms (e.g. BenchmarkRate → Benchmark Rate)
    text = text.replace("�", " ").replace("•", "*")
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)        # aB → a B
    text = re.sub(r'(?<=\d)(?=[A-Z])', ' ', text)           # 25Years → 25 Years
    text = re.sub(r'(?<=[a-zA-Z])(?=\d)', ' ', text)        # abc123 → abc 123
    # The former "inferred periods" substitution was removed: after the
    # aB → a B step no lowercase letter is directly followed by an uppercase
    # one, so it could never match.

    # Normalize spacing but keep one newline per line break, because the
    # heading and bullet patterns below are anchored on "\n".
    text = re.sub(r'[^\S\n]*\n\s*', '\n', text)       # blank lines / padded breaks → one "\n"
    text = re.sub(r'[^\S\n]+', ' ', text).strip()     # collapse horizontal whitespace

    # Split at likely section headings (short lines starting with a capital)
    sections = re.split(r'\n(?=[A-Z][^\n]{3,60}\n)', text)

    tokenizer = sentence_tokenizer if sentence_tokenizer is not None else PunktSentenceTokenizer()
    chunks = []

    for section in sections:
        section = section.strip()
        if not section:
            continue

        # Convert bullet-style lines into full sentences
        section = re.sub(r"\n\s*\*\s*", ". ", section)
        section = re.sub(r"\*\s*", "", section)

        # Break before heading-like phrases
        section = re.sub(r'(?<=\. )([A-Z][^\n]{3,60})(?= )', r'\n\1', section)

        sentences = tokenizer.tokenize(section)

        # Attach short heading-only lines to previous chunk
        if merge_headings and len(sentences) <= 1 and chunks:
            chunks[-1] += " " + " ".join(section.split())
            continue

        # Group into N-sentence chunks; split()/join flattens the line
        # breaks that were kept for the patterns above.
        for i in range(0, len(sentences), chunk_size):
            chunk = " ".join(" ".join(sentences[i:i + chunk_size]).split())
            if chunk:
                chunks.append(chunk)

    return chunks
# Loaded (tokenizer, model) pairs keyed by model name, so the weights are read
# from disk and moved to the GPU once instead of on every get_embedding call.
_GPU_EMBEDDER_CACHE = {}


def _load_gpu_embedder(model_name):
    """Return the cached (tokenizer, model) pair for `model_name`, loading it on first use."""
    if model_name not in _GPU_EMBEDDER_CACHE:
        _GPU_EMBEDDER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name).to("cuda"),
        )
    return _GPU_EMBEDDER_CACHE[model_name]


def get_embedding(text):
    """Return the embedding vector of `text` as a list of floats.

    With `device == "cuda"` the `embedding_model["gpu"]` model is run locally
    through transformers; otherwise `embedding_model["cpu"]` is requested from
    the Ollama server at localhost:11434.

    Args:
        text: A string. A list of strings is accepted too, but only the
            embedding of its first element is returned.

    Returns:
        list[float]: The embedding of the (first) input text.

    Raises:
        requests.HTTPError: If the Ollama endpoint answers with an error status.
    """
    if device == "cuda":
        tokenizer, model = _load_gpu_embedder(embedding_model["gpu"])

        if isinstance(text, str):
            text = [text]

        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
        with torch.no_grad():
            outputs = model(**inputs)
        # First-token vector of the last hidden layer is used as the embedding.
        # NOTE(review): the intfloat/e5 model card pools with an attention-masked
        # mean and prefixes inputs with "query: " / "passage: " — confirm that
        # first-token pooling without prefixes is intended here.
        embeddings = outputs.last_hidden_state[:, 0, :]

        return embeddings.cpu().detach().numpy().tolist()[0]

    payload = {
        "model": embedding_model["cpu"],
        "input": text
    }
    response = requests.post("http://localhost:11434/api/embed", json=payload)
    # Fail with the HTTP status rather than a KeyError on "embeddings" below.
    response.raise_for_status()
    return response.json()["embeddings"][0]
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        vec1, vec2: Sequences or arrays of numbers.

    Returns:
        The cosine similarity in [-1, 1]. If either vector has zero length
        the similarity is undefined; 0.0 is returned instead of NaN so the
        value can still be used as a sort key when ranking paragraphs.
    """
    vec1, vec2 = np.asarray(vec1, dtype=float), np.asarray(vec2, dtype=float)
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator
def generate_prompt(policy_of_interest, search_terms_synonym, policy_of_interest_embedding, paragraph_embeddings, search_terms_synonyms_embedding):
    """Build the extraction prompt for one policy from its best-matching paragraphs.

    Every paragraph is scored as
    `policy_weight * cos(policy, paragraph) + (1 - policy_weight) * cos(search terms, paragraph)`,
    and the `find_top_k` highest-scoring paragraphs are substituted into
    `prompt_template` together with the policy name and search terms.

    Paragraph texts come from the notebook-level `paragraphs` list, which must
    be index-aligned with `paragraph_embeddings`.

    Args:
        policy_of_interest: Policy name inserted into the prompt.
        search_terms_synonym: Search-term text inserted into the prompt.
        policy_of_interest_embedding: Embedding of the policy name.
        paragraph_embeddings: One embedding per entry of `paragraphs`.
        search_terms_synonyms_embedding: Embedding of the search terms.

    Returns:
        tuple[str, str]: The prompt and the newline-joined selected paragraphs.

    Raises:
        ValueError: If `paragraphs` and `paragraph_embeddings` differ in length
            (for example when one of them was recomputed without the other).
    """
    if len(paragraphs) != len(paragraph_embeddings):
        raise ValueError(
            f"paragraphs ({len(paragraphs)}) and paragraph_embeddings "
            f"({len(paragraph_embeddings)}) are out of sync; re-run the embedding cell."
        )

    scored_paragraphs = []
    for paragraph_text, paragraph_embedding in zip(paragraphs, paragraph_embeddings):
        score = (
            policy_weight * cosine_similarity(policy_of_interest_embedding, paragraph_embedding)
            + (1 - policy_weight) * cosine_similarity(search_terms_synonyms_embedding, paragraph_embedding)
        )
        scored_paragraphs.append((paragraph_text, score))

    # The sort is stable, so paragraphs with equal scores keep document order.
    scored_paragraphs.sort(key=lambda item: item[1], reverse=True)
    combined_paragraphs = "\n".join(text for text, _ in scored_paragraphs[:find_top_k])

    prompt = prompt_template.replace("${combined_paragraphs}$", combined_paragraphs)
    prompt = prompt.replace("${policy_of_interest}$", policy_of_interest)
    prompt = prompt.replace("${search_terms_synonym}$", search_terms_synonym)
    # str.strip() returns a new string; the result used to be discarded.
    prompt = prompt.strip()

    return prompt, combined_paragraphs
def request_extracted_policy_detail_from_ollama(prompt, policy_of_interest):
    """Ask the local Ollama server to classify and quote one policy.

    Sends `prompt` to /api/generate with `extracting_model`, constraining the
    answer to a JSON object with the keys "Y/N/M" (one of "Y", "M", "N") and
    "POLICY DETAILS", then adds "POLICY NAME" to the parsed answer.

    Args:
        prompt: Fully rendered prompt text.
        policy_of_interest: Policy name stored under "POLICY NAME".

    Returns:
        str: JSON text with the keys "Y/N/M", "POLICY DETAILS" and "POLICY NAME".

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the reply has no "response" field.
        json.JSONDecodeError: If the model output is not valid JSON.
    """
    url = "http://localhost:11434/api/generate"

    headers = {
        "Content-Type": "application/json"
    }

    # JSON schema passed as "format" so the model's answer is structured.
    answer_schema = {
        "type": "object",
        "properties": {
            "Y/N/M": {
                "type": "string",
                "enum": ["Y", "M", "N"]
            },
            "POLICY DETAILS": {
                "type": "string"
            }
        },
        "required": ["Y/N/M", "POLICY DETAILS"]
    }

    data = {
        "model": extracting_model,
        "prompt": prompt,
        "stream": False,
        "format": answer_schema
    }

    response = requests.post(url, headers=headers, json=data)
    # Report HTTP failures with their status instead of a KeyError on
    # the missing "response" field further down.
    response.raise_for_status()

    # The generated text arrives as a JSON string inside the "response" field.
    answer = json.loads(response.json()['response'])
    answer["POLICY NAME"] = policy_of_interest
    return json.dumps(answer)

## `Load & Preprocess`

#### Load the DOCX and embed its paragraphs into vectors

In [38]:
# Convert the policy document to Markdown and cut it into sentence chunks.
policies_meridian_markdown = load_docx_to_markdown("data/preprocessed_data/policies_meridian.docx")
paragraphs = split_text_into_paragraphs(policies_meridian_markdown)

# Embed each chunk in order; paragraph_embeddings[i] belongs to paragraphs[i].
paragraph_embeddings = []
start_time = time.time()
for done, paragraph in enumerate(paragraphs, start=1):
    paragraph_embeddings.append(get_embedding(paragraph))
    print_progress_bar(done, len(paragraphs), prefix="Embeddings", start_time=start_time, line_width=128)

|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

#### Load Policies & Search Terms Synonyms and embed them into vectors

In [39]:
def _select_policy_rows(policies_df, selector):
    """Return the rows of `policies_df` chosen by a `row_range` selector string.

    Supported selectors: "full", "first_half", "second_half", "random_X",
    "from_X_to_Y" (X may be "start", Y may be "end"), or an exact policy name.
    Selector keywords are matched as whole strings so a policy whose name
    merely contains "random" or "from_" is still looked up by name.

    Raises:
        ValueError: If the range is reversed or the named policy is missing.
    """
    total = len(policies_df)

    if selector == "full":
        return policies_df.iloc[0:total]
    if selector == "first_half":
        return policies_df.iloc[0:total // 2]
    if selector == "second_half":
        return policies_df.iloc[total // 2:total]

    random_match = re.fullmatch(r"random_(\d+)", selector)
    if random_match:
        # Fixed random_state keeps the sample reproducible between runs.
        return policies_df.sample(n=int(random_match.group(1)), random_state=1)

    range_match = re.fullmatch(r"from_(start|-?\d+)_to_(end|-?\d+)", selector)
    if range_match:
        start_token, end_token = range_match.groups()
        start_row = 0 if start_token == "start" else max(int(start_token), 0)
        end_row = total if end_token == "end" else min(int(end_token), total)
        if start_row > end_row:
            raise ValueError(f"Invalid range: {start_row} to {end_row}.")
        return policies_df.iloc[start_row:end_row]

    selected = policies_df[policies_df["POLICY NAME"] == selector]
    if len(selected) == 0:
        raise ValueError(f"Policy {selector} not found in the CSV file.")
    return selected


df = pd.read_csv("./data/preprocessed_data/policies_to_update.csv")
df = df.drop(index=[0,1])
df = df.drop(columns=["Y/N/M", "POLICY DETAILS", "Prompt"])
# row_range itself is left untouched so this cell can be re-run safely.
df = _select_policy_rows(df, row_range)

policy_of_interests = df["POLICY NAME"].tolist()
search_terms_synonyms = df["Search Terms Synonyms"].tolist()
policy_of_interest_embeddings = []
search_terms_synonyms_embeddings = []

start_time = time.time()
for p in policy_of_interests:
    policy_of_interest_embeddings.append(get_embedding(p))
    print_progress_bar(len(policy_of_interest_embeddings), len(policy_of_interests), prefix="Embeddings", start_time=start_time, line_width=128)

|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

In [40]:
# Start from an empty list so re-running this cell replaces the embeddings
# instead of appending a second copy and breaking index alignment.
search_terms_synonyms_embeddings = []
start_time = time.time()
for s in search_terms_synonyms:
    # str() because a missing CSV value is a float NaN, not a string.
    search_terms_synonyms_embeddings.append(get_embedding(str(s)))
    print_progress_bar(len(search_terms_synonyms_embeddings), len(search_terms_synonyms), prefix="Embeddings", start_time=start_time, line_width=128)

|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | Embeddings                                   

## `Generate Table`

#### Loop through each policy of interest, generate the prompt, and query the extracting model

In [41]:
results = []
prompts = []
parsed_results = []
n_count = 0
start_time = time.time()

policy_to_paragraphs = []

# Pass 1: build every prompt and remember which paragraphs it was built from.
for i, policy_of_interest in enumerate(policy_of_interests):
    search_terms_synonym = str(search_terms_synonyms[i])
    policy_of_interest_embedding = policy_of_interest_embeddings[i]
    search_terms_synonyms_embedding = search_terms_synonyms_embeddings[i]
    prompt, combined_paragraphs = generate_prompt(policy_of_interest, search_terms_synonym, policy_of_interest_embedding, paragraph_embeddings, search_terms_synonyms_embedding)
    prompts.append(prompt)
    policy_to_paragraphs.append({
        "POLICY NAME": policy_of_interest,
        "Paragraphs": combined_paragraphs
    })

# Pass 2: query the model once per policy. The request is inside the try
# block so a single failed call is recorded as an ERROR row instead of
# aborting the run and losing every result collected so far.
for i, policy_of_interest in enumerate(policy_of_interests):
    try:
        extracted_policy_detail = request_extracted_policy_detail_from_ollama(prompts[i], policy_of_interest)
        detail_dict = json.loads(extracted_policy_detail)
    except requests.RequestException as e:
        failure = ("Request Error", f"Request failed: {str(e)}")
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError.
        failure = ("Parse Error", f"Failed to parse: {str(e)}")
    else:
        failure = None

    if failure is not None:
        status, message = failure
        extracted_policy_detail = None
        detail_dict = {
            "POLICY NAME": policy_of_interest,
            "Y/N/M": "ERROR",
            "POLICY DETAILS": message
        }
    else:
        verdict = str(detail_dict.get("Y/N/M", "")).strip()
        status = verdict if verdict in ("Y", "M", "N") else "ERROR"
        if status == "N":
            n_count += 1

    # Exactly one entry per policy keeps both lists index-aligned with
    # policy_of_interests.
    results.append(extracted_policy_detail)
    parsed_results.append(detail_dict)

    print_progress_bar(i + 1, len(policy_of_interests), 
                       prefix=f"{n_count } N's | {policy_of_interest} → {status}",
                       start_time=start_time)

|[31m████████████████████████████████████████[0m| 100.0% Complete | ETA: 0m 0s | 2 N's | NICHES → N                                                                                                                                                           

#### Reformat the response into a table

In [42]:
# One row per policy with the model's verdict, in the column order used for the CSV.
results_df = pd.DataFrame(parsed_results).loc[:, ["Y/N/M", "POLICY NAME", "POLICY DETAILS"]]

# One row per policy with the paragraphs that were placed in its prompt.
policy_to_paragraphs_df = pd.DataFrame(policy_to_paragraphs).loc[:, ["POLICY NAME", "Paragraphs"]]

#### Writing to CSV

In [43]:
# Each run writes into its own timestamped folder next to the notebook.
current_date_time = time.strftime("%Y-%m-%d_%H-%M-%S")
os.makedirs(current_date_time, exist_ok=True)
results_df.to_csv(f"./{current_date_time}/extracted_policy_details.csv", index=False)
policy_to_paragraphs_df.to_csv(f"./{current_date_time}/policy_to_paragraphs.csv", index=False)

# Record the settings that produced these files so the run can be repeated.
device_label = torch.cuda.get_device_name(0) if device == 'cuda' else 'cpu'
active_embedding_model = embedding_model['gpu'] if device == "cuda" else embedding_model['cpu']
with open(f"./{current_date_time}/config_variables.md", "w", encoding="utf-8") as f:
    f.write("## Configuration Variables\n")
    f.write("| variable name | value |\n")
    f.write("|---|---|\n")
    f.write(f"| device | `{device_label}` | \n")
    # row_range decides which policies were processed; without it the
    # output cannot be matched to its input rows.
    f.write(f"| row_range | `{row_range}` | \n")
    f.write(f"| find_top_k | `{find_top_k}` | \n")
    f.write(f"| policy_weight | `{policy_weight}` | \n")
    f.write(f"| embedding_model | `{active_embedding_model}` | \n")
    f.write(f"| extracting_model | `{extracting_model}` | \n")
    f.write(f" \n **prompt_template**: \n ```{prompt_template}``` \n")