# vLLM direct redaction

> **⚠️⚠️⚠️ vLLM complicates running larger models compared to Ollama, which makes performance worse. ⚠️⚠️⚠️**

This notebook explores using LLMs directly to identify `Personally Identifiable Information` (PII).

> ℹ️ We use [vLLM](https://docs.vllm.ai/en/stable/) and markdown inputs as produced by something like [Docling](https://github.com/docling-project/docling).

## ⚙️ Setup

In [None]:
# Install uv by piping its official install script into the shell
!curl -LsSf https://astral.sh/uv/install.sh | sh

# Install vLLM and python-dotenv into the system Python environment via uv
!uv pip install --system vllm python-dotenv

## 🦜 LLM

In [None]:
import os
from dotenv import load_dotenv
from vllm import LLM, SamplingParams

# Load environment variables (e.g. HF_TOKEN) from a local .env file, if one exists.
load_dotenv()

# ===================================== 👇 Configure as needed =================================
# HuggingFace API token. Use .get() so that a missing variable produces the
# readable message below instead of a bare KeyError, and raise explicitly so the
# check is not stripped when Python runs with -O.
if not os.environ.get("HF_TOKEN"):
    raise RuntimeError("Please set the HF_TOKEN = hf...")

# Or: 'meta-llama/Llama-3.2-3B-Instruct'
# Or: 'mistralai/Mixtral-8x7B-Instruct-v0.1'
MODEL_ID = "microsoft/Phi-4-mini-instruct"

MAX_MODEL_LEN = 10000    # Adjust depending on available memory
DTYPE = "float16"        # Change to "bfloat16" if the GPU supports it (CUDA compute capability >= 8.0)
# ===================================== 👆 Configure as needed =================================


# Instantiate the vLLM engine once; every later cell reuses this `llm` object.
llm = LLM(
    model=MODEL_ID,
    max_model_len=MAX_MODEL_LEN,  # context window (prompt + generated tokens)
    trust_remote_code=True,       # allow custom code shipped with the model repo to run
    dtype=DTYPE,
)

In [None]:
import gc
import torch

# Collect unreachable Python objects first, then ask PyTorch to release the
# cached GPU memory it is no longer using.
gc.collect()
torch.cuda.empty_cache()

### 🗣️ Check chatter

In [None]:
def generate(prompt: str, sampling_params=None, pbar=False, **chat_args) -> str:
    """Send a single user prompt to the module-level ``llm`` and return the reply text.

    Args:
        prompt: The user message to send.
        sampling_params: Optional dict of keyword arguments for ``SamplingParams``.
            Entries override the defaults (``max_tokens=1000``, ``temperature=0.0``).
        pbar: Whether ``llm.chat`` should display a tqdm progress bar.
        **chat_args: Extra keyword arguments forwarded to ``llm.chat``.

    Returns:
        The text of the first completion of the first request.
    """
    # Compose the messages
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": prompt},
    ]
    # Compose the sampling parameters. Defaults go first so that caller-supplied
    # values override them instead of raising "got multiple values for keyword
    # argument"; `None` replaces the shared mutable `{}` default.
    params = SamplingParams(
        **{
            "max_tokens": 1000,
            "temperature": 0.0,
            **(sampling_params or {}),
        }
    )
    # Inference
    output = llm.chat(
        messages=messages, sampling_params=params, use_tqdm=pbar, **chat_args
    )
    return output[0].outputs[0].text


# Smoke test: send one prompt through the model (with a progress bar) and print the reply
ans = generate("hey there. What can you do?", pbar=True)
print(f"🗣️ Answer: {ans}")

## 📚 Data

In [None]:
from pathlib import Path
from utils import split_markdown_by_spans


# Root directory that is searched recursively for markdown documents.
DATA_DIR = Path("/datasets/client-data-us/")

# rglob() already recurses, so no "**/" prefix is needed. Sorting gives a
# reproducible order, which matters because md_docs[0] is used as the sample.
md_docs = sorted(DATA_DIR.rglob("*.md"))
print(f"Total markdown documents: {len(md_docs)}")
if not md_docs:
    raise FileNotFoundError(f"No markdown documents found under {DATA_DIR}")

# Read the first document (read_text closes the file handle for us)
text = md_docs[0].read_text(encoding="utf-8")
print(text[:200])

# Split into chunks; the result is iterated as a mapping of span id -> span text
spans = split_markdown_by_spans(text)
for span_id, span_text in spans.items():
    print(f"ID: {span_id}")
    print(f"Text: {span_text[:50]}...\n")

## 🫥 Anonymisation

In [None]:
from collections import defaultdict
from tqdm.rich import tqdm
from pprint import pformat

# Prompt skeleton: `{context}` receives one markdown chunk and `{request}` one
# of the entity questions; the model is told to answer with a bullet list or 'None'.
TEMPLATE = """
In this chunk of markdown text:

```
{context}
```

{request}

Answer with a bullet-point list if any are found. Otherwise respond 'None'
"""

# One question per entity type; each value is substituted into the `{request}`
# slot of the prompt template, so every value must be a plain string.
requests = {
    # No trailing comma inside the parentheses: it would turn this value into a
    # 1-tuple and the prompt would render as "('What are ...',)".
    "orgs": (
        "What are the companies mentioned in the text? "
        "(Do not include placeholders like: 'Developer', 'Customer', 'Distributor' or similar.)"
    ),
    # NOTE(review): the "loc" and "people" prompts are grammatically off
    # ("addresses mentioned", "person's"); left unchanged to keep prompts stable.
    "loc": "What locations or addresses mentioned in the text?",
    "contact": "What telephone-numbers or emails are mentioned in the text?",
    "people": "What person's are mentioned in the text?",
    "date": "What dates are mentioned in the text?",
}

def _parse_bullets(answer):
    """Return the non-empty items of the markdown bullet list contained in *answer*.

    Only lines that start with "- " or "* " are treated as items, so preamble
    text such as "Here are the companies:" is ignored. Blank items and a literal
    "None" item are dropped, so an empty string can never become an entity.
    """
    items = []
    for line in answer.splitlines():
        line = line.strip()
        if not line.startswith(("- ", "* ")):
            continue
        item = line[2:].strip()
        if item and item.casefold() != "none":
            items.append(item)
    return items


# entities[doc_name][entity_type] -> list of entity strings reported by the model
entities = defaultdict(lambda: defaultdict(list))
doc_iter = tqdm(md_docs)
for md_doc in doc_iter:
    doc_name = md_doc.stem
    doc_iter.set_description(f"📄 {doc_name}...")
    # read_text closes the file handle, unlike open().read()
    spans = split_markdown_by_spans(md_doc.read_text(encoding="utf-8"))
    for _span_id, span_text in spans.items():
        for k, question in requests.items():
            ans = generate(TEMPLATE.format(context=span_text, request=question))
            found = _parse_bullets(ans)
            # Only touch the defaultdict when something was found, so entity
            # types with no hits do not appear as empty lists.
            if found:
                entities[doc_name][k] += found

    # NOTE(review): only the first document is processed - presumably to keep
    # the demo fast. Remove this `break` to process every document.
    break


print(pformat(entities))

In [None]:
# Build, per document, a lookup from entity text to its placeholder label.
# The label is the upper-cased entity type plus the entity's index within that
# type's list; an entity text that occurs more than once keeps the last label.
placeholders = {
    name: {
        entity: f"{kind.upper()}_{idx}"
        for kind, found in per_type.items()
        for idx, entity in enumerate(found)
    }
    for name, per_type in entities.items()
}

print(pformat(placeholders))

In [None]:
import copy
import re


def mask_text(text, entities):
    """Replace every occurrence of each entity in *text* with ``[PLACEHOLDER]``.

    Args:
        text: The document text to mask.
        entities: Mapping of entity string -> placeholder label.

    Returns:
        A new string with the entities replaced. The generic role names
        "Customer", "Developer" and "Distributor" and empty entity strings
        are never masked.
    """
    skipped = {"Customer", "Developer", "Distributor"}
    mapping = {
        ent: placeholder
        for ent, placeholder in entities.items()
        if ent and ent not in skipped
    }
    for ent, placeholder in mapping.items():
        print(f"{ent} --> {placeholder}")
    if not mapping:
        return text

    # Entities are literal text, not regular expressions: escape them so that
    # characters such as "+", "(" or "." in phone numbers and emails match
    # themselves. Longest first, so "Acme Corp" wins over "Acme". A single pass
    # also guarantees that inserted placeholders are never re-matched.
    pattern = re.compile(
        "|".join(re.escape(ent) for ent in sorted(mapping, key=len, reverse=True))
    )
    return pattern.sub(lambda match: f"[{mapping[match.group(0)]}]", text)
    
    
# Mask the first document using the placeholders collected for it
doc = md_docs[0]
doc_name = doc.stem
doc_text = doc.read_text(encoding="utf-8")  # read_text closes the file handle
# .get(): a document for which no entities were found has no placeholder entry
print(mask_text(doc_text, placeholders.get(doc_name, {})))