## Synthetic dataset generator for an organization's surveillance system

In [None]:
# Imports

import os
import torch
import accelerate
import requests
from openai import OpenAI
from transformers import pipeline
from dotenv import load_dotenv
from huggingface_hub import login
from IPython.display import Markdown, display

In [None]:
# Read credentials from the environment and report on each one

load_dotenv(override=True)
openai_api_key = os.getenv("OPENAI_API_KEY")
claude_api_key = os.getenv("ANTHROPIC_API_KEY")
gemini_api_key = os.getenv("GOOGLE_API_KEY")
hf_token = os.getenv("HF_TOKEN")

# One row per credential: (value, expected prefix, success message, failure message).
# An empty prefix means only presence is checked (the Gemini key has no prefix test).
_key_checks = (
    (openai_api_key, "sk-proj-", "OpenAI key looks good", "OpenAI key not valid"),
    (claude_api_key, "sk-ant-", "Anthropic API key looks good", "Anthropic API key not valid"),
    (gemini_api_key, "", "Gemini API key looks good", "Gemini API key not valid"),
    (hf_token, "hf", "Huggingface token looks good", "Huggingface token not valid"),
)

for _value, _prefix, _ok_msg, _bad_msg in _key_checks:
    # A missing (None) or empty value is falsy and therefore reported as not valid.
    print(_ok_msg if _value and _value.startswith(_prefix) else _bad_msg)

In [None]:
# Constants

# Local Ollama server address handed to the OpenAI client
OLLAMA_URL = "http://localhost:11434/v1"

# Hosted endpoints that are driven through the OpenAI client via base_url
ANTHROPIC_BASE_URL = "https://api.anthropic.com/v1/"
GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"

# Model identifiers passed as `model=` in the generation cells
GEMINI = "gemini-3-pro-preview"
CLAUDE = "claude-opus-4-5"
GPT_5_1 = "gpt-5.1"

In [None]:
# Log in to Hugging Face with the token read from the environment.
# NOTE(review): add_to_git_credential=True presumably also stores the token for
# git — confirm against the huggingface_hub documentation.
login(hf_token, add_to_git_credential=True)

# OpenAI client against the default endpoint (no base_url override)
openai = OpenAI(api_key=openai_api_key)

# Anthropic is reached through the same OpenAI client class, pointed at
# ANTHROPIC_BASE_URL and authenticated with the Anthropic key
anthropic = OpenAI(base_url=ANTHROPIC_BASE_URL, api_key=claude_api_key)

# Gemini likewise: OpenAI client class with GEMINI_BASE_URL and the Google key
gemini = OpenAI(base_url=GEMINI_BASE_URL, api_key=gemini_api_key)

# Local Ollama server at OLLAMA_URL.
# NOTE(review): api_key="ollama" looks like a placeholder rather than a real
# secret — confirm the local server does not validate it.
ollama = OpenAI(base_url=OLLAMA_URL, api_key="ollama")

In [None]:
# Choose the compute backend in priority order: MPS, then CUDA, then CPU.
# CUDA is only probed when MPS is unavailable.

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Announce the selected backend.
print(
    {
        "mps": "Using MPS (Apple Silicon GPU) for accelerated computation.",
        "cuda": "Using CUDA (NVIDIA GPU) for accelerated computation.",
        "cpu": "Using CPU for computation.",
    }[device]
)

# bfloat16 is used on CUDA only; every other backend stays on float32.
dtype = torch.float32 if device != "cuda" else torch.bfloat16

In [None]:
# System prompt sent as the "system" message to every model.
# It casts the model as a synthetic security-dataset generator and lists the
# global rules: fully synthetic data, no sensitive or biometric attributes,
# internally consistent records, and CSV text with a header row by default.
system_prompt = """
You are a **Synthetic Security Dataset Generator** designed to produce **realistic, fully synthetic, privacy-safe datasets** for testing surveillance, access control, and monitoring systems in any organization.

Your job is to generate **anonymized, fictional, tabular datasets** such as:
- Employee directory datasets  
- Visitor registration logs  
- Access control event logs (badge, PIN, biometrics *as labels only*)  
- CCTV activity summary datasets (text-only, no images/videos)  
- Security alert & incident logs  

These datasets are used for **simulation, testing, analytics, and system evaluation**.

---

## **GLOBAL RULES & HARD CONSTRAINTS (MUST FOLLOW)**

### **1. Data must be 100% synthetic**
- All persons must be fictional.
- Do **not** use real names of staff, companies, or public figures.
- Use neutral identifiers such as `EMP001`, `VIS001`, etc.

### **2. No sensitive or protected attributes**
Do **not** include:
- Race  
- Ethnicity  
- Religion  
- Political views  
- Medical information  
- Sex/sexual orientation  
- Any biometric templates, hashes, or measurements  

Allowed neutral domain attributes:
- Job roles  
- Departments  
- Access levels  
- Visitor purpose  
- Room/zone identifiers  
- Timestamps  

### **3. No biometric details**
- You may reference access methods such as `"CARD"`, `"PIN"`, `"FINGERPRINT"`, `"FACE_ID"` **only as labels**.
- Do **not** generate biometric vectors, facial features, fingerprint templates, etc.

### **4. Realism & internal consistency**
- Produce realistic timestamps, floors, doors, zones, and event flows.
- Ensure referential integrity:
  - Employee IDs referenced in logs must exist.
  - Visitor IDs must be consistent across datasets.
- Maintain realistic work hours, visitor hours, and after-hours patterns.

### **5. Output format requirements**
- Default output: **CSV text** with a header row.
- Do **not** include commentary unless explicitly asked.
- Do **not** wrap results in explanations, unless the user asks for them.

### **6. Privacy & ethical compliance**
- Ensure datasets cannot be used to identify real individuals.
- The outputs must be suitable for **testing system logic**, not profiling people based on sensitive traits.

---

## **Your Behavior**
- When the user requests a dataset, return **only the dataset** in the requested format.
- If a request violates constraints (e.g., asks for real people or biometrics), politely restate limitations and produce a compliant synthetic version.
- Allow the user to define:
  - The organization name  
  - The scenario  
  - The dataset type  
  - The schema  
  - The number of rows  
  - The date ranges  
  - Any additional constraints  

Always adhere to the global rules above.

"""

# User request sent as the "user" message: an access-control event log for one
# workweek, with the CSV columns, allowed values and behavioural patterns
# spelled out, and an instruction to return only the CSV.
# NOTE(review): the requested 20,000–30,000 rows is likely far more than one
# completion can return (the Phi-3 cell caps output at max_new_tokens=4000) —
# expect truncated files; confirm the intended volume.
user_prompt = """
Generate a synthetic ACCESS CONTROL EVENT LOG for NIMASA HQ.

SCENARIO:
- Time window: one full workweek, 2025-01-06 to 2025-01-10.
- Include both employees and registered visitors.
- Events are generated by door controllers, turnstiles, and biometric readers.

OUTPUT:
- Format: CSV (plain text, with a header row).
- Target volume: approximately 20,000–30,000 rows.

COLUMNS:
- event_id: unique string, e.g., “EVT000001”.
- timestamp: ISO 8601 format in local time, e.g., “2025-01-06T08:23:11+01:00”.
- subject_type: one of ["EMPLOYEE", "VISITOR", "SERVICE_ACCOUNT"].
- subject_id: employee_id or visitor_id (e.g., EMP023, VIS104), or system account like “SYS-CLEANING”.
- access_point_id: short code, e.g., “ENTRANCE_MAIN”, “PARKING_GATE_1”, “ICT_FLOOR_5_DOOR01”, “CONTROL_ROOM_DOOR”.
- direction: one of ["IN", "OUT"].
- method: one of ["CARD", "FINGERPRINT", "PIN", "FACE_ID"] – just a label, no biometric details.
- result: one of ["GRANTED", "DENIED"].
- denial_reason: empty if GRANTED, otherwise one of ["Invalid Credential", "Expired Access", "Outside Allowed Hours", "Insufficient Clearance", "Unknown Badge"].
- source_system: e.g., "ACCESS_CTRL_PANEL_01", "TURNSTILE_EAST".
- access_level_required: one of ["Low", "Medium", "High", "Restricted"].
- subject_access_level: one of ["Low", "Medium", "High", "Restricted"].
- correlated_visit_id: if a visitor, link to visitor_id or visit code; empty for employees.

PATTERNS TO INCLUDE:
- Normal behavior: employees entering between 07:30–09:30 and leaving 16:00–19:00.
- Some failed access attempts, e.g., wrong door, insufficient clearance.
- Occasional after-hours access events for ICT / Security staff.
- A small number of denied events outside normal hours.

CONSTRAINTS:
- Ensure subject_access_level is consistent with the employee master dataset assumptions.
- No sensitive attributes beyond what is listed.
- No biometric templates or numeric measurements.

Return ONLY the CSV dataset.

"""

# Chat transcript reused by every generation cell: the system rules first,
# then the dataset request.
messages = [
    {"role": speaker, "content": text}
    for speaker, text in (("system", system_prompt), ("user", user_prompt))
]

### Using an open source model via Hugging Face

In [None]:
# Using Phi 3 Instruct via Hugging Face

# Text-generation pipeline on the device and dtype selected in the device cell.
pipe = pipeline(
    "text-generation",
    model="microsoft/Phi-3-mini-4k-instruct",
    dtype=dtype,
    device=device
)

# NOTE(review): the prompt plus max_new_tokens=4000 may exceed the context
# window of this "4k" model — confirm before relying on long outputs.
output = pipe(messages, max_new_tokens=4000)

# Take the content of the last message in the first result's "generated_text"
# (presumably the assistant reply appended to the conversation — verify).
csv_text = output[0]["generated_text"][-1]["content"]

# Write with an explicit UTF-8 encoding: model text can contain non-ASCII
# characters, and the platform default encoding may fail or vary by machine.
with open("phi3_generated_dataset.csv", "w", encoding="utf-8") as f:
    f.write(csv_text)

### Using Ollama to generate synthetic data locally

In [None]:
# Probe the local Ollama server and evaluate to the raw response body.
# The timeout stops the cell from blocking indefinitely when the port accepts
# the connection but nothing answers.
requests.get("http://localhost:11434/", timeout=5).content

# If not running, run ollama serve at a command line

In [None]:
# Only do this if you have a large machine - at least 16GB RAM
# Shell escape: runs the ollama CLI to pull gpt-oss:20b, the model name the
# local generation cell requests.
!ollama pull gpt-oss:20b

In [None]:
# Using GPT OSS locally

# Send the shared system + user messages to the local Ollama client.
response = ollama.chat.completions.create(
    model="gpt-oss:20b",
    messages=messages
)

# The text of the first choice is treated as the CSV payload.
csv_text = response.choices[0].message.content

# Write with an explicit UTF-8 encoding: model text can contain non-ASCII
# characters, and the platform default encoding may fail or vary by machine.
with open("gpt_oss_generated_dataset.csv", "w", encoding="utf-8") as f:
    f.write(csv_text)

In [None]:
# Using GPT 5.1

# Send the shared system + user messages to the OpenAI client.
response = openai.chat.completions.create(
    model=GPT_5_1,
    messages=messages
)

# The text of the first choice is treated as the CSV payload.
csv_text = response.choices[0].message.content

# Write with an explicit UTF-8 encoding: model text can contain non-ASCII
# characters, and the platform default encoding may fail or vary by machine.
with open("gpt_5_1_generated_dataset.csv", "w", encoding="utf-8") as f:
    f.write(csv_text)

In [None]:
# Using Claude Opus 4.5

# Send the shared system + user messages through the OpenAI client that is
# configured with the Anthropic base URL.
response = anthropic.chat.completions.create(
    model=CLAUDE,
    messages=messages
)

# The text of the first choice is treated as the CSV payload.
csv_text = response.choices[0].message.content

# Write with an explicit UTF-8 encoding: model text can contain non-ASCII
# characters, and the platform default encoding may fail or vary by machine.
with open("claude_generated_dataset.csv", "w", encoding="utf-8") as f:
    f.write(csv_text)

In [None]:
# Using Gemini 3.0

# Send the shared system + user messages through the OpenAI client that is
# configured with the Gemini base URL.
response = gemini.chat.completions.create(
    model=GEMINI,
    messages=messages
)

# The text of the first choice is treated as the CSV payload.
csv_text = response.choices[0].message.content

# Write with an explicit UTF-8 encoding: model text can contain non-ASCII
# characters, and the platform default encoding may fail or vary by machine.
with open("gemini_3_generated_dataset.csv", "w", encoding="utf-8") as f:
    f.write(csv_text)