In [11]:
import os
import re
import textwrap
import warnings

import pandas as pd
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont

In [12]:
# --- Sampling limits ---
# Upper bound on how many samples main() renders for each label ("ai" / "human").
MAX_PER_CLASS = 5000
# Texts longer than this many characters are skipped by main().
MAX_CHAR = 3500
# Directory that receives the rendered PNG files (created by main() if missing).
OUTPUT_DIR = "triplet_output"

# --- Rendering layout (consumed by render_text_to_image) ---
# Canvas size passed to Image.new; index 0 is used as the width when wrapping.
IMAGE_SIZE = (1024, 1024)
# Size passed to ImageFont.truetype when loading each font.
FONT_SIZE = 14
# Offset of the text origin from the top-left corner; also subtracted from
# both sides of the canvas width when computing the wrap width.
MARGIN = 15
# Passed as `spacing` to ImageDraw.multiline_text (extra space between lines).
LINE_SPACING = 9

In [13]:
# Font files used for the three renderings of every sample; main() writes
# them as "<id>_doc.png", "<id>_latex.png" and "<id>_browser.png".
# NOTE(review): these are absolute macOS system paths. If a font cannot be
# loaded, render_text_to_image falls back to Pillow's default font, so on
# another machine the three variants may come out identical -- confirm the
# files resolve on the machine that generates the dataset.
DOC_FONT = "/System/Library/Fonts/Times New Roman.ttf" 
LATEX_FONT = "/System/Library/Fonts/Supplemental/Times.ttc"  
BROWSER_FONT = "/System/Library/Fonts/Supplemental/Arial.ttf" 

In [14]:
def render_text_to_image(text, font_path, output_path):
    """Render ``text`` as wrapped black text on a white canvas and save it.

    The canvas is an RGB image of ``IMAGE_SIZE``. The text is wrapped with
    ``textwrap.fill`` to a character count estimated from the pixel width of
    the letter "A" in the chosen font, then drawn starting at
    ``(MARGIN, MARGIN)`` with ``LINE_SPACING`` between lines. There is no
    check that the wrapped text fits vertically on the canvas.

    Args:
        text: The string to render. ``textwrap.fill`` is used with its default
            settings, so existing line breaks are treated as plain whitespace.
        font_path: Path to a font file loadable by ``ImageFont.truetype``.
        output_path: Destination file path; the format is chosen by
            ``Image.save`` from the file extension.

    Warns:
        RuntimeWarning: If the font at ``font_path`` cannot be loaded and
            Pillow's default font is used instead.
    """
    try:
        font = ImageFont.truetype(font_path, FONT_SIZE)
    except (OSError, ImportError) as exc:
        # Keep the best-effort fallback, but do not hide it: a silently
        # substituted font makes renderings that are supposed to differ by
        # typeface come out identical. Catching only load failures also lets
        # KeyboardInterrupt and genuine programming errors propagate.
        warnings.warn(
            f"Could not load font {font_path!r} ({exc}); "
            "falling back to Pillow's default font.",
            RuntimeWarning,
            stacklevel=2,
        )
        font = ImageFont.load_default()

    img = Image.new("RGB", IMAGE_SIZE, "white")
    draw = ImageDraw.Draw(img)

    # Estimate how many characters fit on one line from the width of "A".
    max_width = IMAGE_SIZE[0] - 2 * MARGIN
    avg_char_width = font.getlength("A")
    # textwrap rejects a width <= 0, so never go below one character per line.
    max_chars_per_line = max(1, int(max_width / avg_char_width))

    wrapped_text = textwrap.fill(text, width=max_chars_per_line)

    draw.multiline_text(
        (MARGIN, MARGIN),
        wrapped_text,
        fill="black",
        font=font,
        spacing=LINE_SPACING,
    )

    img.save(output_path)

In [15]:
def main():
    """Render up to ``MAX_PER_CLASS`` AI and human texts as image triplets.

    Loads the "train" split of ``artem9k/ai-text-detection-pile``, walks it in
    order and, for every row whose text is at most ``MAX_CHAR`` characters and
    whose lower-cased ``source`` is "ai" or "human" (and still under its
    quota), writes three PNGs into ``OUTPUT_DIR`` -- one per configured font.
    Iteration stops early once both quotas are full. A summary of the counts
    is printed at the end.
    """
    dataset = load_dataset("artem9k/ai-text-detection-pile", split="train")
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Number of samples rendered so far, keyed by the accepted labels.
    rendered = {"ai": 0, "human": 0}

    for record in dataset:
        text = record["text"]
        text_id = record["id"]
        source = record["source"].lower()

        # Skip texts that are too long.
        if len(text) > MAX_CHAR:
            continue

        # Skip unknown labels and labels whose quota is already used up.
        if source not in rendered or rendered[source] >= MAX_PER_CLASS:
            continue

        # One rendering per font, written in the order doc, latex, browser.
        variants = (
            (DOC_FONT, f"{OUTPUT_DIR}/{text_id}_doc.png"),
            (LATEX_FONT, f"{OUTPUT_DIR}/{text_id}_latex.png"),
            (BROWSER_FONT, f"{OUTPUT_DIR}/{text_id}_browser.png"),
        )
        for font_path, target in variants:
            render_text_to_image(text, font_path, target)

        rendered[source] += 1

        # Both quotas filled: nothing left to collect.
        if all(count == MAX_PER_CLASS for count in rendered.values()):
            break

    ai_count = rendered["ai"]
    human_count = rendered["human"]

    print("======================================")
    print(f"AI samples: {ai_count} → {ai_count * 3} images")
    print(f"Human samples: {human_count} → {human_count * 3} images")
    print(f"Total images: {(ai_count + human_count) * 3}")
    print("======================================")

# Entry point: run the full rendering pipeline when this code is executed
# directly (including as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

AI samples: 5000 → 15000 images
Human samples: 5000 → 15000 images
Total images: 30000
