In [11]:
import os
import re
import textwrap
import warnings

import pandas as pd
from datasets import load_dataset
from PIL import Image, ImageDraw, ImageFont

In [13]:
# Number of samples main() renders before it stops iterating the dataset.
MAX_SAMPLES = 10
# main() skips any row whose text has more than this many characters.
MAX_CHAR = 3000
# Directory the rendered PNG files are written to; main() creates it if missing.
OUTPUT_DIR = "triplet_output"

# Canvas size handed to Image.new; index 0 is used as the width when wrapping.
IMAGE_SIZE = (1024, 1024)
# Size argument passed to ImageFont.truetype when loading each font.
FONT_SIZE = 12
# Offset of the text origin from the top-left corner; also subtracted from both
# sides of the canvas width to get the usable line width.
MARGIN = 20
# Passed as `spacing` to ImageDraw.multiline_text (extra space between lines).
LINE_SPACING = 7

In [14]:
# Font files for the three renderings main() produces per sample (output files
# are suffixed "_doc", "_latex" and "_browser" respectively).
# These are absolute paths under /System/Library/Fonts, so they only resolve on
# machines that have that directory layout.
# NOTE(review): render_text_to_image falls back to Pillow's default font when a
# path cannot be loaded — confirm each file exists on the target machine,
# otherwise the images of a triplet may all use the same fallback font.
DOC_FONT = "/System/Library/Fonts/Times New Roman.ttf" 
LATEX_FONT = "/System/Library/Fonts/Supplemental/Times.ttc"  
BROWSER_FONT = "/System/Library/Fonts/Supplemental/Arial.ttf" 

In [15]:
def render_text_to_image(text, font_path, output_path):
    """Draw ``text`` in black on a white canvas and save it to ``output_path``.

    The text is wrapped with ``textwrap.fill`` so that lines fit the usable
    width (``IMAGE_SIZE[0]`` minus ``MARGIN`` on each side), drawn starting at
    ``(MARGIN, MARGIN)`` with ``LINE_SPACING`` between lines, and written out
    with ``Image.save``.

    Args:
        text: String to render. ``textwrap.fill`` is called with its default
            options, which collapse existing whitespace (including newlines),
            so paragraph breaks in ``text`` are not kept.
        font_path: Path of the font file to load at ``FONT_SIZE``. If it cannot
            be loaded, a ``RuntimeWarning`` is emitted and Pillow's default
            font is used instead.
        output_path: File path the image is saved to.

    Notes:
        The wrap width is estimated from the width of a single "A", so it is
        only approximate for proportional fonts. No check is made that the
        wrapped text fits within the canvas height.
    """
    try:
        font = ImageFont.truetype(font_path, FONT_SIZE)
    except OSError as exc:
        # Only a font that cannot be read triggers the fallback; anything else
        # (including KeyboardInterrupt) propagates. The warning makes the
        # substitution visible, since it changes what the image looks like.
        warnings.warn(
            f"Could not load font {font_path!r} ({exc}); "
            "using Pillow's default font instead.",
            RuntimeWarning,
            stacklevel=2,
        )
        font = ImageFont.load_default()

    img = Image.new("RGB", IMAGE_SIZE, "white")
    draw = ImageDraw.Draw(img)

    # Estimate how many characters fit on one line from the width of "A".
    max_width = IMAGE_SIZE[0] - 2 * MARGIN
    avg_char_width = font.getlength("A")
    # textwrap rejects a width below 1, so clamp for tiny canvases / huge fonts.
    max_chars_per_line = max(1, int(max_width / avg_char_width))

    wrapped_text = textwrap.fill(text, width=max_chars_per_line)

    draw.multiline_text(
        (MARGIN, MARGIN),
        wrapped_text,
        fill="black",
        font=font,
        spacing=LINE_SPACING,
    )

    img.save(output_path)

In [16]:
def main():
    """Render up to ``MAX_SAMPLES`` dataset texts as three-image triplets.

    Loads the ``train`` split of ``artem9k/ai-text-detection-pile``, skips rows
    whose ``text`` is longer than ``MAX_CHAR`` characters, and for each kept row
    writes ``<id>_doc.png``, ``<id>_latex.png`` and ``<id>_browser.png`` into
    ``OUTPUT_DIR`` using ``DOC_FONT``, ``LATEX_FONT`` and ``BROWSER_FONT``.
    Progress and a final summary are printed to stdout.
    """
    print("Loading dataset from Hugging Face...")
    ds = load_dataset("artem9k/ai-text-detection-pile", split="train")

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    count = 0
    total = len(ds)
    print(f"Total samples in dataset: {total}")

    # One (font file, filename suffix) pair per rendering of a sample.
    variants = (
        (DOC_FONT, "doc"),
        (LATEX_FONT, "latex"),
        (BROWSER_FONT, "browser"),
    )

    for row in ds:
        # Check the limit before anything else so the loop stops as soon as
        # enough samples are rendered, instead of scanning further over-long
        # rows until a short one finally reaches the check.
        if count >= MAX_SAMPLES:
            break

        text = row["text"]

        # filter excessively long text
        if len(text) > MAX_CHAR:
            continue

        sample_id = row["id"]

        # triplet rendering
        for font_path, suffix in variants:
            image_path = os.path.join(OUTPUT_DIR, f"{sample_id}_{suffix}.png")
            render_text_to_image(text, font_path, image_path)

        count += 1

        if count % 100 == 0:
            print(f"Generated triplets for {count} samples...")

    print("====================================================")
    print(f"Finished. Total entries processed: {count}")
    print(f"Triplets saved in: {OUTPUT_DIR}")
    print("====================================================")

# Run the pipeline only when this code is executed as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Loading dataset from Hugging Face...




Total samples in dataset: 1392522
Finished. Total entries processed: 10
Triplets saved in: triplet_output
