In [1]:
import os
import re
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
import textwrap

In [11]:
# --- Input / output locations ---
CSV_FILE = "AI Generated Essays Dataset.csv"  # table read by main()
OUTPUT_DIR = "output"                         # directory that receives the rendered PNGs
LABELS_CSV = "labels.csv"                     # ID / ground_truth table written by main()

# --- Rendering parameters ---
IMAGE_SIZE = (1024, 1024)  # canvas size handed to Image.new; index 0 is used as the width
MARGIN = 40                # text origin offset; subtracted twice from the width when wrapping
FONT_SIZE = 32             # size requested from ImageFont.truetype
LINE_SPACING = 10          # `spacing` argument passed to multiline_text

# TrueType font file; load_font() falls back to PIL's default font if loading it raises OSError.
FONT_PATH = "/System/Library/Fonts/Supplemental/Arial.ttf"

In [12]:
def load_font():
    """Return the font used to render essay text.

    Tries the TrueType file at ``FONT_PATH`` at ``FONT_SIZE``. If that raises
    ``OSError``, a warning is emitted and Pillow's built-in default font is
    returned instead, at ``FONT_SIZE`` when the installed Pillow supports a
    sized default font.
    """
    try:
        return ImageFont.truetype(FONT_PATH, FONT_SIZE)
    except OSError:
        import warnings

        # Make the fallback visible: every image produced in this run will use
        # a different typeface than the configured one.
        warnings.warn(
            f"Could not load font {FONT_PATH!r}; falling back to PIL's default font"
        )
        try:
            # Newer Pillow releases accept a size for the built-in font, which
            # keeps the rendered text at the configured FONT_SIZE.
            return ImageFont.load_default(size=FONT_SIZE)
        except TypeError:
            # Older Pillow: load_default() takes no size argument.
            return ImageFont.load_default()

In [13]:
def text_to_image(text, output_path, font):
    """Render ``text`` in black on a white canvas and save it to ``output_path``.

    The text is re-wrapped with ``textwrap.fill`` to a per-line character
    budget derived from the width of the glyph "A" in ``font``, then drawn
    starting at ``(MARGIN, MARGIN)`` with ``LINE_SPACING`` between lines.

    NOTE(review): the canvas is always ``IMAGE_SIZE`` and is never enlarged,
    so wrapped text taller than the canvas presumably gets cut off at the
    bottom edge -- confirm this is intended for long essays.
    """
    usable_width = IMAGE_SIZE[0] - 2 * MARGIN

    # Estimate how many characters fit on one line, using "A" as the
    # reference glyph width.
    chars_per_line = int(usable_width / font.getlength("A"))
    body = textwrap.fill(text, width=chars_per_line)

    canvas = Image.new("RGB", IMAGE_SIZE, "white")
    ImageDraw.Draw(canvas).multiline_text(
        (MARGIN, MARGIN),
        body,
        fill="black",
        font=font,
        spacing=LINE_SPACING,
    )
    canvas.save(output_path)

In [14]:
def main():
    """Render every essay in ``CSV_FILE`` to a PNG and write the labels table.

    Reads the ``text`` and ``generated`` columns of ``CSV_FILE``. Each row
    whose text is present and non-blank is rendered to
    ``OUTPUT_DIR/<ID>.png``; IDs start at 1 and count only the rows that were
    rendered. One ``ID`` / ``ground_truth`` row per image is then written to
    ``LABELS_CSV``.
    """
    df = pd.read_csv(CSV_FILE)

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    font = load_font()

    labels = []

    total = len(df)
    print(f"Processing {total} samples...")

    # Iterate over the two needed columns directly rather than building a
    # Series per row with iterrows().
    for raw_text, raw_label in zip(df["text"], df["generated"]):
        # A missing cell is read as NaN, and str(NaN) is the non-empty string
        # "nan" -- so missing values must be filtered before converting.
        if pd.isna(raw_text):
            continue

        text = str(raw_text).strip()
        if not text:
            continue

        # Convert the label only for rows that are actually kept.
        ground_truth = int(raw_label)

        image_id = len(labels) + 1  # numbering starts from 1
        image_path = os.path.join(OUTPUT_DIR, f"{image_id}.png")

        text_to_image(text, image_path, font)

        labels.append({
            "ID": image_id,
            "ground_truth": ground_truth
        })

        if image_id % 500 == 0:
            print(f"Generated {image_id} images")

    # Save ground-truth CSV; explicit columns keep the header even when no
    # row was rendered.
    labels_df = pd.DataFrame(labels, columns=["ID", "ground_truth"])
    labels_df.to_csv(LABELS_CSV, index=False)

    print("Done.")
    print(f"Total images saved: {len(labels)}")
    print(f"Labels saved to: {LABELS_CSV}")

# Entry point: run the conversion when this code is executed as the main program.
if __name__ == "__main__":
    main()

Processing 1460 samples...
Generated 500 images
Generated 1000 images
Done.
Total images saved: 1460
Labels saved to: labels.csv
