In [16]:
import math
import os
import random
import string

In [17]:
def skewed_int(min_val, max_val, median):
    """Draw a random integer from a power-law-skewed distribution.

    A uniform sample ``u`` from ``random.random()`` is raised to an exponent
    chosen so that ``0.5 ** exponent`` equals the relative position of
    ``median`` inside ``[min_val, max_val]``; the result is scaled to the
    range width, truncated with ``int()`` and shifted by ``min_val``.

    Because ``u`` lies in ``[0, 1)``, the scaled offset is normally below the
    range width, so results normally fall in ``min_val .. max_val - 1``; the
    final ``min()`` clamps the result so it never exceeds ``max_val``.

    Args:
        min_val: lower bound of the range.
        max_val: upper bound of the range.
        median: target split point; must satisfy ``min_val < median < max_val``.

    Returns:
        The sampled integer (for integer bounds).

    Raises:
        ValueError: if ``median`` is not strictly between the two bounds.
    """
    median_inside = min_val < median < max_val
    if not median_inside:
        raise ValueError("invalid input")

    span = max_val - min_val
    # Solve 0.5 ** exponent == (median - min_val) / span for the exponent.
    exponent = math.log((median - min_val) / span) / math.log(0.5)

    offset = int(random.random() ** exponent * span)
    return min(min_val + offset, max_val)

In [18]:
def generate_word():
    """Return a random pseudo-word made of ASCII letters and German umlauts.

    The number of characters is drawn with ``skewed_int(3, 22, 6)``; each
    character is picked independently (with replacement) from the alphabet.
    """
    length = skewed_int(3, 22, 6)
    alphabet = string.ascii_letters + 'äöüÄÖÜ'
    letters = random.choices(alphabet, k=length)
    return ''.join(letters)

In [19]:
def generate_number():
    """Return a random non-negative integer rendered as a decimal string.

    ``skewed_int(1, 10, 4)`` decides how many digits are drawn; the digits are
    then round-tripped through ``int()``, which drops any leading zeros, so
    the returned string can be shorter than the number of digits drawn.
    """
    digit_count = skewed_int(1, 10, 4)
    drawn_digits = random.choices(string.digits, k=digit_count)
    value = int(''.join(drawn_digits))
    return str(value)

In [20]:
def generate_punctuation():
    """Return one randomly chosen punctuation mark out of ``! , . : ; ?``."""
    marks = r"""!,.:;?"""
    return random.choice(marks)

def generate_special():
    """Return one randomly chosen special character out of ``# & % / ^ $``."""
    symbols = r"""#&%/^$"""
    return random.choice(symbols)

In [21]:
def replace_class(c):
    """Turn a token class name into a freshly generated token string.

    Args:
        c: one of ``'word'``, ``'number'``, ``'punctuation'`` or ``'special'``.

    Returns:
        The output of the matching ``generate_*`` function, or an empty
        string when ``c`` is none of the known class names.
    """
    generators = (
        ('word', generate_word),
        ('number', generate_number),
        ('punctuation', generate_punctuation),
        ('special', generate_special),
    )
    # Compare in a fixed order and call only the matching generator.
    for class_name, make_token in generators:
        if c == class_name:
            return make_token()
    return ''

In [22]:
def generate_sequence(sequence_length=60):
    """Generate one random text made of words, numbers and symbols.

    The text starts with a word (weight 0.8) or a number (weight 0.2),
    followed by ``sequence_length`` tokens drawn from word / number /
    punctuation / special with weights 0.7 / 0.19 / 0.1 / 0.01. A closing
    punctuation token is appended unless the last drawn token already is one.
    Tokens are separated by single spaces, except that no space is placed in
    front of a punctuation token.

    Args:
        sequence_length: number of tokens drawn after the opening token.
            The default of 60 keeps the length this function previously
            hard-coded. Pass ``None`` to sample the length with
            ``skewed_int(15, 60, 30)`` instead.

    Returns:
        The generated text as a single string.
    """
    if sequence_length is None:
        sequence_length = skewed_int(15, 60, 30)

    # Opening token: never punctuation or a special character.
    structure = random.choices(['word', 'number'], weights=[0.8, 0.2], k=1)

    classes = ['word', 'number', 'punctuation', 'special']
    weights = [0.7, 0.19, 0.1, 0.01]
    structure.extend(random.choices(classes, weights=weights, k=sequence_length))

    if structure[-1] != 'punctuation':
        structure.append('punctuation')

    # Collect the pieces and join once instead of growing a string in a loop.
    parts = []
    last_index = len(structure) - 1
    for i, token_class in enumerate(structure):
        parts.append(replace_class(token_class))
        # Punctuation attaches directly to the preceding token.
        if i < last_index and structure[i + 1] != 'punctuation':
            parts.append(' ')

    return ''.join(parts)

In [23]:
from PIL import Image, ImageDraw, ImageFont


def wrap_text(text, font, max_width, draw):
    """Greedily break ``text`` into lines no wider than ``max_width``.

    The text is split on whitespace and words are appended to the current
    line as long as the rendered width, measured with ``draw.textbbox``,
    stays within ``max_width``. A single word is never split, so a word that
    is wider than ``max_width`` ends up alone on an over-long line.

    Args:
        text: the text to wrap.
        font: font passed through to ``draw.textbbox``.
        max_width: maximum line width, in the units ``textbbox`` reports.
        draw: object providing ``textbbox((x, y), text, font=...)``.

    Returns:
        List of line strings; empty when ``text`` contains no words.
    """
    wrapped = []
    pending = None  # line currently being filled; None until the first word

    for token in text.split():
        if pending is None:
            pending = token
            continue

        # Measure the line as it would look with this word appended.
        trial = f"{pending} {token}"
        box = draw.textbbox((0, 0), trial, font=font)
        if box[2] - box[0] <= max_width:
            pending = trial
        else:
            wrapped.append(pending)
            pending = token

    if pending is not None:
        wrapped.append(pending)
    return wrapped


def draw_text_with_wrap(image, position, text, font, fill, max_width, line_spacing=4):
    """Word-wrap ``text`` and draw it onto ``image``, one line below the other.

    Lines are produced by ``wrap_text`` and drawn top to bottom starting at
    ``position``. Drawing stops at the first line whose bounding box would
    reach below the bottom edge of the image, so the returned lines are
    exactly the lines that are fully visible in the image.

    Args:
        image: Pillow image to draw on (modified in place).
        position: ``(x, y)`` of the first line.
        text: the text to render.
        font: font used for measuring and drawing.
        fill: text colour passed to ``draw.text``.
        max_width: maximum line width handed to ``wrap_text``.
        line_spacing: extra vertical gap added after each line.

    Returns:
        Tuple ``(image, lines)`` where ``lines`` is the list of drawn lines.
    """
    draw = ImageDraw.Draw(image)
    x, y = position

    drawn_lines = []
    for line in wrap_text(text, font, max_width, draw):
        bbox = draw.textbbox((0, 0), line, font=font)

        # bbox[3] is the bottom of the text relative to its drawing origin;
        # a line reaching past the image would be clipped, so stop here to
        # keep the returned text consistent with what the image shows.
        if y + bbox[3] > image.height:
            break

        draw.text((x, y), line, font=font, fill=fill)
        drawn_lines.append(line)

        # Advance by this line's own bounding-box height plus the spacing.
        y += (bbox[3] - bbox[1]) + line_spacing

    return image, drawn_lines

In [25]:
# Settings for the generated samples.
IMAGE_SIZE = 400      # images are IMAGE_SIZE x IMAGE_SIZE pixels
N_SAMPLES = 100       # number of image / text pairs to write
OUTPUT_DIR = 'valid'  # directory receiving the .png and .txt files
# NOTE(review): these look like Windows system font file names - confirm they
# resolve on the machine running the notebook.
FONT_FILES = ("arial.ttf", "cour.ttf", "times.ttf", "consola.ttf")
FONT_SIZES = (14, 16, 18)

# Every font file at every size, in file-major order.
fonts = [
    ImageFont.truetype(font_file, font_size)
    for font_file in FONT_FILES
    for font_size in FONT_SIZES
]

# Neither Image.save() nor open() creates a missing directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Zero-pad file names to at least two digits (00 .. 99 for 100 samples).
name_width = max(2, len(str(N_SAMPLES - 1)))

for i in range(N_SAMPLES):
    img = Image.new("RGB", (IMAGE_SIZE, IMAGE_SIZE), color="white")

    # Randomise content, font and layout for each sample.
    text = generate_sequence()
    font = random.choice(fonts)
    offset = random.randint(0, 40)
    padding = random.randint(0, 40)
    line_spacing = random.randint(4, 10)

    wrapped_img, lines = draw_text_with_wrap(
        img,
        position=(padding, offset),
        text=text,
        font=font,
        fill="black",
        # the same horizontal padding is reserved on the left and the right
        max_width=IMAGE_SIZE - padding * 2,
        line_spacing=line_spacing
    )

    name = str(i).zfill(name_width)
    wrapped_img.save(os.path.join(OUTPUT_DIR, f'{name}.png'))

    # Store the wrapped lines next to the image, one rendered line per row.
    with open(os.path.join(OUTPUT_DIR, f'{name}.txt'), 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))