Generating *.txt files

In [1]:
from faker import Faker
import random
import os

def generate_random_text(size, language = "en", fake = None):
    """Build a string of space-separated random words, cut to ``size`` characters.

    Words are drawn one at a time until the joined text is at least ``size``
    characters long; the result is then sliced to ``size``, so the final word
    may be truncated.

    Args:
        size: Desired length of the returned text, in characters.
        language: Faker locale used when ``fake`` is not supplied.
        fake: Optional object exposing a ``word()`` method (for example an
            already constructed ``Faker``). Supplying one avoids building a
            new ``Faker`` on every call.

    Returns:
        The generated text, or an empty string when ``size`` is zero or negative.
    """
    if size <= 0:
        return ''
    if fake is None:
        fake = Faker(language)

    words = []
    # Length of ' '.join(words), kept as a running total so the loop is linear
    # instead of re-summing every word on each iteration. It starts at -1
    # because n words are joined by n - 1 spaces.
    joined_len = -1
    while joined_len < size:
        word = fake.word()
        words.append(word)
        joined_len += len(word) + 1

    return ' '.join(words)[:size]

def create_txt_files(num_of_files = 3000, min_size = 300, max_size = 10000, output_dir = '3000_files/txt', language = 'en'):
    """Write ``num_of_files`` random-text files into ``output_dir``.

    Each file is named ``fileNNNN.txt`` (1-based, zero-padded to four digits)
    and holds text produced by ``generate_random_text`` with a length drawn
    uniformly from ``[min_size, max_size]``.

    Args:
        num_of_files: How many files to create.
        min_size: Smallest text length, in characters.
        max_size: Largest text length, in characters.
        output_dir: Target directory; created if it does not exist.
        language: Locale forwarded to ``generate_random_text``.
    """
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(num_of_files):
        file_no = offset + 1
        target_len = random.randint(min_size, max_size)
        content = generate_random_text(target_len, language)
        destination = os.path.join(output_dir, f'file{file_no:04d}.txt')
        with open(destination, 'w', encoding='utf-8') as handle:
            handle.write(content)

    print(f"[TXT] Generated {num_of_files} *.txt files, saved in '{output_dir}'")

Generating *.html files

In [2]:
def create_html_files(num_of_files = 3000, min_size = 300, max_size = 10000, output_dir = '3000_files/html', language = 'en'):
    """Write ``num_of_files`` small HTML documents into ``output_dir``.

    Each document has a random title, a heading and enough ``<p>`` elements
    for their combined length to reach a size drawn uniformly from
    ``[min_size, max_size]``. The surrounding markup is not counted, so the
    file on disk is somewhat larger than the drawn size.

    Args:
        num_of_files: How many files to create.
        min_size: Lower bound for the combined paragraph length, in characters.
        max_size: Upper bound for the combined paragraph length, in characters.
        output_dir: Target directory; created if it does not exist.
        language: Faker locale used for all generated text.
    """
    from html import escape  # local import: only this generator needs it

    os.makedirs(output_dir, exist_ok=True)
    faker = Faker(language)

    for i in range(1, num_of_files + 1):
        size = random.randint(min_size, max_size)
        paragraphs = []
        current_size = 0

        while current_size < size:
            text = escape(faker.paragraph(nb_sentences=random.randint(2, 5)), quote=False)
            element = f"        <p>{text}</p>"
            paragraphs.append(element)
            # Running total instead of re-summing every paragraph each time.
            current_size += len(element)

        # Joined outside the f-strings below: a backslash inside an f-string
        # expression is a syntax error before Python 3.12.
        body = '\n'.join(paragraphs)
        title = escape(faker.catch_phrase(), quote=False)
        heading = escape(faker.company(), quote=False)

        html = (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "    <head>\n"
            f"        <title>{title}</title>\n"
            "    </head>\n"
            "    <body>\n"
            f"        <h1>{heading}</h1>\n"
            f"{body}\n"
            "    </body>\n"
            "</html>\n"
        )

        with open(os.path.join(output_dir, f'file{i:04d}.html'), 'w', encoding='utf-8') as file:
            file.write(html)

    print(f"[HTML] Generated {num_of_files} *.html files, saved in '{output_dir}'")

Generating *.csv files

In [3]:
import pandas as pd

def create_csv_files(num_of_files = 3000, min_size = 300, max_size = 10000, output_dir = '3000_files/csv', language = 'en'):
    """Write ``num_of_files`` CSV tables of random values into ``output_dir``.

    A size is drawn uniformly from ``[min_size, max_size]`` and only determines
    the row count (``max(5, size // 100)``); the byte size of the file is not
    matched exactly. Each table has 4 to 8 columns with random word headers,
    and every cell is independently a word, a float rounded to two decimals,
    or an integer.

    Args:
        num_of_files: How many files to create.
        min_size: Lower bound of the drawn size.
        max_size: Upper bound of the drawn size.
        output_dir: Target directory; created if it does not exist.
        language: Faker locale used for headers and string cells.
    """
    os.makedirs(output_dir, exist_ok = True)
    faker = Faker(language)

    for i in range(1, num_of_files + 1):
        size = random.randint(min_size, max_size)
        rows = max(5, size // 100)
        columns = random.randint(4,8)
        headers = [faker.word().capitalize() for _ in range(columns)]

        data = []
        for _ in range(rows):
            row = []
            for _ in range(columns):
                val_type = random.choice(['str', 'float', 'int'])
                if val_type == 'str':
                    row.append(faker.word())
                elif val_type == 'float':
                    row.append(round(random.uniform(1, 100000), 2))
                else:
                    row.append(random.randint(1,100000))
            # Must stay inside the row loop: appending after it kept only the
            # last generated row, so every table had a single data line.
            data.append(row)

        df = pd.DataFrame(data, columns=headers)
        file_name = os.path.join(output_dir, f'file{i:04d}.csv')
        df.to_csv(file_name, index = False)

    print(f"[CSV] Generated {num_of_files} *.csv files, saved in '{output_dir}'")

Generating *.py files

In [4]:
def create_python_files(num_of_files = 3000, min_size = 300, max_size = 10000, output_dir = '3000_files/pyth'):
    """Write ``num_of_files`` syntactically valid Python scripts into ``output_dir``.

    Every script starts with the same small class plus ``main()`` and is then
    padded with trivial functions until its length reaches a size drawn
    uniformly from ``[min_size, max_size]``.

    Args:
        num_of_files: How many files to create.
        min_size: Lower bound of the drawn size, in characters.
        max_size: Upper bound of the drawn size, in characters.
        output_dir: Target directory; created if it does not exist.
    """
    import keyword  # local import: only this generator needs it

    os.makedirs(output_dir, exist_ok=True)
    faker = Faker()

    # The fixed prefix is identical for every file, so build it once.
    header = ''.join([
        "import random\n\n",
        "class SampleClass:\n",
        "    def __init__(self):\n",
        "        self.data = [random.randint(0, 100) for _ in range(10)]\n\n",
        "    def sort_data(self):\n",
        "        return sorted(self.data)\n\n",
        "def main():\n",
        "    obj = SampleClass()\n",
        "    print(obj.sort_data())\n\n",
        "if __name__ == '__main__':\n",
        "    main()\n"
    ])

    for i in range(1, num_of_files + 1):
        size = random.randint(min_size, max_size)

        code = [header]
        # Running total instead of re-summing all chunks on every iteration.
        current_size = len(header)

        while current_size < size:
            method_name = faker.word()
            # A random word is not guaranteed to be usable as a function name:
            # it may be a reserved word ("for", "class", ...) or contain
            # characters that are not allowed in an identifier.
            if not method_name.isidentifier():
                method_name = f"func_{len(code)}"
            elif keyword.iskeyword(method_name):
                method_name += "_"

            # Four-space indentation, matching the rest of the generated file.
            chunk = f"\ndef {method_name}():\n    return {random.randint(1,100000)}\n"
            code.append(chunk)
            current_size += len(chunk)

        with open(os.path.join(output_dir, f'file{i:04d}.py'), 'w', encoding='utf-8') as file:
            file.write(''.join(code))

    print(f"[PY] Generated {num_of_files} *.py files, saved in '{output_dir}'")

Generating *.wav files

In [5]:
import wave
import numpy as np

def generate_waveform(duration_sec, framerate=44100):
    """Synthesize a mono signal made of a few random sine tones.

    Between 3 and 6 sine waves with integer frequencies in 220..1200 Hz are
    summed, scaled so the largest absolute sample is 1, and converted to
    16-bit integers.

    Args:
        duration_sec: Length of the signal in seconds.
        framerate: Samples per second.

    Returns:
        A one-dimensional ``np.int16`` array with
        ``int(framerate * duration_sec)`` samples. The array is empty when
        that count is zero.
    """
    t = np.linspace(0, duration_sec, int(framerate * duration_sec), False)
    waveform = np.zeros_like(t)

    # Add a few random tones
    for _ in range(random.randint(3, 6)):
        freq = random.randint(220, 1200)  # Hz; 220 Hz is A3
        waveform += np.sin(2 * np.pi * freq * t)

    # Normalization. Skipped when there is nothing to scale: np.max raises on
    # an empty array, and an all-zero signal would turn into NaN via 0 / 0.
    peak = np.max(np.abs(waveform)) if waveform.size else 0.0
    if peak > 0:
        waveform /= peak
    return (waveform * 32767).astype(np.int16)

def wav_file(filename, target_size, framerate=44100, nchannels=1, sampwidth=2):
    """Write a PCM WAV file of random tones that is about ``target_size`` bytes.

    The number of audio frames is chosen so that the 44-byte header plus the
    sample data does not exceed ``target_size`` (at least one frame is always
    written). The signal comes from ``generate_waveform``; for multi-channel
    output the same signal is copied to every channel.

    Args:
        filename: Path of the file to write.
        target_size: Desired file size in bytes.
        framerate: Samples per second.
        nchannels: Number of channels, at least 1.
        sampwidth: Bytes per sample; 1, 2 or 4.

    Raises:
        ValueError: If ``nchannels`` is below 1 or ``sampwidth`` is not 1, 2 or 4.
    """
    if nchannels < 1:
        raise ValueError(f"nchannels must be at least 1, got {nchannels}")
    if sampwidth not in (1, 2, 4):
        raise ValueError(f"sampwidth must be 1, 2 or 4 bytes, got {sampwidth}")

    header_bytes = 44  # RIFF/WAVE header size for plain PCM data
    frame_bytes = sampwidth * nchannels
    n_frames = max(1, (target_size - header_bytes) // frame_bytes)
    # generate_waveform() truncates framerate * duration to an int; the extra
    # half sample keeps floating-point rounding from dropping the last frame.
    duration = (n_frames + 0.5) / framerate

    samples = generate_waveform(duration, framerate)

    # generate_waveform() yields signed 16-bit samples; rescale them to the
    # requested width and force little-endian byte order, as WAV requires.
    if sampwidth == 1:
        # 8-bit WAV samples are unsigned: keep the high byte, shift to 0..255.
        pcm = ((samples.astype(np.int32) >> 8) + 128).astype(np.uint8)
    elif sampwidth == 2:
        pcm = samples.astype('<i2')
    else:
        pcm = (samples.astype(np.int32) << 16).astype('<i4')

    if nchannels > 1:
        # Repeating each sample nchannels times gives interleaved frames.
        pcm = np.repeat(pcm, nchannels)

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(nchannels)
        wf.setsampwidth(sampwidth)
        wf.setframerate(framerate)
        wf.writeframes(pcm.tobytes())

def create_wav_files(num_of_files=3000, min_size=10000, max_size=500000, output_dir="3000_files/wav"):
    """Write ``num_of_files`` WAV files into ``output_dir``.

    Files are named ``fileNNNN.wav`` (1-based, zero-padded to four digits).
    For each one a target size is drawn uniformly from ``[min_size, max_size]``
    and handed to ``wav_file``.

    Args:
        num_of_files: How many files to create.
        min_size: Smallest target size, in bytes.
        max_size: Largest target size, in bytes.
        output_dir: Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(num_of_files):
        file_no = offset + 1
        requested_bytes = random.randint(min_size, max_size)
        destination = os.path.join(output_dir, f"file{file_no:04d}.wav")
        wav_file(destination, target_size=requested_bytes)

    print(f"[WAV] Generated {num_of_files} *.wav files, saved in '{output_dir}'")

Generating *.bmp files

In [6]:
from PIL import Image
import numpy as np
import os

def create_bmp_file(filename, target_size):
    """Save a square, single-colour 24-bit BMP of at most about ``target_size`` bytes.

    The side length is the largest one whose file size (54-byte header plus
    pixel rows padded to a multiple of four bytes) does not exceed
    ``target_size``; it is never smaller than one pixel. The fill colour is
    random.

    Args:
        filename: Path of the file to write.
        target_size: Desired file size in bytes.
    """
    import math  # local import: only needed for the exact integer square root

    header_bytes = 54  # 14-byte file header + 40-byte info header
    pixel_count = max(1, (target_size - header_bytes) // 3)
    side = math.isqrt(int(pixel_count))

    # Each BMP row is padded to a multiple of 4 bytes, so 3 bytes per pixel
    # underestimates the size; shrink until the padded image fits.
    while side > 1 and header_bytes + side * ((side * 3 + 3) // 4 * 4) > target_size:
        side -= 1

    # Plain ints rather than NumPy scalars for the colour tuple.
    color = tuple(int(channel) for channel in np.random.randint(0, 256, 3))
    img = Image.new("RGB", (side, side), color=color)
    img.save(filename, format='BMP')

def create_bmp_files(num_of_files=3000, min_size=50000, max_size=300000, output_dir="3000_files/bmp"):
    """Write ``num_of_files`` BMP images into ``output_dir``.

    Files are named ``image_NNNN.bmp`` (1-based, zero-padded to four digits).
    For each one a target size is drawn from ``[min_size, max_size]`` with
    NumPy's global generator and handed to ``create_bmp_file``.

    NOTE(review): the other generators in this notebook name their files
    ``fileNNNN.<ext>``; confirm whether the ``image_`` prefix is intentional.

    Args:
        num_of_files: How many files to create.
        min_size: Smallest target size, in bytes.
        max_size: Largest target size, in bytes (inclusive).
        output_dir: Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(num_of_files):
        image_no = offset + 1
        # randint's upper bound is exclusive, hence the + 1.
        requested_bytes = np.random.randint(min_size, max_size + 1)
        destination = os.path.join(output_dir, f"image_{image_no:04d}.bmp")
        create_bmp_file(destination, requested_bytes)

    print(f"[BMP] Generated {num_of_files} *.bmp files, saved in '{output_dir}'")

Generating files

In [7]:
# Seed every random source the generators draw from (the stdlib `random`
# module, NumPy's global generator and Faker) so that Restart Kernel -> Run All
# reproduces the same corpus.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

create_txt_files()
create_html_files()
create_csv_files()
create_python_files()
create_wav_files()
create_bmp_files()

[TXT] Generated 3000 *.txt files, saved in '3000_files/txt'
[HTML] Generated 3000 *.html files, saved in '3000_files/html'
[CSV] Generated 3000 *.csv files, saved in '3000_files/csv'
[PY] Generated 3000 *.py files, saved in '3000_files/pyth'
[WAV] Generated 3000 *.wav files, saved in '3000_files/wav'
[BMP] Generated 3000 *.bmp files, saved in '3000_files/bmp'
