### Generate datasets of random strings: small, medium, and large

In [4]:
import random
import string

def generate_random_string(length):
    """Return a random alphanumeric string that is exactly `length` characters long.

    Characters are drawn independently (with replacement) from the ASCII
    letters, upper and lower case, plus the decimal digits.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

def generate_dataset(num_lines, string_length):
    """Build a list of `num_lines` random strings, each `string_length` characters long."""
    dataset = []
    for _ in range(num_lines):
        dataset.append(generate_random_string(string_length))
    return dataset

# Parameters
num_lines = 50  # every dataset gets this many lines

# Build the three datasets; only the per-line string length differs
random_words_small = generate_dataset(num_lines, 500)       # 500 characters per line
random_words_medium = generate_dataset(num_lines, 5000)     # 5000 characters per line
random_words_large = generate_dataset(num_lines, 10000)     # 10000 characters per line

# Write each dataset to its own file, one string per line (no trailing newline)
for out_name, out_lines in (
    ("random_words_small.txt", random_words_small),
    ("random_words_medium.txt", random_words_medium),
    ("random_words_large.txt", random_words_large),
):
    with open(out_name, "w") as f:
        f.write("\n".join(out_lines))



### Generate datasets with two columns of words that share a common pattern

In [6]:
import random
import string
import csv

# Build a random lowercase word of the requested length
def generate_word(length):
    """Return `length` randomly chosen lowercase ASCII letters joined into one string."""
    letters = random.choices(string.ascii_lowercase, k=length)
    return ''.join(letters)

# Function to generate two words with a common pattern at a random position
def generate_word_pair_with_common_pattern(pattern_length, total_length1, total_length2):
    """
    Generates two random lowercase words that share one common substring.

    The shared pattern is placed at an independently chosen random offset in
    each word; the rest of each word is random filler.

    :param pattern_length: Length of the shared pattern.
    :param total_length1: Total length of the first word.
    :param total_length2: Total length of the second word.
    :return: Tuple (word1, word2); both words contain the same pattern.
    :raises ValueError: If pattern_length is negative or longer than either word.
    """
    # Validate up front: otherwise an oversized pattern surfaces as an opaque
    # "empty range" error from randint, and a negative one silently yields
    # words whose length differs from the requested total.
    if not 0 <= pattern_length <= min(total_length1, total_length2):
        raise ValueError(
            f"pattern_length ({pattern_length}) must be between 0 and the "
            f"shorter word length ({min(total_length1, total_length2)})"
        )

    # Generate the common pattern
    common_pattern = generate_word(pattern_length)

    # Filler characters available around the pattern in each word
    filler1 = total_length1 - pattern_length
    filler2 = total_length2 - pattern_length

    # Randomly choose how much of the filler goes to the left of the pattern
    split1 = random.randint(0, filler1)
    split2 = random.randint(0, filler2)

    # Assemble each word as left filler + pattern + right filler
    word1 = generate_word(split1) + common_pattern + generate_word(filler1 - split1)
    word2 = generate_word(split2) + common_pattern + generate_word(filler2 - split2)

    return word1, word2

# Write a CSV file in which both words on every row share a common pattern
def create_dataset_with_common_pattern(filename, num_lines, pattern_length_range, total_length_range):
    """
    Writes `num_lines` word pairs to `filename` as CSV, below a
    "string1,string2" header row.

    :param filename: Path of the CSV file to create (overwritten if present).
    :param num_lines: Number of data rows to write.
    :param pattern_length_range: (min, max) bounds, inclusive, for the shared pattern length.
    :param total_length_range: (min, max) bounds, inclusive, for each word's total length.
    """
    def pair_rows():
        # Rows are produced lazily, one at a time, as the writer consumes them
        for _ in range(num_lines):
            shared_len = random.randint(*pattern_length_range)
            first_len = random.randint(*total_length_range)
            second_len = random.randint(*total_length_range)
            yield generate_word_pair_with_common_pattern(shared_len, first_len, second_len)

    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(["string1", "string2"])
        csv_out.writerows(pair_rows())

# Inclusive (min, max) length ranges per dataset size:
# the shared pattern length first, then each word's total length.
small_pattern_length_range, small_total_length_range = (5, 50), (50, 100)
medium_pattern_length_range, medium_total_length_range = (5, 100), (100, 500)
large_pattern_length_range, large_total_length_range = (5, 500), (500, 1000)

# Create the datasets (50 rows each)
create_dataset_with_common_pattern(
    filename="word_pairs_with_common_pattern_small.csv",
    num_lines=50,
    pattern_length_range=small_pattern_length_range,
    total_length_range=small_total_length_range,
)
create_dataset_with_common_pattern(
    filename="word_pairs_with_common_pattern_medium.csv",
    num_lines=50,
    pattern_length_range=medium_pattern_length_range,
    total_length_range=medium_total_length_range,
)
create_dataset_with_common_pattern(
    filename="word_pairs_with_common_pattern_large.csv",
    num_lines=50,
    pattern_length_range=large_pattern_length_range,
    total_length_range=large_total_length_range,
)




### Generate two-column datasets: one column holds the word, the other a pattern that occurs in that word

In [9]:
import random
import string
import csv

# Function to generate a random word
def generate_word(length):
    """Return a random string of `length` lowercase ASCII letters."""
    return ''.join(random.choices(string.ascii_lowercase, k=length))

# Function to generate a word with an embedded pattern
def generate_word_with_pattern(pattern_length, word_length, max_repeats):
    """
    Generates a word with a pattern inserted one or more times at random positions.

    The number of insertions is chosen at random between 1 and `max_repeats`,
    capped so that all copies of the pattern fit within `word_length`. The
    rest of the word is random lowercase filler.

    :param pattern_length: Length of the pattern to insert (at least 1).
    :param word_length: Total length of the generated word.
    :param max_repeats: Maximum number of times to insert the pattern (at least 1).
    :return: Tuple (word, pattern).
    :raises ValueError: If pattern_length < 1, max_repeats < 1, or the
        pattern is longer than the word.
    """
    if pattern_length < 1:
        raise ValueError(f"pattern_length must be at least 1, got {pattern_length}")
    if max_repeats < 1:
        raise ValueError(f"max_repeats must be at least 1, got {max_repeats}")
    if pattern_length > word_length:
        raise ValueError(
            f"pattern_length ({pattern_length}) cannot exceed word_length ({word_length})"
        )

    # Generate the pattern
    pattern = generate_word(pattern_length)

    # Cap the number of repetitions so every copy fits within the word length
    max_possible_repeats = word_length // pattern_length
    num_repeats = random.randint(1, min(max_repeats, max_possible_repeats))

    # Filler left over once all copies of the pattern are accounted for
    remaining_length = word_length - (pattern_length * num_repeats)
    base_word = generate_word(remaining_length)

    # Pick insertion offsets WITH replacement. Sampling distinct offsets fails
    # ("Sample larger than population") when the filler has fewer than
    # num_repeats gaps, e.g. pattern_length=50, word_length=100, two repeats.
    # Equal offsets simply place copies of the pattern back to back.
    positions = sorted(random.randint(0, remaining_length) for _ in range(num_repeats))

    # Interleave filler segments with the pattern; join once at the end
    pieces = []
    last_pos = 0
    for pos in positions:
        pieces.append(base_word[last_pos:pos])
        pieces.append(pattern)
        last_pos = pos
    pieces.append(base_word[last_pos:])

    return ''.join(pieces), pattern


# Write a CSV file pairing each generated word with the pattern embedded in it
def create_dataset_with_patterns(filename, num_lines, pattern_length_range, word_length_range):
    """
    Writes `num_lines` (word, pattern) rows to `filename` as CSV, below a
    "word,pattern" header row. Each word embeds its pattern at most 3 times.

    :param filename: Path of the CSV file to create (overwritten if present).
    :param num_lines: Number of data rows to write.
    :param pattern_length_range: (min, max) bounds, inclusive, for the pattern length.
    :param word_length_range: (min, max) bounds, inclusive, for the word length.
    """
    def pattern_rows():
        # Rows are produced lazily, one at a time, as the writer consumes them
        for _ in range(num_lines):
            chosen_pattern_len = random.randint(*pattern_length_range)
            chosen_word_len = random.randint(*word_length_range)
            yield generate_word_with_pattern(chosen_pattern_len, chosen_word_len, max_repeats=3)

    with open(filename, 'w', newline='') as out_file:
        csv_out = csv.writer(out_file)
        csv_out.writerow(["word", "pattern"])
        csv_out.writerows(pattern_rows())

# Inclusive (min, max) length ranges per scenario:
# the embedded pattern length first, then the containing word's length.
short_pattern_length_range, short_word_length_range = (5, 50), (50, 100)
medium_pattern_length_range, medium_word_length_range = (5, 100), (100, 500)
long_pattern_length_range, long_word_length_range = (5, 500), (500, 1000)

# Create datasets for different scenarios (50 rows each)
create_dataset_with_patterns(
    filename="short_patterns.csv",
    num_lines=50,
    pattern_length_range=short_pattern_length_range,
    word_length_range=short_word_length_range,
)
create_dataset_with_patterns(
    filename="medium_patterns.csv",
    num_lines=50,
    pattern_length_range=medium_pattern_length_range,
    word_length_range=medium_word_length_range,
)
create_dataset_with_patterns(
    filename="long_patterns.csv",
    num_lines=50,
    pattern_length_range=long_pattern_length_range,
    word_length_range=long_word_length_range,
)
