In [7]:
import random
import pandas as pd
import numpy as np

In [8]:
def generate_k_digit_addition_dataset(max_k=8, samples_per_k=200, seed=None):
    """
    Generate a dataset of k-digit addition problems using Pandas DataFrame.

    For every digit count k in 1..max_k, samples_per_k pairs of integers are
    drawn with randint from [10**(k-1), 10**k - 1], so both operands have
    exactly k digits with no leading zero (0 is never drawn, even for k=1).
    The sum of each pair is stored alongside the operands.

    Args:
    - max_k (int): Maximum number of digits to generate (default 8)
    - samples_per_k (int): Number of samples to generate for each k
      (default 200)
    - seed (int | None): Optional seed for reproducible output. When given,
      a private random.Random(seed) is used and the global `random` state is
      left untouched. When None (default), numbers are drawn from the global
      `random` module, so a prior random.seed(...) call still applies.

    Returns:
    - pandas.DataFrame with columns: digits, num1, num2, result.
      Rows are ordered by increasing `digits`; a non-positive max_k or
      samples_per_k yields an empty frame.

    Note:
    - For k >= 19 the operands can exceed the int64 range (10**19 > 2**63 - 1).
      NOTE(review): pandas will then not store these columns as int64 --
      confirm the resulting dtype / JSON output before raising max_k that far.
    """
    # Draw from an isolated generator only when a seed is requested; otherwise
    # keep the original behavior of consuming the global `random` stream.
    rng = random if seed is None else random.Random(seed)

    # Column-oriented accumulators; the DataFrame is built once at the end.
    columns = {'digits': [], 'num1': [], 'num2': [], 'result': []}

    for k in range(1, max_k + 1):
        # Inclusive bounds of the k-digit integers.
        min_val = 10 ** (k - 1)
        max_val = (10 ** k) - 1

        for _ in range(samples_per_k):
            # Order of the two draws matters for stream reproducibility.
            num1 = rng.randint(min_val, max_val)
            num2 = rng.randint(min_val, max_val)

            columns['digits'].append(k)
            columns['num1'].append(num1)
            columns['num2'].append(num2)
            columns['result'].append(num1 + num2)

    df = pd.DataFrame(columns)

    print("Dataset generated")
    print(f"Total samples: {len(df)}")
    print(f"DataFrame shape: {df.shape}")

    return df

In [9]:
# Seed the global `random` module before sampling so the generated operands
# are identical on every Restart & Run All; without this the dataset (and the
# saved train/test files) changes on each run even though the split itself is
# pinned with random_state=42.
random.seed(42)
k_digit_addition_df = generate_k_digit_addition_dataset()

Dataset generated
Total samples: 1600
DataFrame shape: (1600, 4)


In [10]:
dataset_name = "math"

# 90/10 train/test split: draw the training rows with a fixed random_state,
# then keep every row whose index label was not drawn as the held-out test set.
train_df = k_digit_addition_df.sample(random_state=42, frac=0.9)
held_out_mask = ~k_digit_addition_df.index.isin(train_df.index)
test_df = k_digit_addition_df[held_out_mask]

# Persist both splits as JSON Lines (one record per line).
train_df.to_json(f'{dataset_name}_train.jsonl', lines=True, orient='records')
test_df.to_json(f'{dataset_name}_test.jsonl', lines=True, orient='records')