# Dataset Generation for Math and Boolean GPT
## CS7CS4 Machine Learning - Final Assignment 2025-26

This notebook generates comprehensive datasets for:
1. **Part 1**: Math GPT (arithmetic expressions)
2. **Part 2**: Boolean GPT (boolean logic expressions)

### Strategy:
- Start with **single-digit arithmetic** for easier learning
- Exhaustive coverage of all basic operation combinations
- Gradual complexity increase (two operations, then parentheses)
- Separate training and testing sets (90/10 split)

### Dataset Quality Criteria:
1. **Completeness**: Cover all operation combinations
2. **Balance**: Equal representation of all operations
3. **Correctness**: All expressions mathematically accurate
4. **Appropriate Size**: Large enough to learn patterns
5. **Complexity Gradation**: Simple to complex

In [None]:
import random
import os
from collections import defaultdict

random.seed(42)  # Fixed seed so the random sampling and shuffling below are reproducible

def save_dataset(filepath, expressions):
    """Write expressions to a text file, one expression per line.

    Args:
        filepath: Destination path. Missing parent directories are created.
            A bare filename (no directory part) is written to the current
            working directory.
        expressions: Iterable of expression strings to write.
    """
    # Materialise once so that generators can be both written and counted.
    expressions = list(expressions)

    # os.path.dirname() returns '' for a bare filename and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    parent = os.path.dirname(filepath)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(filepath, 'w', encoding='utf-8') as f:
        f.writelines(f"{expr}\n" for expr in expressions)
    print(f"Saved {len(expressions)} expressions to {filepath}")

## Part 1: Math Dataset Generation

### Task 1.1: Build appropriate dataset(s) for training and testing (8 marks)

We generate a dataset focusing on:
- **Single-digit numbers (0-9)**: Easier for the model to learn
- **Five operations**: addition (+), subtraction (-), multiplication (*), integer division (//), modulo (%)
- **Parentheses**: For order of operations
- **Exhaustive coverage**: All single-operation combinations of single digits (two-operation and parenthesised expressions are randomly sampled)

This approach ensures the model can achieve high accuracy before attempting more complex expressions.

In [None]:
def generate_single_digit_math():
    """
    Build every single-operation expression over the digits 0-9.

    Operations are emitted in the order +, -, *, //, %. Within each
    operation the left operand varies slowest. Integer division and modulo
    skip a zero right operand.

    Returns:
        List of strings of the form "a<op>b=<result>" (no spaces),
        e.g. "3+4=7" or "2-5=-3".
    """
    digits = range(10)
    nonzero_digits = range(1, 10)  # right operands that are safe divisors

    # (symbol, function, allowed right operands), listed in output order.
    operations = (
        ('+', lambda x, y: x + y, digits),
        ('-', lambda x, y: x - y, digits),
        ('*', lambda x, y: x * y, digits),
        ('//', lambda x, y: x // y, nonzero_digits),
        ('%', lambda x, y: x % y, nonzero_digits),
    )

    return [
        f"{lhs}{symbol}{rhs}={compute(lhs, rhs)}"
        for symbol, compute, right_operands in operations
        for lhs in digits
        for rhs in right_operands
    ]


def generate_two_operation_math(sample_rate=0.3):
    """
    Randomly sample single-digit expressions that chain two operations.

    For every operand pair (a, b):
      * each c in 0-9 is kept with probability ``sample_rate`` and
        contributes both "a+b+c" and "a-b-c";
      * each c in 1-9 is kept with probability ``sample_rate * 0.5`` and
        contributes both "a*b+c" and "a+b*c".

    Results follow normal operator precedence. Uses the module-level
    ``random`` generator, so the output depends on its current state.

    Returns:
        List of strings such as "5+3+2=10" or "6*2+1=13".
    """
    samples = []
    mixed_rate = sample_rate * 0.5  # mixed forms are kept half as often
    digits = range(10)
    operand_pairs = ((x, y) for x in digits for y in digits)

    for x, y in operand_pairs:
        # Pure addition / subtraction chains: third operand spans 0-9.
        for z in digits:
            if random.random() < sample_rate:
                samples += [
                    f"{x}+{y}+{z}={x+y+z}",
                    f"{x}-{y}-{z}={x-y-z}",
                ]

        # Mixed * and + forms: third operand spans 1-9 only, so c=0 never
        # appears in these forms (no division is involved here).
        for z in range(1, 10):
            if random.random() < mixed_rate:
                samples += [
                    f"{x}*{y}+{z}={x*y+z}",
                    f"{x}+{y}*{z}={x+y*z}",
                ]

    return samples


def generate_parentheses_math(sample_rate=0.2):
    """
    Randomly sample single-digit expressions that contain parentheses.

    Each operand triple (a, b, c), with a and b in 0-9 and c in 1-9, is kept
    with probability ``sample_rate``. A kept triple always contributes
    "(a+b)*c", "a*(b+c)" and "(a-b)+c". A second, independent coin flip
    (probability 0.5) decides whether "(a*b)+c" is added as well.

    Uses the module-level ``random`` generator, so the output depends on its
    current state.

    Returns:
        List of strings such as "(5+3)*2=16" or "4*(3+1)=16".
    """
    samples = []
    triples = (
        (x, y, z)
        for x in range(10)
        for y in range(10)
        for z in range(1, 10)
    )

    for x, y, z in triples:
        if random.random() < sample_rate:
            samples.extend((
                f"({x}+{y})*{z}={(x+y)*z}",
                f"{x}*({y}+{z})={x*(y+z)}",
                f"({x}-{y})+{z}={(x-y)+z}",
            ))
            # The fourth form is only emitted for about half the kept triples.
            if random.random() < 0.5:
                samples.append(f"({x}*{y})+{z}={(x*y)+z}")

    return samples


# Generate complete math dataset
print("="*70)
print("GENERATING MATH DATASET")
print("="*70)

# Each single-digit fact is written this many times, in the training set only,
# so the model sees the basic facts more often than the sampled expressions.
single_digit_copies = 3

# Single-digit exhaustive (480 expressions)
single_digit = generate_single_digit_math()
print(f"Single-digit expressions: {len(single_digit)}")

# Two operations (sampled)
two_ops = generate_two_operation_math(sample_rate=0.3)
print(f"Two-operation expressions: {len(two_ops)}")

# Parentheses (sampled)
parens = generate_parentheses_math(sample_rate=0.2)
print(f"Parentheses expressions: {len(parens)}")

# Split the UNIQUE expressions into train/test (90/10) before any repetition.
# Repeating first and splitting afterwards would put copies of the same string
# on both sides of the split, so the test set would not be held out.
unique_math = list(dict.fromkeys(single_digit + two_ops + parens))
random.shuffle(unique_math)
split_idx = int(0.9 * len(unique_math))
math_test = unique_math[split_idx:]

# Oversample the single-digit facts, but only those that landed in training.
single_digit_set = set(single_digit)
math_train = []
for expr in unique_math[:split_idx]:
    copies = single_digit_copies if expr in single_digit_set else 1
    math_train.extend([expr] * copies)

# Shuffle so the repeated facts are spread throughout the training file.
random.shuffle(math_train)

# Full dataset (training rows followed by test rows), kept for the statistics
# below and for later cells that inspect `math_expressions`.
math_expressions = math_train + math_test

print(f"\nTotal math expressions: {len(math_expressions):,}")
print(f"Training set: {len(math_train):,} ({len(math_train)/len(math_expressions)*100:.1f}%)")
print(f"Testing set: {len(math_test):,} ({len(math_test)/len(math_expressions)*100:.1f}%)")

# Save datasets
save_dataset('dataset/math/training/math_train.txt', math_train)
save_dataset('dataset/math/testing/math_test.txt', math_test)

# Show statistics
print("\nOperation distribution in full dataset:")
op_counts = defaultdict(int)
# Checked in this order; '//' is a distinct marker from '*' and '%'.
operator_labels = (
    ('+', 'addition'),
    ('-', 'subtraction'),
    ('*', 'multiplication'),
    ('//', 'division'),
    ('%', 'modulo'),
    ('(', 'parentheses'),
)
for expr in math_expressions:
    # Only the left-hand side holds operators; the text after '=' is the
    # result and may carry a minus sign of its own.
    lhs = expr.partition('=')[0]
    for symbol, label in operator_labels:
        if symbol in lhs:
            op_counts[label] += 1

for op, count in sorted(op_counts.items()):
    print(f"  {op}: {count}")

# Show samples (slicing tolerates a dataset shorter than 15 entries)
print("\nSample expressions:")
for expr in math_expressions[:15]:
    print(f"  {expr}")

## Part 2: Boolean Dataset Generation

### Task 2.1: Build appropriate dataset(s) for training and testing (8 marks)

We generate a dataset covering:
- **Four operations**: AND, OR, XOR, NOT
- **Two values**: True, False
- **Exhaustive coverage**: Every True/False operand combination for each expression form generated below
- **Parentheses**: For order of operations
- **High repetition**: Boolean space is smaller, so we repeat more

In [None]:
def generate_basic_boolean():
    """
    Enumerate the full truth tables of NOT, AND, OR and XOR.

    Returns:
        List of 14 strings: "NOT a=result" for the unary operator, then
        "a OP b=result" for AND, OR and XOR, with operands and results
        written as True/False.
    """
    truth_values = (True, False)

    # The unary operator comes first.
    table = [f"NOT {p}={not p}" for p in truth_values]

    # Binary operators in output order; XOR is inequality of the operands.
    binary_ops = (
        ('AND', lambda p, q: p and q),
        ('OR', lambda p, q: p or q),
        ('XOR', lambda p, q: p != q),
    )
    for name, fn in binary_ops:
        for p in truth_values:
            for q in truth_values:
                table.append(f"{p} {name} {q}={fn(p, q)}")

    return table


def generate_boolean_with_parentheses():
    """
    Enumerate every expression of the form "(a OP1 b) OP2 c".

    a, b and c range over True/False and OP1/OP2 over AND, OR, XOR. The
    bracketed part is evaluated first and its value is combined with c.

    Returns:
        List of 72 strings such as "(True AND False) OR True=True".
    """
    truth_values = (True, False)

    # Operator name -> evaluation; dict order fixes the output order.
    evaluate = {
        'AND': lambda p, q: p and q,
        'OR': lambda p, q: p or q,
        'XOR': lambda p, q: p != q,
    }

    rows = []
    for p in truth_values:
        for q in truth_values:
            for r in truth_values:
                for inner_name, inner_fn in evaluate.items():
                    bracket_value = inner_fn(p, q)
                    for outer_name, outer_fn in evaluate.items():
                        outcome = outer_fn(bracket_value, r)
                        rows.append(
                            f"({p} {inner_name} {q}) {outer_name} {r}={outcome}"
                        )

    return rows


def generate_boolean_with_not():
    """
    Enumerate binary expressions in which one operand is negated.

    For every operand pair (a, b) and every operator in AND, OR, XOR two
    expressions are produced, in this order: "NOT a OP b" and "a OP NOT b".
    NOT applies only to the operand that immediately follows it.

    Returns:
        List of 24 strings such as "NOT True AND False=False".
    """
    truth_values = (True, False)

    # Operator name -> evaluation; dict order fixes the output order.
    evaluate = {
        'AND': lambda p, q: p and q,
        'OR': lambda p, q: p or q,
        'XOR': lambda p, q: p != q,
    }

    rows = []
    for p in truth_values:
        for q in truth_values:
            for name, fn in evaluate.items():
                # Negated left operand, then negated right operand.
                rows.append(f"NOT {p} {name} {q}={fn(not p, q)}")
                rows.append(f"{p} {name} NOT {q}={fn(p, not q)}")

    return rows


# Generate complete boolean dataset
print("\n" + "="*70)
print("GENERATING BOOLEAN DATASET")
print("="*70)

# Basic boolean (14 unique expressions)
basic = generate_basic_boolean()
print(f"Basic boolean expressions: {len(basic)}")

# With parentheses
parens = generate_boolean_with_parentheses()
print(f"Parentheses expressions: {len(parens)}")

# With NOT
with_not = generate_boolean_with_not()
print(f"NOT combination expressions: {len(with_not)}")

# How many copies of each expression are written to the TRAINING set:
# basic x20, parentheses x3, NOT combinations x5.
repeat_plan = [(basic, 20), (parens, 3), (with_not, 5)]
copies_for = {}
for group, copies in repeat_plan:
    for expr in group:
        copies_for.setdefault(expr, copies)

# Split the UNIQUE expressions into train/test (90/10) before repeating.
# Repeating first and splitting afterwards would leave copies of the same
# string on both sides of the split, so the test set would not be held out.
unique_boolean = list(copies_for)
random.shuffle(unique_boolean)
split_idx = int(0.9 * len(unique_boolean))
boolean_test = unique_boolean[split_idx:]

# Apply the repetition to the training portion only, then shuffle so the
# copies are spread throughout the training file.
boolean_train = [
    expr
    for expr in unique_boolean[:split_idx]
    for _ in range(copies_for[expr])
]
random.shuffle(boolean_train)

# Full dataset (training rows followed by test rows), kept for the statistics
# below and for later cells that inspect `boolean_expressions`.
boolean_expressions = boolean_train + boolean_test

print(f"\nTotal boolean expressions: {len(boolean_expressions):,}")
print(f"Training set: {len(boolean_train):,} ({len(boolean_train)/len(boolean_expressions)*100:.1f}%)")
print(f"Testing set: {len(boolean_test):,} ({len(boolean_test)/len(boolean_expressions)*100:.1f}%)")

# Save datasets
save_dataset('dataset/boolean/training/boolean_train.txt', boolean_train)
save_dataset('dataset/boolean/testing/boolean_test.txt', boolean_test)

# Show statistics
print("\nOperation distribution:")
bool_op_counts = defaultdict(int)
for expr in boolean_expressions:
    # Compare whole words from the left-hand side: a substring test for 'OR'
    # would also match every 'XOR' and inflate the OR count.
    lhs = expr.partition('=')[0]
    words = set(lhs.replace('(', ' ').replace(')', ' ').split())
    for op in ('AND', 'OR', 'XOR', 'NOT'):
        if op in words:
            bool_op_counts[op] += 1

for op, count in sorted(bool_op_counts.items()):
    print(f"  {op}: {count}")

# Show samples (slicing tolerates a dataset shorter than 15 entries)
print("\nSample expressions:")
for expr in boolean_expressions[:15]:
    print(f"  {expr}")

## Dataset Verification

Verify that all generated expressions are mathematically/logically correct.

In [None]:
def _to_python_boolean(expression):
    """Translate a dataset boolean expression into equivalent Python source."""
    # XOR must be rewritten before OR: replacing 'OR' first would turn 'XOR'
    # into 'Xor' and the XOR would never be translated.
    translated = expression.replace('XOR', '!=')
    translated = translated.replace('AND', 'and').replace('OR', 'or')
    # In the dataset NOT applies to the single operand after it. Bracketing
    # keeps that meaning and makes 'a != not b' valid Python.
    for value in ('True', 'False'):
        translated = translated.replace(f'NOT {value}', f'(not {value})')
    return translated


def _count_incorrect(expressions, translate=None):
    """Return how many distinct "lhs=result" strings are wrong or unparsable.

    Each distinct expression is evaluated once; `translate`, when given,
    converts the left-hand side to Python source before evaluation.
    """
    errors = 0
    for expr in dict.fromkeys(expressions):  # distinct entries, original order
        lhs, separator, expected = expr.partition('=')
        if not separator:
            print(f"  Malformed (no '='): {expr}")
            errors += 1
            continue
        source = translate(lhs) if translate else lhs
        try:
            # NOTE: eval() is used only on strings generated by this notebook;
            # it must not be pointed at externally supplied data.
            calculated = str(eval(source, {"__builtins__": {}}))
        except Exception as e:
            print(f"  Cannot evaluate: {expr} ({e})")
            errors += 1
            continue
        if calculated != expected:
            print(f"  Error: {expr} (calculated: {calculated})")
            errors += 1
    return errors


print("\n" + "="*70)
print("DATASET VERIFICATION")
print("="*70)

# Verify math dataset (every distinct expression, not just a sample)
print("\nVerifying Math Dataset...")
math_errors = _count_incorrect(math_expressions)

if math_errors == 0:
    print("  ✓ Math dataset is correct!")
else:
    print(f"  ✗ Found {math_errors} errors in math dataset")

# Verify boolean dataset (every distinct expression, XOR included)
print("\nVerifying Boolean Dataset...")
bool_errors = _count_incorrect(boolean_expressions, translate=_to_python_boolean)

if bool_errors == 0:
    print("  ✓ Boolean dataset is correct!")
else:
    print(f"  ✗ Found {bool_errors} errors in boolean dataset")

print("\n" + "="*70)
print("DATASET GENERATION COMPLETE")
print("="*70)
print(f"\nMath dataset: {len(math_train):,} training, {len(math_test):,} testing")
print(f"Boolean dataset: {len(boolean_train):,} training, {len(boolean_test):,} testing")
print("\nNext steps:")
print("  1. Run 2_math_gpt.ipynb to train Part 1 model")
print("  2. Run 3_boolean_gpt.ipynb to train Part 2 model")
print("  3. Evaluate both models and compare results")