# Collatz Data Generation

This notebook contains functions to generate comprehensive Collatz sequence data for analysis.

In [23]:
import pandas as pd
from typing import Optional
import sys
import os

# Make the project's `src` directory importable. The path is built as
# <parent of the current working directory>/src, so this presumably expects the
# kernel to be started from a folder sitting directly under the project root
# (e.g. a `notebooks/` directory) — TODO confirm against the repo layout.
# NOTE: the entry is appended again each time this cell is re-run.
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), 'src'))

from collatz.sequence import collatz_analysis

In [24]:
def generate_collatz_data(start: int = 1, end: int = 100,
                          include_sequence: bool = False,
                          save_to_csv: bool = False,
                          csv_filename: Optional[str] = None) -> pd.DataFrame:
    """
    Generate comprehensive Collatz sequence data for a range of starting numbers.

    Every integer in ``[start, end]`` is passed to ``collatz_analysis`` and the
    returned metrics become one row of the result. A number whose analysis
    raises is reported on stdout and skipped, so the result can contain fewer
    rows than the requested range (possibly none). Progress messages and a
    short statistical summary are printed to stdout as a side effect.

    Args:
        start (int): Starting number for the range (inclusive); must be >= 1
        end (int): Ending number for the range (inclusive); must be >= start
        include_sequence (bool): Whether to include the full sequence in the
            returned DataFrame (as a ``sequence`` column)
        save_to_csv (bool): Whether to save results to a CSV file under
            ``../data/raw`` (relative to the current working directory). The
            ``sequence`` column is never written to the CSV.
        csv_filename (str, optional): Custom filename for CSV output. Defaults
            to ``collatz_sequences_<start>_to_<end>_analysis.csv`` with both
            bounds zero-padded to six digits.

    Returns:
        pd.DataFrame: DataFrame containing Collatz analysis for each number.
        The expected columns are always present, even when no row could be
        generated.

    Raises:
        ValueError: If ``start < 1`` or ``end < start``.
    """

    if start < 1 or end < start:
        raise ValueError("Invalid range: start must be >= 1 and end >= start")

    print(f"Generating Collatz data for range {start} to {end}...")

    # Column layout of the result, ordered for readability. Declared up front so
    # the DataFrame has these columns even when every analysis fails.
    column_order = ['starting_number', 'sequence_length', 'max_value', 'total_steps',
                    'stopping_time', 'steps_to_max', 'is_power_of_two', 'even_steps', 'odd_steps']
    if include_sequence:
        column_order = column_order + ['sequence']

    all_data = []
    total_numbers = end - start + 1
    # Report roughly every 10% of the range (every item for ranges shorter than 10).
    progress_interval = max(1, total_numbers // 10)

    for i, n in enumerate(range(start, end + 1), 1):
        if i % progress_interval == 0 or i == total_numbers:
            progress = (i / total_numbers) * 100
            print(f"Progress: {progress:.1f}% ({i}/{total_numbers})")

        try:
            analysis = collatz_analysis(n)

            # Number of transitions in the sequence: one fewer than its terms.
            step_count = analysis['sequence_length'] - 1

            # Create data row
            row_data = {
                'starting_number': analysis['starting_number'],
                'sequence_length': analysis['sequence_length'],
                'max_value': analysis['max_value'],
                'steps_to_max': analysis['steps_to_max'],
                'is_power_of_two': analysis['is_power_of_two'],
                'even_steps': analysis['even_steps'],
                'odd_steps': analysis['odd_steps'],
                'total_steps': step_count,
                'stopping_time': step_count  # same value, kept as a separate column
            }

            # Optionally include full sequence
            if include_sequence:
                row_data['sequence'] = analysis['sequence']

            all_data.append(row_data)

        except Exception as e:
            # Best effort: report the failing number and keep going.
            print(f"Error processing number {n}: {e}")
            continue

    # Build the frame with an explicit column list: this both orders the columns
    # and avoids a KeyError on column selection when no row was produced.
    df = pd.DataFrame(all_data, columns=column_order)

    print("\nGenerated DataFrame:")
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Display basic statistics (meaningless on an empty frame, so skip them then)
    if df.empty:
        print("\nNo rows were generated; skipping statistics.")
    else:
        print("\nBasic Statistics:")
        print(f"Sequence Length - Mean: {df['sequence_length'].mean():.2f}, Median: {df['sequence_length'].median():.2f}")
        print(f"Max Value - Mean: {df['max_value'].mean():.2f}, Median: {df['max_value'].median():.2f}")
        print(f"Stopping Time - Mean: {df['stopping_time'].mean():.2f}, Median: {df['stopping_time'].median():.2f}")
        print(f"Stopping Time - Min: {df['stopping_time'].min()}, Max: {df['stopping_time'].max()}")

    # Save to CSV if requested
    if save_to_csv:
        if csv_filename is None:
            csv_filename = f"collatz_sequences_{start:06d}_to_{end:06d}_analysis.csv"

        csv_path = os.path.join('..', 'data', 'raw', csv_filename)
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        # The CSV holds the tabular metrics only; the (potentially long)
        # per-number sequences are dropped before writing.
        df.drop(columns=['sequence'], errors='ignore').to_csv(csv_path, index=False)
        print(f"\nData saved to: {csv_path}")

    return df

In [25]:
# Analyse every starting number from 1 through 10000. Sequences are left out of
# the frame, and the resulting table is also written to CSV.
df = generate_collatz_data(
    start=1,
    end=10000,
    include_sequence=False,
    save_to_csv=True,
)

# Preview the top of the table
print("\nFirst 10 rows:")
df.head(10)

Generating Collatz data for range 1 to 10000...
Progress: 10.0% (1000/10000)
Progress: 20.0% (2000/10000)
Progress: 30.0% (3000/10000)


Progress: 40.0% (4000/10000)
Progress: 50.0% (5000/10000)
Progress: 60.0% (6000/10000)
Progress: 70.0% (7000/10000)
Progress: 80.0% (8000/10000)
Progress: 90.0% (9000/10000)
Progress: 100.0% (10000/10000)

Generated DataFrame:
Shape: (10000, 9)
Columns: ['starting_number', 'sequence_length', 'max_value', 'total_steps', 'stopping_time', 'steps_to_max', 'is_power_of_two', 'even_steps', 'odd_steps']

Basic Statistics:
Sequence Length - Mean: 85.97, Median: 74.00
Max Value - Mean: 58996.48, Median: 13336.00
Stopping Time - Mean: 84.97, Median: 73.00
Stopping Time - Min: 0, Max: 261

Data saved to: ..\data\raw\collatz_sequences_000001_to_010000_analysis.csv

First 10 rows:


Unnamed: 0,starting_number,sequence_length,max_value,total_steps,stopping_time,steps_to_max,is_power_of_two,even_steps,odd_steps
0,1,1,1,0,0,0,True,0,1
1,2,2,2,1,1,0,True,1,1
2,3,8,16,7,7,3,False,5,3
3,4,3,4,2,2,0,True,2,1
4,5,6,16,5,5,1,False,4,2
5,6,9,16,8,8,4,False,6,3
6,7,17,52,16,16,5,False,11,6
7,8,4,8,3,3,0,True,3,1
8,9,20,52,19,19,8,False,13,7
9,10,7,16,6,6,2,False,5,2


In [26]:
# Display detailed statistics: count, mean, std, min, quartiles and max for each
# numeric column of `df` (the boolean `is_power_of_two` column is not part of
# the default describe() output).
print("Detailed Statistics:")
df.describe()

Detailed Statistics:


Unnamed: 0,starting_number,sequence_length,max_value,total_steps,stopping_time,steps_to_max,even_steps,odd_steps
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,85.9666,58996.48,84.9666,84.9666,21.3354,56.7644,29.2022
std,2886.89568,46.590863,413680.1,46.590863,46.590863,30.418774,28.711181,17.89298
min,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,2500.75,46.0,9232.0,45.0,45.0,2.0,32.0,14.0
50%,5000.5,74.0,13336.0,73.0,73.0,7.0,50.0,25.0
75%,7500.25,126.0,32272.0,125.0,125.0,25.0,81.0,45.0
max,10000.0,262.0,27114420.0,261.0,261.0,140.0,165.0,97.0
