# Collatz Sequence Analysis

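This notebook demonstrates basic analysis of Collatz sequences using the project's `collatz.sequence` functions (`collatz_sequence` and `collatz_length`). All statistics are computed in memory; no database connection is opened by the cells below.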

## Setup

In [3]:
import os
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's src/ directory importable. This assumes the notebook is
# started from a directory that sits next to src/ (e.g. notebooks/).
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')
if src_dir not in sys.path:  # keep the cell idempotent: no duplicate entries on re-run
    sys.path.append(src_dir)

# Import our Collatz functions
from collatz.sequence import collatz_sequence, collatz_length

# Set up matplotlib for better plots. Matplotlib 3.6 renamed its bundled
# seaborn style sheets to 'seaborn-v0_8*'; older releases only know 'seaborn',
# so pick whichever name this installation provides.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette('husl')

print('Setup complete! Ready to analyze Collatz sequences.')

ModuleNotFoundError: No module named 'sqlalchemy'

## Generate Sequence Data

In [None]:
# Generate per-number sequence statistics for starting numbers 1..MAX_START.
# The limit lives in one constant so the range, the banner and the progress
# lines can never disagree with each other.
MAX_START = 1000       # largest starting number to analyze
PROGRESS_EVERY = 100   # print a progress line every this many numbers

print(f'Generating Collatz sequence data for numbers 1-{MAX_START}...')

numbers = range(1, MAX_START + 1)
sequence_data = []

for num in numbers:
    if num % PROGRESS_EVERY == 0:
        print(f'Progress: {num}/{MAX_START}')

    length = collatz_length(num)
    sequence = collatz_sequence(num)

    # A non-empty sequence always contains its own maximum, so a single
    # emptiness check replaces the separate linear `max_value in sequence` scan.
    if sequence:
        max_value = max(sequence)
        steps_to_reach_max = sequence.index(max_value)
    else:
        # Fall back to the starting number when no sequence is returned.
        max_value = num
        steps_to_reach_max = 0

    sequence_data.append({
        'starting_number': num,
        'sequence_length': length,
        'max_value': max_value,
        'steps_to_reach_max': steps_to_reach_max
    })

# One row per starting number; built once from the list of records.
df_stats = pd.DataFrame(sequence_data)

# Row labels of the longest and shortest sequences (first occurrence on ties).
longest_idx = df_stats['sequence_length'].idxmax()
shortest_idx = df_stats['sequence_length'].idxmin()

print('\nSequence Statistics Summary:')
print(f'Total numbers analyzed: {len(df_stats)}')
print(f'Average sequence length: {df_stats["sequence_length"].mean():.2f}')
print(f'Longest sequence: {df_stats["sequence_length"].max()} (starting number: {df_stats.loc[longest_idx, "starting_number"]})')
print(f'Shortest sequence: {df_stats["sequence_length"].min()} (starting number: {df_stats.loc[shortest_idx, "starting_number"]})')

df_stats.head(10)

## Visualize Sequence Length Distribution

In [None]:
# Bin the sequence lengths into 15 equal-width ranges and count the starting
# numbers in each bin. observed=False is passed explicitly so empty bins are
# kept (the bar chart always shows all 15 ranges) and pandas does not emit a
# FutureWarning about the changing default for categorical groupers.
length_bins = pd.cut(df_stats['sequence_length'], bins=15)
df_dist = df_stats.groupby(length_bins, observed=False).size().reset_index()
df_dist.columns = ['length_range', 'count']
df_dist['length_range_str'] = df_dist['length_range'].astype(str)

# Draw one fixed-seed sample and share it between both scatter plots, so the
# figure is identical on every Restart & Run All.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
sample_data = df_stats.sample(n=min(SAMPLE_SIZE, len(df_stats)), random_state=SAMPLE_SEED)

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Distribution of sequence lengths (one bar per bin from pd.cut)
axes[0, 0].bar(range(len(df_dist)), df_dist['count'], color='skyblue', alpha=0.7)
axes[0, 0].set_title('Distribution of Collatz Sequence Lengths', fontsize=14, fontweight='bold')
axes[0, 0].set_xlabel('Length Range')
axes[0, 0].set_ylabel('Count')
axes[0, 0].set_xticks(range(len(df_dist)))
# Bin edges are floats; truncate them to ints to keep the tick labels short.
axes[0, 0].set_xticklabels([f'{int(interval.left)}-{int(interval.right)}' for interval in df_dist['length_range']], rotation=45)
axes[0, 0].grid(True, alpha=0.3)

# 2. Sequence length vs starting number (scatter of the sample)
axes[0, 1].scatter(sample_data['starting_number'], sample_data['sequence_length'], alpha=0.6, c='coral', s=20)
axes[0, 1].set_title('Sequence Length vs Starting Number (Sample)', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Starting Number')
axes[0, 1].set_ylabel('Sequence Length')
axes[0, 1].grid(True, alpha=0.3)

# 3. Histogram of sequence lengths (all rows, not the sample)
axes[1, 0].hist(df_stats['sequence_length'], bins=30, alpha=0.7, color='lightgreen', edgecolor='black')
axes[1, 0].set_title('Histogram of Sequence Lengths', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Sequence Length')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].grid(True, alpha=0.3)

# 4. Max values reached in sequences (same sample as panel 2)
axes[1, 1].scatter(sample_data['starting_number'], sample_data['max_value'], alpha=0.6, c='purple', s=20)
axes[1, 1].set_title('Maximum Value Reached vs Starting Number', fontsize=14, fontweight='bold')
axes[1, 1].set_xlabel('Starting Number')
axes[1, 1].set_ylabel('Maximum Value in Sequence')
axes[1, 1].set_yscale('log')  # Log scale due to large values
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Top Longest Sequences

In [None]:
# Select the 20 rows with the largest sequence_length, keeping three columns
top_columns = ['starting_number', 'sequence_length', 'max_value']
df_longest = df_stats.nlargest(20, 'sequence_length')[top_columns]

print("Top 20 Longest Sequences:")
print(df_longest.to_string(index=False))

# Two side-by-side panels: ranked horizontal bars and a labelled scatter
fig, (ax_ranked, ax_points) = plt.subplots(1, 2, figsize=(14, 8))

# Left panel: one bar per row, longest at the top
bar_positions = range(len(df_longest))
bar_labels = [
    f'{num} ({length})'
    for num, length in zip(df_longest['starting_number'], df_longest['sequence_length'])
]
ax_ranked.barh(bar_positions, df_longest['sequence_length'], color='gold', alpha=0.8)
ax_ranked.set_title('Top 20 Longest Collatz Sequences', fontsize=14, fontweight='bold')
ax_ranked.set_xlabel('Sequence Length')
ax_ranked.set_ylabel('Rank')
ax_ranked.set_yticks(bar_positions)
ax_ranked.set_yticklabels(bar_labels)
ax_ranked.invert_yaxis()  # rank 1 at the top
ax_ranked.grid(True, alpha=0.3)

# Right panel: starting number against sequence length
ax_points.scatter(
    df_longest['starting_number'],
    df_longest['sequence_length'],
    s=100, c='red', alpha=0.7, edgecolors='black', linewidth=1,
)
# Label top 10 only to avoid clutter
for row in df_longest.head(10).itertuples(index=False):
    ax_points.annotate(
        f'{row.starting_number}',
        (row.starting_number, row.sequence_length),
        xytext=(5, 5), textcoords='offset points', fontsize=8,
    )
ax_points.set_title('Starting Number vs Sequence Length (Top 20)', fontsize=14, fontweight='bold')
ax_points.set_xlabel('Starting Number')
ax_points.set_ylabel('Sequence Length')
ax_points.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

df_longest

## Performance Analysis

In [None]:
# Performance analysis of our data.
# Time the core computation for real instead of quoting a hard-coded estimate:
# re-run collatz_length over every analyzed starting number and measure it.
timing_start = time.perf_counter()
for start_num in df_stats['starting_number']:
    collatz_length(start_num)
total_seconds = time.perf_counter() - timing_start
# Guard the division so an empty df_stats reports 0 rather than raising.
avg_seconds = total_seconds / len(df_stats) if len(df_stats) else 0.0

print("Performance Summary:")
print(f"Total sequences analyzed: {len(df_stats):,}")
print(f"Average computation time per sequence: {avg_seconds:.6f} seconds (measured, collatz_length only)")
print(f"Total computation time: {total_seconds:.4f} seconds (measured, collatz_length only)")

# Aggregate sequence lengths over fixed, inclusive starting-number ranges
ranges = [(1, 50), (51, 100), (101, 200), (201, 500), (501, 1000)]
range_stats = []
for start, end in ranges:
    subset = df_stats[(df_stats['starting_number'] >= start) & (df_stats['starting_number'] <= end)]
    range_stats.append({
        'range': f'{start}-{end}',
        'avg_length': subset['sequence_length'].mean(),
        'max_length': subset['sequence_length'].max(),
        'count': len(subset)
    })

df_range_stats = pd.DataFrame(range_stats)

# Even vs Odd analysis. The parity label is attached to a copy via assign()
# so the shared df_stats frame is not mutated by this cell, and it is computed
# with a vectorized np.where instead of a row-wise apply.
parity_labels = np.where(df_stats['starting_number'] % 2 == 0, 'Even', 'Odd')
parity_stats = (
    df_stats.assign(parity=parity_labels)
    .groupby('parity')['sequence_length']
    .agg(['mean', 'std', 'count'])
    .reset_index()
)

# Create performance visualization
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
# Average sequence length per starting-number range
plt.bar(df_range_stats['range'], df_range_stats['avg_length'], color='lightblue', alpha=0.7)
plt.title('Average Sequence Length by Number Range')
plt.xlabel('Starting Number Range')
plt.ylabel('Average Sequence Length')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

plt.subplot(2, 2, 2)
# Maximum sequence length per starting-number range
plt.bar(df_range_stats['range'], df_range_stats['max_length'], color='orange', alpha=0.7)
plt.title('Maximum Sequence Length by Number Range')
plt.xlabel('Starting Number Range')
plt.ylabel('Maximum Sequence Length')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

plt.subplot(2, 2, 3)
# Mean sequence length per parity, with the standard deviation as error bars
plt.bar(parity_stats['parity'], parity_stats['mean'], yerr=parity_stats['std'], 
        color=['lightcoral', 'lightsteelblue'], alpha=0.7, capsize=5)
plt.title('Average Sequence Length: Even vs Odd Numbers')
plt.xlabel('Number Parity')
plt.ylabel('Average Sequence Length')
plt.grid(True, alpha=0.3)

plt.subplot(2, 2, 4)
# Distribution of max values, histogrammed on a log10 scale
plt.hist(np.log10(df_stats['max_value']), bins=25, color='mediumpurple', alpha=0.7, edgecolor='black')
plt.title('Distribution of Maximum Values (Log Scale)')
plt.xlabel('Log10(Maximum Value)')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nRange Statistics:")
print(df_range_stats.to_string(index=False))

print("\nParity Statistics:")
print(parity_stats.to_string(index=False))

## Custom Analysis

Add your own analysis here using the `df_stats` DataFrame built above and the project's `collatz_sequence` / `collatz_length` functions.

In [None]:
# Example: Calculate sequence length for specific numbers
test_numbers = [27, 77, 97, 871]

# Compute each sequence once and reuse it for both the text report and the
# plots (a list of pairs, so repeated numbers in test_numbers are kept).
test_sequences = [(num, collatz_sequence(num)) for num in test_numbers]

print('Individual Sequence Analysis:')
print('=' * 50)

for num, sequence in test_sequences:
    length = collatz_length(num)
    max_val = max(sequence)
    
    print(f'\nStarting number: {num}')
    print(f'Sequence length: {length}')
    print(f'Maximum value reached: {max_val:,}')
    print(f'First 10 steps: {sequence[:10]}')
    if len(sequence) > 10:
        print(f'Last 10 steps: {sequence[-10:]}')

# Visualize these specific sequences.
# The grid is sized from the number of test values (two columns, as many rows
# as needed) so adding a fifth number does not overflow a fixed 2x2 layout.
# With the default four numbers this is the original 2x2 grid at 15x10 inches.
n_cols = 2
n_rows = max(1, (len(test_sequences) + n_cols - 1) // n_cols)  # ceiling division
plt.figure(figsize=(15, 5 * n_rows))

for i, (num, sequence) in enumerate(test_sequences):
    plt.subplot(n_rows, n_cols, i + 1)
    plt.plot(range(len(sequence)), sequence, 'o-', linewidth=2, markersize=4)
    plt.title(f'Collatz Sequence for {num}\n(Length: {len(sequence)})', fontweight='bold')
    plt.xlabel('Step')
    plt.ylabel('Value')
    plt.yscale('log')  # values span several orders of magnitude
    plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Summary statistics from our complete analysis (all rows of df_stats)
print('\n' + '=' * 60)
print('COMPLETE ANALYSIS SUMMARY')
print('=' * 60)
print(f'Numbers analyzed: {len(df_stats):,}')
print(f'Average sequence length: {df_stats["sequence_length"].mean():.2f}')
print(f'Median sequence length: {df_stats["sequence_length"].median():.2f}')
print(f'Standard deviation: {df_stats["sequence_length"].std():.2f}')
print(f'Shortest sequence: {df_stats["sequence_length"].min()} (number: {df_stats.loc[df_stats["sequence_length"].idxmin(), "starting_number"]})') 
print(f'Longest sequence: {df_stats["sequence_length"].max()} (number: {df_stats.loc[df_stats["sequence_length"].idxmax(), "starting_number"]})') 
print(f'Range: {df_stats["sequence_length"].max() - df_stats["sequence_length"].min()}')

print('\nThis analysis demonstrates the fascinating and unpredictable nature of Collatz sequences!')
print('Despite the simple rules, the sequences exhibit complex patterns and behaviors.')