# GTO Performance Analysis

This notebook analyzes the runtime and peak memory of the GTO workflow on several FASTA test files (partial genomes from different species, all of roughly the same size).

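Workflow (each step is run separately through intermediate files so that it can be timed on its own): `gto_fasta_complement < input.fa | gto_fasta_mutate -e 0.5 | gto_fasta_to_seq | gto_genomic_extract -i 0 -e 1000 | gto_fasta_from_seq | gto_fasta_reverse > output.txt`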

In [3]:
import os
import shlex
import shutil
import statistics
import subprocess
import tempfile
import time
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import psutil
import seaborn as sns

# Plot appearance: matplotlib's "default" style sheet plus seaborn's "husl" palette
plt.style.use("default")
sns.set_palette(palette="husl")

In [4]:
# Configuration
BIN_PATH = '../../gto/bin'
TEST_FILES = [
    'AllMis_2400_parcial.fasta',
    'BraLanc_464_parcial.fasta',
    'HydCol_1000_parcial.fasta',
    'HomoSapiens_3300_parcial.fasta',
]

# Get file sizes for reference.
# The names in TEST_FILES are resolved against the notebook's working directory.
# A missing file is reported and skipped instead of aborting the whole cell,
# matching how the test-run cell treats missing inputs.
file_sizes = {}
for file in TEST_FILES:
    if not os.path.exists(file):
        print(f"Warning: Test file {file} not found")
        continue
    size = os.path.getsize(file)
    file_sizes[file] = size
    print(f"{file}: {size:,} bytes ({size/1024/1024:.2f} MB)")

AllMis_2400_parcial.fasta: 810,018 bytes (0.77 MB)
BraLanc_464_parcial.fasta: 810,019 bytes (0.77 MB)
HydCol_1000_parcial.fasta: 810,010 bytes (0.77 MB)
HomoSapiens_3300_parcial.fasta: 809,991 bytes (0.77 MB)


In [5]:
def _parse_time_report(report_text):
    """Parse a GNU time report written with the format ``"%e %M %P"``.

    Returns a tuple ``(elapsed_seconds, max_rss_mb, cpu_percent)``.
    Raises ValueError when the report has no usable elapsed/memory fields.
    """
    report_lines = [line for line in report_text.splitlines() if line.strip()]
    if not report_lines:
        raise ValueError("timing report is empty")

    # GNU time puts a "Command exited with non-zero status N" (or
    # "Command terminated by signal N") line in front of the formatted
    # report when the command fails, so the measurements are on the last line.
    fields = report_lines[-1].split()
    if len(fields) < 2:
        raise ValueError(f"unexpected timing report: {report_lines[-1]!r}")

    elapsed_s = float(fields[0])        # %e: elapsed wall-clock seconds
    max_rss_mb = float(fields[1]) / 1024  # %M: max RSS, reported in KB

    # %P is printed as "?%" when the elapsed time is too short to compute a
    # percentage. That must not invalidate the time and memory values.
    cpu_percent = 0.0
    if len(fields) > 2:
        try:
            cpu_percent = float(fields[2].rstrip('%'))
        except ValueError:
            cpu_percent = 0.0

    return elapsed_s, max_rss_mb, cpu_percent


def measure_workflow_performance_improved(input_file, bin_dir='../../gto/bin',
                                          data_dir='../../tests/local_test'):
    """
    Execute the GTO workflow step by step and measure each step.

    Every step is wrapped in /usr/bin/time, which writes elapsed time,
    maximum resident set size and CPU percentage to a per-step timing file.
    Steps communicate through intermediate files in a temporary directory
    that is always removed afterwards, even if an exception occurs.

    Parameters:
        input_file: name of the FASTA file inside ``data_dir``.
        bin_dir: directory containing the gto_* executables; the commands are
            run with this directory as working directory.
        data_dir: directory containing the input files, resolved against the
            current working directory of the notebook.

    Returns:
        dict with the keys
        'runtime' (sum of the step times in seconds),
        'max_memory_mb' (largest per-step max RSS in MB),
        'return_code' (0 if every executed step succeeded, otherwise the
        return code of the failing step),
        'step_results' (list of per-step dicts) and
        'output' (text content of the last step's output file, '' if absent).
    """
    workflow = [
        'gto_fasta_complement',
        'gto_fasta_mutate -e 0.5',
        'gto_fasta_to_seq',
        'gto_genomic_extract -i 0 -e 1000',
        'gto_fasta_from_seq',
        'gto_fasta_reverse',
    ]
    last_step = len(workflow)

    # The commands run with cwd=bin_dir, so a relative path would be resolved
    # against bin_dir. Using an absolute path guarantees that the size check
    # below and the command itself look at the same file.
    input_path = os.path.abspath(os.path.join(data_dir, input_file))

    print(f"  Executing workflow for: {input_file}")
    if os.path.exists(input_path):
        input_size = os.path.getsize(input_path)
        print(f"  Input file size: {input_size:,} bytes")

    total_runtime = 0
    total_max_memory = 0
    step_results = []
    final_output = ""

    # Create unique temporary directory for this test
    temp_dir = tempfile.mkdtemp(prefix='gto_test_')
    try:
        step_files = [
            os.path.join(temp_dir, f'step{i}.txt') for i in range(1, last_step + 1)
        ]
        timing_files = [
            os.path.join(temp_dir, f'timing{i}.txt') for i in range(1, last_step + 1)
        ]
        # Step 1 reads the input file, every later step reads its predecessor's output.
        step_inputs = [input_path] + step_files[:-1]

        for i, tool in enumerate(workflow, 1):
            timing_file = timing_files[i - 1]
            step_output = step_files[i - 1]

            # Paths are quoted so that directories containing spaces or shell
            # metacharacters do not break the redirections.
            cmd = (
                f'/usr/bin/time -f "%e %M %P" -o {shlex.quote(timing_file)} '
                f'./{tool} < {shlex.quote(step_inputs[i - 1])} > {shlex.quote(step_output)}'
            )
            print(f"    Step {i}: {tool}")

            step_start = time.time()
            result = subprocess.run(
                cmd,
                shell=True,
                cwd=bin_dir,
                capture_output=True,
                text=True
            )
            wall_time = time.time() - step_start

            # Read timing information; fall back to the wall-clock time
            # measured here when the report is missing or unreadable.
            try:
                with open(timing_file, 'r') as f:
                    step_time, step_memory, cpu_percent = _parse_time_report(f.read())
            except (OSError, ValueError) as e:
                print(f"      → Warning: Could not read timing data: {e}")
                step_time, step_memory, cpu_percent = wall_time, 0, 0

            total_runtime += step_time
            total_max_memory = max(total_max_memory, step_memory)

            # Check output file
            output_exists = os.path.exists(step_output)
            output_size = os.path.getsize(step_output) if output_exists else 0
            label = "Final step" if i == last_step else f"Step {i}"
            print(f"      → {label} completed: {step_time:.3f}s, {step_memory:.2f} MB, output: {output_size:,} bytes")

            # Show first line of intermediate outputs for verification (best effort)
            if i < last_step and output_exists:
                try:
                    with open(step_output, 'r') as f:
                        first_line = f.readline().strip()
                    preview = first_line[:100] + "..." if len(first_line) > 100 else first_line
                    print(f"      → Preview: {preview}")
                except (OSError, UnicodeDecodeError):
                    pass

            step_results.append({
                'step': i,
                'runtime': step_time,
                'memory_mb': step_memory,
                'cpu_percent': cpu_percent,
                'output_size': output_size,
                'return_code': result.returncode
            })

            if result.returncode != 0:
                print(f"    Step {i} failed with return code: {result.returncode}")
                print(f"    STDERR: {result.stderr}")
                break

        # Read final output
        if os.path.exists(step_files[-1]):
            try:
                with open(step_files[-1], 'r') as f:
                    final_output = f.read()
            except (OSError, UnicodeDecodeError) as e:
                print(f"  Warning: Could not read final output: {e}")
    finally:
        # Clean up temporary files, also when a step raised an exception
        try:
            shutil.rmtree(temp_dir)
        except OSError as e:
            print(f"  Warning: Cleanup failed: {e}")

    print(f"  Total Runtime: {total_runtime:.3f} seconds")
    print(f"  Peak Memory: {total_max_memory:.2f} MB")
    print(f"  Final Output length: {len(final_output)} characters")

    # Show memory usage per step for debugging
    print("  Memory by step:")
    for step in step_results:
        print(f"    Step {step['step']}: {step['memory_mb']:.2f} MB")

    # The loop stops at the first failing step, so the first non-zero code is
    # the failure. max() would hide negative codes (process killed by a signal).
    failure_code = next(
        (s['return_code'] for s in step_results if s['return_code'] != 0), 0
    )

    return {
        'runtime': total_runtime,
        'max_memory_mb': total_max_memory,
        'return_code': failure_code,
        'step_results': step_results,
        'output': final_output
    }


def run_multiple_tests(input_file, num_runs=3):
    """Repeat the workflow measurement and summarise runtime and memory.

    Calls measure_workflow_performance_improved(input_file) ``num_runs`` times,
    pausing 0.1 s after each run, and returns a dict with mean, standard
    deviation, minimum and maximum of the per-run runtime and peak memory,
    the fraction of runs whose return code was 0, and the raw per-run results
    under 'all_results'. The standard deviation is 0 for a single run.
    """
    print(f"\nRunning {num_runs} iterations for {input_file}")

    outcomes = []
    for run_index in range(num_runs):
        print(f"  Run {run_index + 1}/{num_runs}")
        outcomes.append(measure_workflow_performance_improved(input_file))
        time.sleep(0.1)  # brief pause before the next iteration

    def sample_stdev(values):
        # stdev needs at least two data points
        if len(values) > 1:
            return statistics.stdev(values)
        return 0

    runtimes = [outcome['runtime'] for outcome in outcomes]
    memories = [outcome['max_memory_mb'] for outcome in outcomes]
    successful_runs = sum(1 for outcome in outcomes if outcome['return_code'] == 0)

    return {
        'mean_runtime': statistics.mean(runtimes),
        'std_runtime': sample_stdev(runtimes),
        'min_runtime': min(runtimes),
        'max_runtime': max(runtimes),
        'mean_memory': statistics.mean(memories),
        'std_memory': sample_stdev(memories),
        'min_memory': min(memories),
        'max_memory': max(memories),
        'success_rate': successful_runs / len(outcomes),
        'all_results': outcomes
    }


def validate_memory_scaling(test_files=None):
    """
    Quick validation test relating peak memory to input size.

    Runs the workflow once per file and reports the peak memory used per MB
    of input.

    Parameters:
        test_files: iterable of file names located in ../../tests/local_test.
            Defaults to the four partial-genome FASTA files used elsewhere in
            this notebook.

    Returns:
        dict mapping each file that was found to a dict with the keys
        'file_size_mb', 'memory_mb', 'runtime' and 'memory_efficiency'
        (MB of peak memory per MB of input, 0 for an empty file).
        Files that do not exist are reported and left out.
    """
    print("\n" + "="*60)
    print("MEMORY SCALING VALIDATION")
    print("="*60)

    if test_files is None:
        test_files = [
            'AllMis_2400_parcial.fasta',
            'BraLanc_464_parcial.fasta',
            'HydCol_1000_parcial.fasta',
            'HomoSapiens_3300_parcial.fasta',
        ]

    baseline_results = {}
    for test_file in test_files:
        input_path = f"../../tests/local_test/{test_file}"
        if not os.path.exists(input_path):
            # Report the skip so a missing input does not go unnoticed
            print(f"Warning: Test file {test_file} not found")
            continue

        file_size_mb = os.path.getsize(input_path) / 1024 / 1024
        result = measure_workflow_performance_improved(test_file)

        baseline_results[test_file] = {
            'file_size_mb': file_size_mb,
            'memory_mb': result['max_memory_mb'],
            'runtime': result['runtime'],
            # Guard against division by zero for empty input files
            'memory_efficiency': result['max_memory_mb'] / file_size_mb if file_size_mb > 0 else 0
        }

    print("\nMemory Efficiency Analysis:")
    for filename, data in baseline_results.items():
        print(f"{filename}:")
        print(f"  File size: {data['file_size_mb']:.3f} MB")
        print(f"  Peak memory: {data['memory_mb']:.2f} MB")
        print(f"  Memory efficiency: {data['memory_efficiency']:.2f} MB memory per MB input")
        print(f"  Runtime: {data['runtime']:.3f} seconds")
        print()

    return baseline_results

In [6]:
# Benchmark every configured test file and collect the aggregated statistics
all_results = {}

for test_file in TEST_FILES:
    if not os.path.exists(test_file):
        print(f"Warning: Test file {test_file} not found")
        continue

    # Three iterations per file to average out run-to-run noise
    stats = run_multiple_tests(test_file, num_runs=3)
    all_results[test_file] = stats

    mean_runtime, std_runtime = stats['mean_runtime'], stats['std_runtime']
    mean_memory, std_memory = stats['mean_memory'], stats['std_memory']
    success_percent = stats['success_rate']*100

    print(f"\nStatistics for {test_file}:")
    print(f"  Runtime: {mean_runtime:.3f} ± {std_runtime:.3f} seconds")
    print(f"  Memory: {mean_memory:.2f} ± {std_memory:.2f} MB")
    print(f"  Success rate: {success_percent:.1f}%")

# Single-run check of peak memory relative to input size; as the last
# expression of the cell, its return value is displayed as the cell output.
validate_memory_scaling()


Running 3 iterations for AllMis_2400_parcial.fasta
  Run 1/3
  Executing workflow for: AllMis_2400_parcial.fasta
  Input file size: 810,018 bytes
    Step 1: -f "%e %M %P" -o /tmp/gto_test_izjb4skc/timing1.txt ./gto_fasta_complement < ../../tests/local_test/AllMis_2400_parcial.fasta
      → Step 1 completed: 0.010s, 1.75 MB, output: 810,018 bytes
      → Preview: >CM061332.1 Alligator mississippiensis isolate rAllMis1 chromosome 1, whole genome shotgun sequence
    Step 2: -f "%e %M %P" -o /tmp/gto_test_izjb4skc/timing2.txt ./gto_fasta_mutate -e 0.5 < /tmp/gto_test_izjb4skc/step1.txt
      → Step 2 completed: 0.030s, 1.88 MB, output: 810,018 bytes
      → Preview: >CM061332.1 Alligator mississippiensis isolate rAllMis1 chromosome 1, whole genome shotgun sequence
    Step 3: -f "%e %M %P" -o /tmp/gto_test_izjb4skc/timing3.txt ./gto_fasta_to_seq < /tmp/gto_test_izjb4skc/step2.txt
      → Step 3 completed: 0.000s, 1.74 MB, output: 799,920 bytes
      → Preview: AGCTGGGCGCTGGGATTAGACTCCGC

{'AllMis_2400_parcial.fasta': {'file_size_mb': 0.7724933624267578,
  'memory_mb': 1.99609375,
  'runtime': 0.03,
  'memory_efficiency': 2.583962331701271},
 'BraLanc_464_parcial.fasta': {'file_size_mb': 0.7724943161010742,
  'memory_mb': 1.875,
  'runtime': 0.03,
  'memory_efficiency': 2.427202324883737},
 'HydCol_1000_parcial.fasta': {'file_size_mb': 0.7724857330322266,
  'memory_mb': 1.99609375,
  'runtime': 0.04,
  'memory_efficiency': 2.5839878520018273},
 'HomoSapiens_3300_parcial.fasta': {'file_size_mb': 0.7724676132202148,
  'memory_mb': 1.8671875,
  'runtime': 0.04,
  'memory_efficiency': 2.417172536484973}}

In [7]:
# Store current performance results
import json
import datetime
import os

# Use actual results from the tests instead of hardcoded zeros
current_results = all_results if 'all_results' in globals() and all_results else {}

# Take a single timestamp so the metadata and the output file names always agree
run_timestamp = datetime.datetime.now()

# Add metadata
metadata = {
    "timestamp": run_timestamp.isoformat(),
    "platform": "local_machine",
    "description": "GTO workflow performance test using GNU time, 3 iterations per file",
    "workflow_steps": [
        "gto_fasta_complement",
        "gto_fasta_mutate -e 0.5",
        "gto_fasta_to_seq",
        "gto_genomic_extract -i 0 -e 1000",
        "gto_fasta_from_seq",
        "gto_fasta_reverse"
    ],
    "measurement_method": "gnu_time"
}

# Combine everything
full_results = {
    "metadata": metadata,
    "results": current_results
}

# Save as JSON
filename = f"gto_performance_local_{run_timestamp.strftime('%Y%m%d_%H%M%S')}.json"
with open(filename, 'w') as f:
    json.dump(full_results, f, indent=2)

print(f"Results saved to: {filename}")

# Also save as simple CSV for easy viewing
csv_filename = os.path.splitext(filename)[0] + '.csv'
with open(csv_filename, 'w') as f:
    f.write("# GTO Performance Results\n")
    f.write(f"# Platform: {metadata['platform']}\n")
    f.write(f"# Timestamp: {metadata['timestamp']}\n")
    f.write(f"# Description: {metadata['description']}\n\n")

    f.write("test_file,mean_runtime_s,std_runtime_s,mean_memory_mb,std_memory_mb,success_rate\n")
    # Dedicated loop names so `filename` keeps pointing at the JSON file
    for result_name, result_stats in current_results.items():
        f.write(
            f"{result_name},{result_stats['mean_runtime']},{result_stats['std_runtime']},"
            f"{result_stats['mean_memory']},{result_stats['std_memory']},{result_stats['success_rate']}\n"
        )

print(f"CSV saved to: {csv_filename}")

# Display summary
print("\n" + "="*50)
print("STORED RESULTS SUMMARY")
print("="*50)
if current_results:
    for result_name, result_stats in current_results.items():
        print(f"{result_name}:")
        print(f"  Runtime: {result_stats['mean_runtime']:.3f} ± {result_stats['std_runtime']:.3f} seconds")
        print(f"  Memory:  {result_stats['mean_memory']:.2f} ± {result_stats['std_memory']:.2f} MB")
        print(f"  Success: {result_stats['success_rate']*100:.1f}%")
        print()
else:
    print("No results available. Please run the test cell that builds all_results first to generate performance data.")
    print("Current all_results variable:", 'all_results' in globals())

Results saved to: gto_performance_local_20250901_210436.json
CSV saved to: gto_performance_local_20250901_210436.csv

STORED RESULTS SUMMARY
AllMis_2400_parcial.fasta:
  Runtime: 0.037 ± 0.006 seconds
  Memory:  1.92 ± 0.07 MB
  Success: 100.0%

BraLanc_464_parcial.fasta:
  Runtime: 0.033 ± 0.006 seconds
  Memory:  1.96 ± 0.07 MB
  Success: 100.0%

HydCol_1000_parcial.fasta:
  Runtime: 0.043 ± 0.006 seconds
  Memory:  1.92 ± 0.07 MB
  Success: 100.0%

HomoSapiens_3300_parcial.fasta:
  Runtime: 0.033 ± 0.006 seconds
  Memory:  1.88 ± 0.04 MB
  Success: 100.0%

