# Simple AlphaFold Prediction

• **Run the first cell**

• **Paste in your protein sequence**

• **Run the second cell** (takes a while!)

• **Run the last cell to save the results back to your Google Drive** (authorization dialogs will appear)

In [None]:
# Paste your protein sequence here
import ipywidgets as widgets
from IPython.display import display, HTML

# Text box the student pastes the sequence into; the prediction cell reads
# its contents through the `sequence_input` name.
sequence_input = widgets.Textarea(
    value='',  # Start empty
    description='',
    placeholder='Paste your protein sequence here (e.g., MKTVRQERLKSIVRILERSKEPVSGAQ...)',
    layout=widgets.Layout(height='150px', width='100%'),
)

# Render heading, cleaning note, the input box, and the "what next" hint,
# in that order, with a single display call.
display(
    HTML('<h3 style="color: #2e7d32;">üìã Enter Your Protein Sequence:</h3>'),
    HTML('<p style="color: #666;">Invalid amino acids (X, B, Z, etc.) will be automatically removed</p>'),
    sequence_input,
    HTML('<p style="color: #666; margin-top: 10px;">üëá After pasting your sequence, run the next cell to start prediction</p>'),
)

In [None]:
# ============================================================================
# Run this cell to predict your protein structure
# ============================================================================

import os
import sys
import re
import time
import warnings
import subprocess
from pathlib import Path
from datetime import datetime
from IPython.display import clear_output

warnings.filterwarnings('ignore')

# Global variables to share with download cell
ALPHAFOLD_RESULTS_DIR = None
ALPHAFOLD_JOB_NAME = None

# One-letter codes of the 20 standard amino acids; everything else is dropped.
VALID_AMINO_ACIDS = frozenset('ACDEFGHIKLMNPQRSTVWY')


def clean_sequence(raw_text):
    """Turn pasted text into a plain amino-acid string.

    FASTA header lines (those starting with ``>``) are discarded first so
    that letters from the record name never end up in the sequence. After
    that, whitespace and digits are removed, the text is upper-cased, and
    every character that is not a standard amino acid is dropped.

    Args:
        raw_text: Text as pasted by the user (may span several lines).

    Returns:
        A ``(sequence, removed)`` tuple: the cleaned sequence and a dict
        mapping each removed character to how many times it occurred
        (in order of first appearance).
    """
    body_lines = [
        line for line in raw_text.splitlines()
        if not line.lstrip().startswith('>')
    ]
    compact = re.sub(r'[\s\d]', '', '\n'.join(body_lines)).upper()

    removed = {}
    for char in compact:
        if char not in VALID_AMINO_ACIDS:
            removed[char] = removed.get(char, 0) + 1

    sequence = ''.join(char for char in compact if char in VALID_AMINO_ACIDS)
    return sequence, removed


def rank_key(pdb_file):
    """Sort key that orders ColabFold PDB outputs by their ``_rank_N`` number.

    Args:
        pdb_file: Path (or file name string) of a result PDB.

    Returns:
        ``(rank, file_name)``; files without a rank in their name get an
        infinite rank so they sort last. The name breaks ties so the order
        is deterministic.
    """
    name = Path(pdb_file).name
    match = re.search(r'_rank_(\d+)', name)
    rank = int(match.group(1)) if match else float('inf')
    return (rank, name)


def ca_plddt_scores(pdb_text):
    """Collect the per-residue pLDDT values stored in a PDB string.

    Reads the B-factor field (columns 61-66) of every ``ATOM`` record whose
    atom-name field (columns 13-16) is ``CA``. Records whose B-factor field
    is not a number are skipped.

    Args:
        pdb_text: Full text of a PDB file.

    Returns:
        List of floats, one per C-alpha atom found (possibly empty).
    """
    scores = []
    for line in pdb_text.splitlines():
        if not line.startswith('ATOM') or line[12:16].strip() != 'CA':
            continue
        try:
            scores.append(float(line[60:66]))
        except ValueError:
            # Truncated or malformed record: ignore it rather than abort.
            continue
    return scores


def pip_install(*requirements):
    """Install the given requirement strings with pip.

    Uses an argument list (no shell) and the running interpreter's pip.
    Failure is reported but not raised, so the caller can keep going on a
    best-effort basis.

    Args:
        *requirements: Requirement specifiers passed to ``pip install -q``.

    Returns:
        True when pip exited with status 0, False otherwise.
    """
    proc = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', '-q', *requirements],
        capture_output=True,
        text=True,
    )
    if proc.returncode == 0:
        return True

    print(f"WARNING: pip install failed for: {' '.join(requirements)}")
    # Show only the tail of pip's output; that is where the actual error is.
    for err_line in proc.stderr.strip().splitlines()[-5:]:
        print(f"   {err_line}")
    return False


# Get the sequence from the input widget (None when the first cell was not run)
_sequence_widget = globals().get('sequence_input')
user_sequence = _sequence_widget.value.strip() if _sequence_widget is not None else ''

if _sequence_widget is None:
    print("Please run the first cell to create the sequence box, paste your sequence, and run this cell again.")
elif not user_sequence:
    print("‚ùå Please paste a protein sequence in the box above and run this cell again.")
else:
    print("="*70)
    print("üöÄ Starting AlphaFold prediction")
    print("="*70)

    # ========================================================================
    # Step 1: Validate and clean sequence
    # ========================================================================
    print("\nüìù Processing sequence...")

    # Strip FASTA headers, whitespace, digits and non-standard residues
    final_sequence, invalid_chars = clean_sequence(user_sequence)

    # Report cleaning
    if invalid_chars:
        print(f"‚ö†Ô∏è  Removed invalid characters: {', '.join(f'{c}({n}x)' for c, n in invalid_chars.items())}")

    # Check sequence length
    seq_length = len(final_sequence)
    if seq_length < 10:
        print(f"‚ùå Sequence too short ({seq_length} residues). Minimum is 10.")
        sys.exit()
    elif seq_length > 2500:
        print(f"‚ùå Sequence too long ({seq_length} residues). Maximum for Colab is 2500.")
        sys.exit()

    print(f"‚úÖ Valid sequence: {seq_length} amino acids")

    # ========================================================================
    # Step 2: Install required packages
    # ========================================================================
    print("\nüì¶ Installing AlphaFold (this takes 2-3 minutes on first run)...")

    # A list (not a generator) so every install is attempted even if one fails.
    install_results = [
        # JAX 0.5.3 (compatible with Colab's NumPy 2.0)
        pip_install(
            'jax[cuda12]==0.5.3', 'jaxlib==0.5.3', 'flax==0.9.0',
            'optax==0.2.3', 'orbax-checkpoint==0.8.0',
        ),
        # ColabFold
        pip_install('colabfold[alphafold-minus-jax]@git+https://github.com/sokrypton/ColabFold'),
        # Visualization tools
        pip_install('py3Dmol'),
    ]

    # Fix TensorFlow compatibility issue
    os.system("rm -f /usr/local/lib/python3.*/dist-packages/tensorflow/core/kernels/libtfkernel_sobol_op.so")

    if all(install_results):
        print("‚úÖ Installation complete")
    else:
        print("WARNING: some packages failed to install; continuing with what is available.")

    # ========================================================================
    # Step 3: Import modules
    # ========================================================================
    try:
        from colabfold.download import download_alphafold_params
        from colabfold.batch import get_queries, run, set_model_type
        from colabfold.utils import setup_logging as setup_cf_logging
        import py3Dmol
    except ImportError as import_error:
        # Give a readable hint before the traceback; the error is still raised.
        print(f"Could not import a required package ({import_error}).")
        print("Check the pip warnings above, restart the runtime, and run this cell again.")
        raise

    # ========================================================================
    # Step 4: Run AlphaFold prediction
    # ========================================================================
    print("\nüß¨ Starting structure prediction (balanced quality mode)...")
    print("‚è±Ô∏è  This typically takes:")
    print(f"   ‚Ä¢ {seq_length} residues: ~{max(10, seq_length//25)} minutes on T4 GPU")
    print("   ‚Ä¢ First run downloads data (extra 2-3 minutes)")
    print("   ‚Ä¢ Using 3 models for better accuracy")
    print("\n" + "="*70)

    # Create job directory
    job_name = f"protein_{datetime.now().strftime('%H%M%S')}"
    job_dir = Path(f"/content/{job_name}")
    job_dir.mkdir(parents=True, exist_ok=True)

    # Store globally for download cell
    ALPHAFOLD_RESULTS_DIR = job_dir
    ALPHAFOLD_JOB_NAME = job_name

    # Save sequence to FASTA file
    fasta_path = job_dir / f"{job_name}.fasta"
    with open(fasta_path, 'w') as f:
        f.write(f">{job_name}\n{final_sequence}\n")

    # Parse queries
    queries, is_complex = get_queries(fasta_path)
    model_type = set_model_type(is_complex, "auto")

    # Download AlphaFold parameters (cached after first download)
    print("üì• Downloading AlphaFold parameters (first time only)...")
    download_alphafold_params(model_type, Path("."))

    # Setup logging
    setup_cf_logging(job_dir / "prediction.log")

    # Run prediction with balanced quality settings
    print("üî¨ Running AlphaFold with balanced settings...")
    print("   MSA generation: ~2-10 minutes")
    print("   Structure prediction: ~5-20 minutes (3 models)")
    print("\n" + "-"*70)

    start_time = time.time()

    try:
        # Balanced settings for better quality
        results = run(
            queries=queries,
            result_dir=job_dir,
            is_complex=is_complex,
            msa_mode="MMseqs2 (UniRef+Environmental)",
            model_type=model_type,
            num_models=3,  # Using 3 models for better accuracy
            num_recycle=5,  # More recycling for refinement
            model_order=[1, 2, 3],  # Run models 1, 2, and 3
            use_gpu_relax=False,
            relax_max_iterations=0,
            stop_at_score=90,  # Higher threshold for early stopping
            data_dir=Path("."),  # Critical for finding parameters
            user_agent="colabfold/student-notebook-balanced",  # Identifies this notebook version
            overwrite_existing_results=False
        )

        elapsed = time.time() - start_time
        print("-"*70)
        print(f"\n‚úÖ Prediction complete! (Time: {elapsed/60:.1f} minutes)")

        # Find output PDB file
        pdb_files = list(job_dir.glob("*.pdb"))
        if pdb_files:
            # Order by the rank number embedded in the file name so that the
            # first entry really is the top-ranked model.
            pdb_files_sorted = sorted(pdb_files, key=rank_key)
            pdb_path = pdb_files_sorted[0]
            print(f"\nüìÑ Best structure saved to: {pdb_path}")
            print(f"   Generated {len(pdb_files)} models, showing best ranked")
            print(f"üíæ Download from: Files panel (left sidebar) ‚Üí {job_dir.name}/")

            # ================================================================
            # Step 5: Visualize structure
            # ================================================================
            print("\nüé® Visualizing best structure...")

            with open(pdb_path, 'r') as f:
                pdb_string = f.read()

            # Create 3D visualization
            view = py3Dmol.view(width=800, height=600)
            view.addModel(pdb_string, 'pdb')
            view.setStyle({'cartoon': {'color': 'spectrum'}})
            view.setBackgroundColor('white')
            view.zoomTo()

            print("\n" + "="*70)
            print("üéâ Your protein structure:")
            print("="*70)
            print("   ‚Ä¢ Drag to rotate")
            print("   ‚Ä¢ Scroll to zoom")
            print("   ‚Ä¢ Right-click to pan")
            print("="*70)
            view.show()

            # Get pLDDT scores (stored in the B-factor column) for quality assessment
            plddt_scores = ca_plddt_scores(pdb_string)

            if plddt_scores:
                avg_plddt = sum(plddt_scores) / len(plddt_scores)
                print(f"\nüìä Structure Quality:")
                print(f"   Average pLDDT: {avg_plddt:.1f}/100")
                if avg_plddt >= 90:
                    print("   ‚≠ê Excellent confidence")
                elif avg_plddt >= 70:
                    print("   ‚úÖ Good confidence")
                elif avg_plddt >= 50:
                    print("   ‚ö†Ô∏è  Moderate confidence")
                else:
                    print("   ‚ö†Ô∏è  Low confidence - interpret with caution")

            print("\n" + "="*70)
            print("‚ú® Done! Your structure is ready.")
            print("üíæ Run the next cell to save to Google Drive")
            print("="*70)

        else:
            print("‚ùå No PDB file generated - prediction may have failed")
            ALPHAFOLD_RESULTS_DIR = None  # Clear if failed

    except Exception as e:
        print(f"\n‚ùå Prediction failed: {str(e)}")
        print("\nTroubleshooting:")
        print("1. Check GPU is enabled (Runtime ‚Üí Change runtime type ‚Üí T4)")
        print("2. Try restarting runtime and running again")
        print("3. Ensure sequence is valid protein sequence")
        ALPHAFOLD_RESULTS_DIR = None  # Clear if failed

In [None]:
# ============================================================================
# Save your results to Google Drive (optional)
# ============================================================================

import os
import shutil
from datetime import datetime


def archive_results(results_dir, dest_dir, zip_filename, staging_dir="/content"):
    """Zip a results folder and copy the archive into another folder.

    The archive is first built in ``staging_dir`` and then copied to
    ``dest_dir``. The staging copy is removed whether or not the copy
    succeeds, so a failed upload does not leave a stray zip behind.

    Args:
        results_dir: Folder whose contents are archived.
        dest_dir: Folder that receives the finished zip.
        zip_filename: File name of the archive (normally ending in ``.zip``).
        staging_dir: Folder in which the temporary archive is created.

    Returns:
        Path of the archive inside ``dest_dir``.

    Raises:
        FileNotFoundError: If ``results_dir`` is not an existing folder.
    """
    if not os.path.isdir(results_dir):
        raise FileNotFoundError(f"Results folder not found: {results_dir}")

    # make_archive appends the ".zip" suffix itself, so pass the bare stem.
    archive_base = os.path.join(staging_dir, os.path.splitext(zip_filename)[0])
    staged_zip = shutil.make_archive(archive_base, 'zip', results_dir)
    try:
        return shutil.copy(staged_zip, os.path.join(dest_dir, zip_filename))
    finally:
        os.remove(staged_zip)


try:
    # Check if we have results to save
    if globals().get('ALPHAFOLD_RESULTS_DIR') is None:
        print("‚ùå No results to save. Please run the prediction cell first.")
    else:
        print("="*70)
        print("üíæ Saving to Google Drive")
        print("="*70)

        # Imported here because it only exists inside Google Colab.
        from google.colab import drive

        # Mount Google Drive (will show popup if not already mounted)
        print("\nüìÅ Connecting to Google Drive...")
        print("   (Click 'Connect' if prompted)")

        # Check if already mounted
        if not os.path.exists('/content/drive/MyDrive'):
            drive.mount('/content/drive')
            print("‚úÖ Google Drive connected")
        else:
            print("‚úÖ Google Drive already connected")

        # Create timestamp for unique naming
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        zip_filename = f"alphafold_results_{timestamp}.zip"

        # Create zip file and copy it to Google Drive (local zip is cleaned up
        # inside the helper, even when the copy fails)
        print(f"\nüì¶ Creating archive: {zip_filename}")
        drive_path = archive_results(
            ALPHAFOLD_RESULTS_DIR,
            "/content/drive/MyDrive",
            zip_filename,
        )

        # Success message
        print("\n" + "="*70)
        print("‚úÖ Saved to Google Drive!")
        print("="*70)
        print(f"üìç Location: My Drive/{zip_filename}")
        print(f"üìä Contents: PDB structure, MSA, logs, and all prediction files")
        print("\nüí° To find your file:")
        print("   1. Open Google Drive in a new tab")
        print("   2. Look for: " + zip_filename)
        print("   3. Download and unzip to access all files")
        print("="*70)

except Exception as e:
    print(f"‚ö†Ô∏è  Could not save to Drive: {str(e)}")
    print("\nüí° Alternative: Use the Files panel (left sidebar) to download manually")