# EvolvePro - Local Environment Setup

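This notebook is adapted to run in your local environment instead of Google Colab. The setup cell below resolves the project root as the parent of the kernel's working directory (`..`), so launch the notebook from a folder directly inside the repository.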

## Setup: Path Configuration

In [2]:
import os
import sys

# The repository root is taken to be the parent of the current working
# directory; it is placed first on the import search path.
project_root = os.path.abspath('..')
sys.path.insert(0, project_root)

# Every location used by the notebook is derived from the repository root.
base_dir = project_root
output_dir, rounds_data_dir = (
    os.path.join(base_dir, 'output'),
    os.path.join(base_dir, 'colab', 'rounds_data'),
)

# Only the output directory is created; an existing one is left as-is.
os.makedirs(output_dir, exist_ok=True)

# Echo the resolved locations, one per line.
print(
    f'Project root: {project_root}',
    f'Output directory: {output_dir}',
    f'Rounds data directory: {rounds_data_dir}',
    sep='\n',
)

Project root: /Users/jmsung/projects/EvolvePro
Output directory: /Users/jmsung/projects/EvolvePro/output
Rounds data directory: /Users/jmsung/projects/EvolvePro/colab/rounds_data


## Import Required Libraries

In [3]:
# Import the EvolvePro helpers used by the rest of this notebook.
# These resolve through the `evolvepro` package, so the setup cell that puts
# the project root on sys.path has to run first.
from evolvepro.src.process import generate_wt, generate_single_aa_mutants, suggest_initial_mutants
from evolvepro.src.evolve import evolve_experimental
from evolvepro.src.plot import read_exp_data, plot_variants_by_iteration

# Reached only when every import above succeeded.
print('All imports successful!')

  "(\d+)([A-Z]+)", expand=True


All imports successful!


## Process: Generate Wild Type and Mutants

In [4]:
# Wild-type amino-acid sequence that everything below is derived from.
wt_sequence = 'MAKEDNIEMQGTVLETLPNTMFRVELENGHVVTAHISGKMRKNYIRILTGDKVTVELTPYDLSKGRIVFRSR'

# Both FASTA files are placed in the notebook's output directory.
wt_fasta_path = os.path.join(output_dir, 'kelsic_WT.fasta')
mutants_fasta_path = os.path.join(output_dir, 'kelsic.fasta')

# Write the wild type first, then derive the single-substitution library from
# that file (the second call reads the path produced by the first).
generate_wt(wt_sequence, output_file=wt_fasta_path)
generate_single_aa_mutants(wt_fasta_path, output_file=mutants_fasta_path)

print(
    f'Generated wild type: {wt_fasta_path}',
    f'Generated mutants: {mutants_fasta_path}',
    sep='\n',
)

Number of mutants: 1369
Generated wild type: /Users/jmsung/projects/EvolvePro/output/kelsic_WT.fasta
Generated mutants: /Users/jmsung/projects/EvolvePro/output/kelsic.fasta


In [5]:
# Ask for a first batch of 12 mutants from the generated library; the fixed
# seed is passed so the call receives the same seed on every run.
suggest_initial_mutants(
    mutants_fasta_path,
    num_mutants=12,
    random_seed=42,
)


Suggested 12 mutants for testing:
1. R23K
2. T58E
3. I36D
4. V31C
5. I7A
6. K3F
7. Q10P
8. G38E
9. E4M
10. D61W
11. E4Y
12. R23N


## PLM: Extract Embeddings

In [6]:
# Run the project's ESM extraction script (the `extract_mps.py` variant, which
# per its name targets Apple-Silicon MPS) in a child process.
# Inputs: the mutants FASTA produced earlier; outputs are directed to
# `output_dir` via --concatenate_dir.
# NOTE(review): the embeddings file name used later (`<embeddings_name>.csv`)
# assumes the script writes a CSV with that name there; confirm against the script.
import subprocess
import sys

embeddings_name = 'kelsic_esm1b_t33_650M_UR50S'
esm_script = os.path.join(project_root, 'evolvepro', 'plm', 'esm', 'extract_mps.py')

cmd = [
    # Launch with the kernel's own interpreter so the child process uses the
    # same environment as this notebook, not whichever `python` is first on PATH.
    sys.executable, esm_script,
    'esm1b_t33_650M_UR50S',
    mutants_fasta_path,
    embeddings_name,
    '--toks_per_batch', '512',
    '--include', 'mean',
    '--concatenate_dir', output_dir
]

print('Running ESM extraction with MPS support...')
# Double quotes on the outside: reusing the enclosing quote character inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
print(f"Command: {' '.join(cmd)}")

try:
    result = subprocess.run(cmd, cwd=project_root, capture_output=True, text=True)
except OSError as e:
    # Raised when the process cannot be started at all (e.g. missing executable).
    print(f'Error running ESM extraction: {e}')
else:
    # stdout is captured rather than streamed, so it must be echoed explicitly;
    # otherwise there is nothing "above" for the reader to inspect.
    if result.stdout:
        print(result.stdout)
    if result.returncode == 0:
        print('ESM extraction completed successfully!')
        print('Check the output above to see if MPS (GPU) was used.')
    else:
        print(f'ESM extraction failed (exit code {result.returncode}) with error: {result.stderr}')

Running ESM extraction...
Command: python /Users/jmsung/projects/EvolvePro/evolvepro/plm/esm/extract.py esm1b_t33_650M_UR50S /Users/jmsung/projects/EvolvePro/output/kelsic.fasta kelsic_esm1b_t33_650M_UR50S --toks_per_batch 512 --include mean --concatenate_dir /Users/jmsung/projects/EvolvePro/output


KeyboardInterrupt: 

## Run EVOLVEpro

In [None]:
# Settings shared by every evolve_experimental call in the round cells below.
protein_name = 'kelsic'
number_of_variants = 12
rename_WT = False

# Input locations, reusing the directories and embeddings name set up earlier.
embeddings_base_path = output_dir
embeddings_file_name = f'{embeddings_name}.csv'
round_base_path = rounds_data_dir

# Echo the effective configuration, one item per line.
print(
    f'Protein: {protein_name}',
    f'Embeddings: {os.path.join(embeddings_base_path, embeddings_file_name)}',
    f'Rounds data: {round_base_path}',
    f'WT fasta: {wt_fasta_path}',
    sep='\n',
)

### Round 1

In [None]:
# Round 1 passes only the Round1 spreadsheet.
round_name = 'Round1'
round_file_names = [
    'kelsic_Round1.xlsx',
]

# Arguments are positional; their order must stay exactly as written.
this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name, round_name,
    embeddings_base_path, embeddings_file_name,
    round_base_path, round_file_names,
    wt_fasta_path, rename_WT, number_of_variants,
    output_dir,
)

print(f'Round 1 completed. Selected variants: {this_round_variants}')

### Round 2

In [None]:
# Round 2 passes the Round1 and Round2 spreadsheets together.
round_name = 'Round2'
round_file_names = [
    'kelsic_Round1.xlsx',
    'kelsic_Round2.xlsx',
]

# Arguments are positional; their order must stay exactly as written.
this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name, round_name,
    embeddings_base_path, embeddings_file_name,
    round_base_path, round_file_names,
    wt_fasta_path, rename_WT, number_of_variants,
    output_dir,
)

print(f'Round 2 completed. Selected variants: {this_round_variants}')

### Round 3

In [None]:
# Round 3 passes the spreadsheets for rounds 1 through 3.
round_name = 'Round3'
round_file_names = [
    'kelsic_Round1.xlsx',
    'kelsic_Round2.xlsx',
    'kelsic_Round3.xlsx',
]

# Arguments are positional; their order must stay exactly as written.
this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name, round_name,
    embeddings_base_path, embeddings_file_name,
    round_base_path, round_file_names,
    wt_fasta_path, rename_WT, number_of_variants,
    output_dir,
)

print(f'Round 3 completed. Selected variants: {this_round_variants}')

### Round 4

In [None]:
# Round 4 passes the spreadsheets for rounds 1 through 4.
round_name = 'Round4'
round_file_names = [
    'kelsic_Round1.xlsx',
    'kelsic_Round2.xlsx',
    'kelsic_Round3.xlsx',
    'kelsic_Round4.xlsx',
]

# Arguments are positional; their order must stay exactly as written.
this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name, round_name,
    embeddings_base_path, embeddings_file_name,
    round_base_path, round_file_names,
    wt_fasta_path, rename_WT, number_of_variants,
    output_dir,
)

print(f'Round 4 completed. Selected variants: {this_round_variants}')

### Round 5

In [None]:
# Round 5 passes the spreadsheets for all five rounds.
round_name = 'Round5'
round_file_names = [
    'kelsic_Round1.xlsx',
    'kelsic_Round2.xlsx',
    'kelsic_Round3.xlsx',
    'kelsic_Round4.xlsx',
    'kelsic_Round5.xlsx',
]

# Arguments are positional; their order must stay exactly as written.
this_round_variants, df_test, df_sorted_all = evolve_experimental(
    protein_name, round_name,
    embeddings_base_path, embeddings_file_name,
    round_base_path, round_file_names,
    wt_fasta_path, rename_WT, number_of_variants,
    output_dir,
)

print(f'Round 5 completed. Selected variants: {this_round_variants}')

## Plot Results

In [None]:
# Read experimental data and create plots
round_file_names = ['kelsic_Round1.xlsx', 'kelsic_Round2.xlsx', 'kelsic_Round3.xlsx', 'kelsic_Round4.xlsx', 'kelsic_Round5.xlsx']

df = read_exp_data(round_base_path, round_file_names, wt_fasta_path)
plot_variants_by_iteration(df, activity_column='activity', output_dir=output_dir, output_file='kelsic')

print('Plots generated successfully!')
print(f'Output directory: {output_dir}')