In [1]:
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import random
import json
import os
from gene import Gene
import sys
import numpy as np

from colabfold.batch import get_queries, run
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path

E0000 00:00:1732811963.665660   65401 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732811963.671251   65401 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
def save_protein(protein, ident, desc, filename):
    """Write one protein sequence to `filename` in FASTA format.

    Args:
        protein: Sequence object wrapped in the `SeqRecord`.
        ident: Value used as the record id.
        desc: Value used as the record description.
        filename: Destination handed to `SeqIO.write`.
    """
    SeqIO.write(
        SeqRecord(protein, id=ident, description=desc),
        filename,
        'fasta',
    )

In [3]:
def first_generation(n_genes, n_bases, filepath='evolution'):
    """Create the initial population and write each protein to a FASTA file.

    Files are written to `<filepath>/gen0/gen0_gene<i>.fasta`. The directory
    (including missing parents) is created if needed, and an existing
    directory is reused.

    Args:
        n_genes: Number of `Gene` objects to create.
        n_bases: Argument forwarded to the `Gene` constructor.
        filepath: Root directory under which the `gen0` folder is placed.

    Returns:
        The list of newly created `Gene` objects, in creation order.
    """
    # Build the directory from `filepath` so the files written below always
    # land in a directory that exists (it used to be hard-coded to 'evolution').
    gen_dir = f'{filepath}/gen0'
    os.makedirs(gen_dir, exist_ok=True)

    genes = [Gene(n_bases) for _ in range(n_genes)]
    for i, gene in enumerate(genes):
        ident = f'gen0_gene{i}'
        save_protein(gene.get_protein(), ident, str(gene), f'{gen_dir}/{ident}.fasta')
    return genes

In [4]:
# Keyword arguments forwarded to colabfold's `run` by `fold`.
# The trailing `#@param` annotations are kept from the Colab form this was
# adapted from; they are plain comments here.
colabfold_params = {
    'msa_mode': "single_sequence", #@param ["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)","single_sequence","custom"]
    'num_models': 1, #@param [1,2,3,4,5] {type:"raw"}
    'num_recycles': 3, #@param [1,3,6,12,24,48] {type:"raw"}
    'stop_at_score': 100, #@param {type:"string"}
    #'use_custom_msa': False,
    'num_relax': 0, #@param [0, 1, 5] {type:"raw"}
    'use_amber': False,
    'relax_max_iterations': 200, #@param [0,200,2000] {type:"raw"}
    'use_templates': False, #@param {type:"boolean"}
    # This key used to appear twice in the literal (both True); one entry kept.
    'keep_existing_results': True, #@param {type:"boolean"}
    'zip_results': False, #@param {type:"boolean"}
    'model_type': "auto",
    'model_order': [1, 2, 3, 4, 5],
    'data_dir': default_data_dir,
    'rank_by': "auto",
    'pair_mode': "unpaired+paired",
    'user_agent': "colabfold/google-colab-batch",
}

In [5]:
# This is carried over from the ColabFold Jupyter notebook. Amber relaxation is
# not used here, so it is not needed at the moment.
# (An intron, in real life.)

# For some reason we need that to get pdbfixer to import
#if use_amber and f"/usr/local/lib/python{python_version}/site-packages/" not in sys.path:
#    sys.path.insert(0, f"/usr/local/lib/python{python_version}/site-packages/")

In [6]:
def fold(input_dir, result_dir, kwargs):
    """Run ColabFold on the queries found in `input_dir`.

    Args:
        input_dir: Location passed to `get_queries`.
        result_dir: Passed to `run` as `result_dir`; the log file is set up at
            `<result_dir>/log.txt`.
        kwargs: Mapping of extra keyword arguments for `run`. It is copied,
            so the caller's mapping is left unmodified.

    Returns:
        Whatever `run` returns.

    NOTE(review): the logs captured in this notebook list queries as gene0
    then gene2, so the results may not follow input file order. Confirm how
    `get_queries` orders queries before pairing results with inputs by position.
    """
    setup_logging(Path(result_dir).joinpath("log.txt"))
    queries, is_complex = get_queries(input_dir)

    # Work on a copy: the same parameter dict is shared by every call, and
    # writing per-call values into it would leak state between generations.
    run_kwargs = dict(kwargs)
    run_kwargs.update(queries=queries, is_complex=is_complex, result_dir=result_dir)
    return run(**run_kwargs)

In [27]:
def calculate_fitnesses(results):
    """Turn fold metrics into one fitness score per query.

    For every entry of `results['metric']` the first element is used. Its
    `mean_plddt` is divided by 100 and averaged with its `ptm`, both weighted
    0.5.

    Args:
        results: Mapping with a `'metric'` key holding, per query, a sequence
            whose first item has `'mean_plddt'` and `'ptm'` entries.

    Returns:
        A list of fitness values, in the same order as `results['metric']`.
    """
    def _score(per_query):
        # Only the first entry of each query's metrics contributes.
        top = per_query[0]
        mean_plddt, ptm = top['mean_plddt'], top['ptm']
        return (mean_plddt / 100) * 0.5 + ptm * 0.5

    return [_score(per_query) for per_query in results['metric']]

In [8]:
def select(genes, fitnesses, n, method='weak'):
    """Pick `n` genes, with replacement, weighted by fitness.

    Args:
        genes: Population to draw from.
        fitnesses: Weights, one per gene, passed to `random.choices`.
        n: Number of draws.
        method: Selection strategy; only `'weak'` is implemented.

    Returns:
        A list of `n` genes for `method='weak'`. For any other method a
        message is printed and `None` is returned.
    """
    if method != 'weak':
        print('not implemented hehe :3')
        return None
    return random.choices(genes, weights=fitnesses, k=n)

In [15]:
# Build the starting population (10 genes, each constructed with Gene(30))
# and write one FASTA file per gene for folding.
# Create the output directory first so the cell works on a fresh checkout.
os.makedirs('proteins/gen0', exist_ok=True)
genes = [Gene(30) for _ in range(10)]
for i, gene in enumerate(genes):
    # Same record as before (id = index, description = str(gene)), written
    # through the shared helper instead of duplicating its body.
    save_protein(gene.get_protein(), str(i), str(gene), f'proteins/gen0/gene{i}.fasta')

In [16]:
# Fold every generation-0 FASTA file; outputs and the log go to folds/gen0.
results = fold(
    input_dir='proteins/gen0',
    result_dir='folds/gen0',
    kwargs=colabfold_params,
)

2024-11-28 11:40:44,708 Running on GPU
2024-11-28 11:40:45,250 Found 2 citations for tools or databases
2024-11-28 11:40:45,250 Query 1/10: gene0 (length 9)


I0000 00:00:1732812051.664983   65401 mlir_graph_optimization_pass.cc:401] MLIR V1 optimization pass is not enabled


2024-11-28 11:40:51,875 Padding length to 10
2024-11-28 11:41:01,873 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=71.8 pTM=0.045
2024-11-28 11:41:01,970 alphafold2_ptm_model_1_seed_000 recycle=1 pLDDT=68.6 pTM=0.044 tol=0.271
2024-11-28 11:41:02,055 alphafold2_ptm_model_1_seed_000 recycle=2 pLDDT=67.9 pTM=0.0442 tol=0.0969
2024-11-28 11:41:02,141 alphafold2_ptm_model_1_seed_000 recycle=3 pLDDT=67.1 pTM=0.0441 tol=0.044
2024-11-28 11:41:02,142 alphafold2_ptm_model_1_seed_000 took 10.3s (3 recycles)
2024-11-28 11:41:02,145 reranking models by 'plddt' metric
2024-11-28 11:41:02,146 rank_001_alphafold2_ptm_model_1_seed_000 pLDDT=67.1 pTM=0.0441
2024-11-28 11:41:02,469 Query 2/10: gene2 (length 9)
2024-11-28 11:41:03,619 Padding length to 10
2024-11-28 11:41:03,706 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=77.2 pTM=0.049
2024-11-28 11:41:03,790 alphafold2_ptm_model_1_seed_000 recycle=1 pLDDT=82.4 pTM=0.0501 tol=0.261
2024-11-28 11:41:03,876 alphafold2_ptm_model_1_seed_000 recycle=2

In [17]:
# Score generation 0 and keep 5 distinct genes, sampled in proportion to fitness.
# NOTE(review): the fold log lists queries as gene0 then gene2, so `results`
# may not be in the same order as `genes` — confirm before relying on this zip.
fitnesses = calculate_fitnesses(results)
# np.random.choice requires `p` to sum to 1, so normalise the raw fitnesses
# (same normalisation as the later selection cell).
selection = np.random.choice(genes, size=5, replace=False, p=np.divide(fitnesses, sum(fitnesses)))
# Survivors mapped to their fitness, in population order.
next_gen = {gene: fitness for gene, fitness in zip(genes, fitnesses) if gene in selection}

In [21]:
# Mutate every survivor once and write the mutants out as generation 1.
# makedirs(exist_ok=True) lets the cell be re-run and also creates a missing
# 'proteins' parent; os.mkdir raised in both situations.
os.makedirs('proteins/gen1', exist_ok=True)
mutations = [gene.random_mutation() for gene in next_gen.keys()]
for i, gene in enumerate(mutations):
    # Same record as before (id = index, description = str(gene)), written
    # through the shared helper instead of duplicating its body.
    save_protein(gene.get_protein(), str(i), str(gene), f'proteins/gen1/gene{i}.fasta')



In [22]:
# Fold every generation-1 FASTA file; outputs and the log go to folds/gen1.
results = fold(
    input_dir='proteins/gen1',
    result_dir='folds/gen1',
    kwargs=colabfold_params,
)

2024-11-28 11:44:46,490 Running on GPU
2024-11-28 11:44:46,494 Found 2 citations for tools or databases
2024-11-28 11:44:46,494 Query 1/5: gene1 (length 9)
2024-11-28 11:44:50,383 Padding length to 18
2024-11-28 11:45:00,517 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=76.9 pTM=0.046
2024-11-28 11:45:00,648 alphafold2_ptm_model_1_seed_000 recycle=1 pLDDT=78.3 pTM=0.0475 tol=0.222
2024-11-28 11:45:00,767 alphafold2_ptm_model_1_seed_000 recycle=2 pLDDT=79.3 pTM=0.0478 tol=0.227
2024-11-28 11:45:00,890 alphafold2_ptm_model_1_seed_000 recycle=3 pLDDT=77.9 pTM=0.0478 tol=0.0917
2024-11-28 11:45:00,890 alphafold2_ptm_model_1_seed_000 took 10.5s (3 recycles)
2024-11-28 11:45:00,901 reranking models by 'plddt' metric
2024-11-28 11:45:00,901 rank_001_alphafold2_ptm_model_1_seed_000 pLDDT=77.9 pTM=0.0478
2024-11-28 11:45:01,207 Query 2/5: gene2 (length 10)
2024-11-28 11:45:02,343 Padding length to 18
2024-11-28 11:45:02,464 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=70.6 pTM=0.0425
2024-

In [28]:
# Score the mutants, pool them with the surviving parents, and sample the
# next 5 survivors in proportion to fitness.
mutation_fitnesses = calculate_fitnesses(results)
genes = [*next_gen.keys(), *mutations]
fitnesses = [*next_gen.values(), *mutation_fitnesses]
selection = np.random.choice(
    genes,
    size=5,
    replace=False,
    p=np.divide(fitnesses, sum(fitnesses)),  # normalise so the weights sum to 1
)
# Keep (gene, fitness) pairs for the sampled genes, in pool order.
next_gen = dict(pair for pair in zip(genes, fitnesses) if pair[0] in selection)