In [18]:
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import random
import json
import os
from gene import Gene
import sys

from colabfold.batch import get_queries, run
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path

In [2]:
def save_protein(protein, ident, desc, filename):
    """Write one protein sequence to `filename` as a FASTA record.

    Args:
        protein: Sequence object handed to ``SeqRecord`` as the record body.
        ident: Record id placed in the FASTA header.
        desc: Free-text description placed after the id in the header.
        filename: Destination passed straight to ``SeqIO.write``.
    """
    SeqIO.write(
        SeqRecord(protein, id=ident, description=desc),
        filename,
        'fasta',
    )

In [3]:
# Notebook-level accumulator for per-generation data; starts out empty.
generations = list()

In [4]:
# Smoke test: build a single Gene and display the protein it yields
# (the bare last expression is what the cell renders as output).
# NOTE(review): the argument is presumably the number of bases, as
# first_generation passes `n_bases` the same way — confirm in gene.py.
gene = Gene(10)
gene.get_protein()



Seq('TIL')

In [5]:
def first_generation(n_genes, n_bases, filepath='evolution'):
    """Create the initial population and write each gene's protein to disk.

    Each gene's protein is saved as ``<filepath>/gen0/gen0_gene<i>.fasta``,
    with ``str(gene)`` used as the FASTA description.

    Args:
        n_genes: Number of ``Gene`` objects to create.
        n_bases: Value forwarded to ``Gene(...)`` for every gene.
        filepath: Root output directory; a ``gen0`` subdirectory is created
            inside it (including any missing parents).

    Returns:
        The list of newly created ``Gene`` objects, in index order.
    """
    # Derive the directory from `filepath` rather than a hard-coded
    # 'evolution/gen0/', so a non-default root writes where it creates.
    gen_dir = os.path.join(filepath, 'gen0')
    # makedirs also creates a missing parent, which os.mkdir would not.
    os.makedirs(gen_dir, exist_ok=True)

    genes = [Gene(n_bases) for _ in range(n_genes)]
    for i, gene in enumerate(genes):
        ident = f'gen0_gene{i}'
        save_protein(
            gene.get_protein(),
            ident,
            str(gene),
            os.path.join(gen_dir, f'{ident}.fasta'),
        )
    return genes

In [24]:
# Keyword arguments later unpacked into colabfold.batch.run() by fold().
# The `#@param` trailers are Colab form annotations carried over from the
# ColabFold notebook; they list the values that notebook offers.
colabfold_params = {
    'msa_mode': "single_sequence", #@param ["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)","single_sequence","custom"]
    'num_models': 1, #@param [1,2,3,4,5] {type:"raw"}
    'num_recycles': 3, #@param [1,3,6,12,24,48] {type:"raw"}
    'stop_at_score': 100, #@param {type:"string"}
    'use_custom_msa': False,
    'num_relax': 0, #@param [0, 1, 5] {type:"raw"}
    'use_amber': False,
    'relax_max_iterations': 200, #@param [0,200,2000] {type:"raw"}
    'use_templates': False, #@param {type:"boolean"}
    # Listed once only; the original dict repeated this key further down.
    'keep_existing_results': True, #@param {type:"boolean"}
    'zip_results': False, #@param {type:"boolean"}
    'model_type': "auto",
    'model_order': [1, 2, 3, 4, 5],
    'data_dir': default_data_dir,
    'rank_by': "auto",
    'pair_mode': "unpaired+paired",
    'user_agent': "colabfold/google-colab-batch",
}

In [None]:
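# This snippet comes from the ColabFold Jupyter notebook. We are not using
# Amber relaxation, so it is not needed at the moment and is kept only for
# reference (dead code, like an intron in real life).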

# For some reason we need that to get pdbfixer to import
#if use_amber and f"/usr/local/lib/python{python_version}/site-packages/" not in sys.path:
#    sys.path.insert(0, f"/usr/local/lib/python{python_version}/site-packages/")

In [26]:
def fold(input_dir, result_dir, kwargs):
    """Run ColabFold on the queries found in `input_dir`.

    Args:
        input_dir: Path handed to ``get_queries`` to discover the queries.
        result_dir: Output directory; ``log.txt`` inside it is passed to
            ``setup_logging`` and the directory is forwarded to ``run``.
        kwargs: Dict of extra keyword arguments for ``colabfold.batch.run``.
            It is copied, so the caller's dict is left untouched.

    Returns:
        Whatever ``colabfold.batch.run`` returns.

    NOTE(review): the captured log shows gen0_gene7 folded before
    gen0_gene0, so the order of the results presumably does not follow the
    gene index. Confirm before pairing results with a gene list by position.
    """
    # Copy first: the caller typically passes the shared colabfold_params
    # dict, and injecting per-run keys into it would leak stale queries and
    # directories into later runs.
    run_kwargs = dict(kwargs)

    setup_logging(Path(result_dir).joinpath("log.txt"))
    run_kwargs['queries'], run_kwargs['is_complex'] = get_queries(input_dir)
    run_kwargs['input_dir'], run_kwargs['result_dir'] = input_dir, result_dir
    return run(**run_kwargs)

In [None]:
# Fold the generation-0 proteins and keep the returned results for scoring.
# Assumes 'evolution/gen0' has already been populated with FASTA files
# (e.g. by first_generation with its default filepath); output goes to 'folds'.
results = fold('evolution/gen0', 'folds', colabfold_params)

2024-11-28 08:30:49,054 Running on GPU
2024-11-28 08:30:49,062 Found 2 citations for tools or databases
2024-11-28 08:30:49,063 Query 1/10: gen0_gene7 (length 2)
2024-11-28 08:30:52,975 Padding length to 3
2024-11-28 08:30:59,128 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=19.7 pTM=0.00149
2024-11-28 08:30:59,188 alphafold2_ptm_model_1_seed_000 recycle=1 pLDDT=28.8 pTM=0.000749 tol=31.5
2024-11-28 08:30:59,249 alphafold2_ptm_model_1_seed_000 recycle=2 pLDDT=46.7 pTM=0.00176 tol=4.7
2024-11-28 08:30:59,302 alphafold2_ptm_model_1_seed_000 recycle=3 pLDDT=50.4 pTM=0.00188 tol=22.5
2024-11-28 08:30:59,302 alphafold2_ptm_model_1_seed_000 took 6.3s (3 recycles)
2024-11-28 08:30:59,304 reranking models by 'plddt' metric
2024-11-28 08:30:59,304 rank_001_alphafold2_ptm_model_1_seed_000 pLDDT=50.4 pTM=0.00188
2024-11-28 08:30:59,614 Query 2/10: gen0_gene0 (length 3)
2024-11-28 08:31:00,794 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=72.9 pTM=0.117
2024-11-28 08:31:00,845 alphafold2_ptm_m

In [32]:
def calculate_fitnesses(results):
    """Turn fold metrics into one fitness score per entry.

    The score is the equal-weight average of pLDDT rescaled to 0-1 and pTM:
    ``(mean_plddt / 100) * 0.5 + ptm * 0.5``.

    Args:
        results: Mapping with a ``'metric'`` entry. Each item of
            ``results['metric']`` is a sequence whose first element is a
            mapping with ``'mean_plddt'`` and ``'ptm'`` keys (presumably the
            top-ranked model for that query — confirm against ColabFold).

    Returns:
        A list of floats, one per item in ``results['metric']``, in the same
        order. Empty if there are no metrics.
    """
    # Must be a list: the original initialised this to 0 and then called
    # .append on it, which raises AttributeError on the first entry.
    fitnesses = []
    for metric in results['metric']:
        mean_plddt, ptm = metric[0]['mean_plddt'], metric[0]['ptm']
        fitness = (mean_plddt / 100) * 0.5 + ptm * 0.5
        fitnesses.append(fitness)
    return fitnesses

In [33]:
def select(genes, fitnesses, n, method='weak'):
    """Pick `n` genes from the population according to `method`.

    Args:
        genes: Candidate population.
        fitnesses: One weight per gene, aligned by position with `genes`.
        n: Number of genes to draw.
        method: Selection strategy. Only ``'weak'`` is supported: a
            fitness-weighted random draw with replacement.

    Returns:
        A list of `n` genes for ``'weak'``; ``None`` (after printing a
        notice) for any other method.
    """
    # Guard clause: anything other than 'weak' is not available yet.
    if method != 'weak':
        print('not implemented hehe :3')
        return None
    return random.choices(genes, weights=fitnesses, k=n)

## End goals
* A list of mutated genes along with their predicted folds
    * A FASTA file with its associated PDB file
* Metadata mapping each gene sequence to: [protein FASTA file, folded PDB file, fitness, generation, parent(s)]