In [2]:
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import random
import subprocess
import os

### Mutate gene and check for missense

In [25]:
def point_mutate(gene, rng=None):
    """Return a copy of ``gene`` with exactly one base substituted at random.

    A position is drawn uniformly from the sequence, and the base found there
    is replaced by a different base drawn uniformly from A/T/C/G.

    Args:
        gene: Nucleotide sequence. It must support ``len()`` and integer
            indexing, and be accepted by ``MutableSeq``.
        rng: Optional source of randomness exposing ``randint`` and ``choice``
            (for example a seeded ``random.Random``). Defaults to the
            module-level ``random`` functions, as before.

    Returns:
        A new ``Seq`` that differs from ``gene`` at one position; ``gene``
        itself is not modified.

    Raises:
        ValueError: If ``gene`` is empty.
    """
    if len(gene) == 0:
        raise ValueError("cannot point-mutate an empty sequence")
    if rng is None:
        rng = random

    new_gene = MutableSeq(gene)
    pos = rng.randint(0, len(gene) - 1)
    old_base = str(gene[pos])

    # Filter rather than list.remove(): remove() raises ValueError when the
    # drawn base is not an upper-case A/T/C/G (soft-masked lower-case FASTA,
    # ambiguity codes such as N). For such bases every canonical base that
    # differs from it is a valid substitution.
    base_choices = [
        base for base in ('A', 'T', 'C', 'G') if base != old_base.upper()
    ]
    new_base = rng.choice(base_choices)

    # Keep the case of the replaced base so soft-masking survives the edit.
    if old_base.islower():
        new_base = new_base.lower()

    new_gene[pos] = new_base
    return Seq(new_gene)

In [26]:
def to_stop(gene):
    """Cut a sequence off just before its first ``'*'`` character.

    Args:
        gene: Sequence to truncate. NOTE(review): ``'*'`` is the default stop
            symbol of translated sequences, so despite the parameter name this
            presumably expects a protein - confirm against the intended use.

    Returns:
        A ``Seq`` holding everything before the first ``'*'``, or the whole
        sequence when it contains no ``'*'``.
    """
    editable = MutableSeq(gene)
    cut = editable.find('*')
    # find() reports -1 when there is no '*' at all: keep everything then.
    kept = editable if cut == -1 else editable[:cut]
    return Seq(kept)

In [27]:
# Read the lacZ nucleotide record from its FASTA file and keep the bare
# sequence in `gene`; the full record stays available as `seq_record`.
seq_record = SeqIO.read(
    'data/lacz_gene.fna',
    format='fasta',
)
gene = seq_record.seq

In [32]:
# Introduce one random substitution. point_mutate() is unseeded here, so the
# position reported by this cell changes from run to run.
new_gene = point_mutate(gene)

# Proteins as they are written to FASTA further down: stop codons contribute
# no character at all.
protein = gene.translate(stop_symbol="")
new_protein = new_gene.translate(stop_symbol="")

# A substitution can also create or destroy a stop codon. With the stop
# symbols dropped, the two proteins then differ in length and an index-based
# comparison either raises IndexError or silently ignores the change. Compare
# translations that keep '*' instead: a substitution never changes the gene
# length, so both strings have one character per codon and stay aligned.
codons_before = str(gene.translate())
codons_after = str(new_gene.translate())

# Codon positions whose translation changed: [] for a silent mutation, one
# index for a missense or nonsense mutation.
[
    pos
    for pos, (aa_before, aa_after) in enumerate(zip(codons_before, codons_after))
    if aa_before != aa_after
]

[887]

### Save proteins for original and mutated gene in fasta format

In [33]:
# Wrap each translated protein in a SeqRecord so it gets a FASTA header
# (id followed by description).
orig_record = SeqRecord(seq=protein, id='original_lacz_protein', description='original lacz protein')
mutated_record = SeqRecord(seq=new_protein, id='mutated_lacz_protein', description='mutated lacz protein')

# One FASTA file per protein. The value displayed under the cell is the
# return value of the last write.
SeqIO.write(orig_record, 'data/original_lacz_protein.fasta', 'fasta')
SeqIO.write(mutated_record, 'data/mutated_lacz_protein.fasta', 'fasta')

1

### Run LocalColabFold

In [34]:
# Settings for the ColabFold run below. The trailing `#@param` / `#@markdown`
# comments are Colab form annotations and are kept exactly as they were.

# input_dir is handed to get_queries(); result_dir is handed to run() and is
# also where log.txt is created. The run log shows that other files in
# input_dir (the .fna gene files) are reported as non-fasta/a3m.
input_dir = 'data' #@param {type:"string"}
result_dir = 'output' #@param {type:"string"}

# number of models to use
#@markdown ---
#@markdown ### Advanced settings
# NOTE(review): "single_sequence" presumably means no MSA search is performed - confirm in the ColabFold docs.
msa_mode = "single_sequence" #@param ["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)","single_sequence","custom"]
num_models = 1 #@param [1,2,3,4,5] {type:"raw"}
num_recycles = 3 #@param [1,3,6,12,24,48] {type:"raw"}
stop_at_score = 100 #@param {type:"string"}
#@markdown - early stop computing models once score > threshold (avg. plddt for "structures" and ptmscore for "complexes")
# use_custom_msa is not referenced by any other cell visible in this notebook.
use_custom_msa = False
num_relax = 0 #@param [0, 1, 5] {type:"raw"}
# Derived flag: True only when at least one relaxation is requested. It is
# used solely to guard the pdbfixer sys.path tweak in a later cell.
use_amber = num_relax > 0
relax_max_iterations = 200 #@param [0,200,2000] {type:"raw"}
use_templates = False #@param {type:"boolean"}
# Forwarded to run() as keep_existing_results; the run log shows a query
# being skipped as "already done" with this enabled.
do_not_overwrite_results = True #@param {type:"boolean"}
zip_results = False #@param {type:"boolean"}


In [35]:
import sys

from colabfold.batch import get_queries, run
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path

# pdbfixer is only needed when Amber relaxation is requested (use_amber), and
# importing it requires the interpreter's site-packages folder to be on
# sys.path. `python_version` was not defined anywhere in this notebook, so
# enabling relaxation raised NameError; derive it from the running interpreter.
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
site_packages_dir = f"/usr/local/lib/python{python_version}/site-packages/"
if use_amber and site_packages_dir not in sys.path:
    sys.path.insert(0, site_packages_dir)

# Send ColabFold's log messages to log.txt inside the result folder.
setup_logging(Path(result_dir).joinpath("log.txt"))

In [36]:
# Collect the prediction jobs found in input_dir, then fold them. The call to
# run() is deliberately the last expression so its return value (the ranking
# and metric summary) is displayed under the cell.
queries, is_complex = get_queries(input_dir)
run(
    # what to fold and where the results go
    queries=queries,
    is_complex=is_complex,
    result_dir=result_dir,
    data_dir=default_data_dir,
    keep_existing_results=do_not_overwrite_results,
    zip_results=zip_results,
    # input features
    msa_mode=msa_mode,
    pair_mode="unpaired+paired",
    use_templates=use_templates,
    # model selection and sampling
    model_type="auto",
    model_order=[1, 2, 3, 4, 5],
    num_models=num_models,
    num_recycles=num_recycles,
    stop_at_score=stop_at_score,
    rank_by="auto",
    # optional relaxation
    num_relax=num_relax,
    relax_max_iterations=relax_max_iterations,
    # client identification
    user_agent="colabfold/google-colab-batch",
)

2024-11-15 17:29:57,850 non-fasta/a3m file in input directory: data/lacz_gene.fna
2024-11-15 17:29:57,850 non-fasta/a3m file in input directory: data/shh.fna
2024-11-15 17:29:57,851 Running on GPU
2024-11-15 17:29:57,854 Found 2 citations for tools or databases
2024-11-15 17:29:57,854 Skipping colabfold_sample (already done)
2024-11-15 17:29:57,855 Query 2/3: mutated_lacz_protein (length 1024)
2024-11-15 17:31:33,741 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=25.7 pTM=0.205
2024-11-15 17:32:49,732 alphafold2_ptm_model_1_seed_000 recycle=1 pLDDT=24.5 pTM=0.221 tol=22.2
2024-11-15 17:34:03,677 alphafold2_ptm_model_1_seed_000 recycle=2 pLDDT=25.1 pTM=0.232 tol=16
2024-11-15 17:35:17,338 alphafold2_ptm_model_1_seed_000 recycle=3 pLDDT=25.7 pTM=0.238 tol=6.6
2024-11-15 17:35:17,351 alphafold2_ptm_model_1_seed_000 took 315.2s (3 recycles)
2024-11-15 17:35:18,319 reranking models by 'plddt' metric
2024-11-15 17:35:18,319 rank_001_alphafold2_ptm_model_1_seed_000 pLDDT=25.7 pTM=0.238
2024-

{'rank': [['rank_001_alphafold2_ptm_model_1_seed_000'],
  ['rank_001_alphafold2_ptm_model_1_seed_000']],
 'metric': [[{'mean_plddt': 25.71875,
    'ptm': 0.2376708984375,
    'print_line': ' pLDDT=25.7 pTM=0.238'}],
  [{'mean_plddt': 25.6875,
    'ptm': 0.236572265625,
    'print_line': ' pLDDT=25.7 pTM=0.237'}]]}