In [1]:
from Bio.Seq import Seq, MutableSeq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import random

In [94]:
def generate_gene(length):
    """Build a random DNA sequence.

    Args:
        length: Number of bases to generate.

    Returns:
        A ``Seq`` of ``length`` bases, each picked with ``random.choice``
        from A, T, C and G.
    """
    alphabet = ('A', 'T', 'C', 'G')
    drawn = []
    for _ in range(length):
        drawn.append(random.choice(alphabet))
    return Seq(''.join(drawn))

In [59]:
def point_insertion(gene):
    """Insert one random base at a random position.

    Args:
        gene: Sequence to mutate; the edit is made on a ``MutableSeq``
            built from it.

    Returns:
        A ``Seq`` one base longer than ``gene``.
    """
    editable = MutableSeq(gene)
    inserted_base = random.choice(('A', 'T', 'C', 'G'))
    # randint is inclusive on both ends, so index len(gene) -- i.e. appending
    # after the last base -- is a possible outcome.
    insert_at = random.randint(0, len(gene))
    editable.insert(insert_at, inserted_base)
    return Seq(editable)

In [103]:
def point_deletion(gene):
    """Remove one randomly chosen base.

    Args:
        gene: Sequence to mutate.

    Returns:
        A ``Seq`` one base shorter than ``gene``. Input with fewer than two
        bases is returned exactly as passed in.
    """
    size = len(gene)
    if size <= 1:
        return gene
    editable = MutableSeq(gene)
    drop_at = random.randint(0, size - 1)
    del editable[drop_at]
    return Seq(editable)

In [61]:
def point_mutation(gene):
    """Substitute one randomly chosen base with a different base.

    Args:
        gene: Sequence to mutate; the edit is made on a ``MutableSeq``
            built from it.

    Returns:
        A ``Seq`` of the same length as ``gene`` in which one position holds
        a base (A, T, C or G) different from the original one. An empty
        ``gene`` is returned unchanged (as a ``Seq``).
    """
    if len(gene) == 0:
        # Nothing to substitute; randint(0, -1) would raise ValueError.
        return Seq(gene)
    new_gene = MutableSeq(gene)
    pos = random.randint(0, len(gene) - 1)
    old_base = gene[pos]
    # Filter instead of list.remove(): remove() raises ValueError when the
    # current base is not one of the four uppercase letters (e.g. an
    # ambiguous 'N' or a lowercase base). Comparing case-insensitively also
    # keeps 'a' from being "mutated" into 'A'.
    base_choices = [
        base for base in ('A', 'T', 'C', 'G')
        if base != str(old_base).upper()
    ]
    new_base = random.choice(base_choices)
    new_gene[pos] = new_base
    return Seq(new_gene)

In [125]:
def partial_deletion(gene):
    """Delete a random contiguous segment, always leaving at least one base.

    Args:
        gene: Sequence to mutate; the edit is made on a ``MutableSeq``
            built from it.

    Returns:
        A ``Seq`` equal to ``gene`` with the half-open range ``[start, end)``
        removed. Input with fewer than two bases is returned unchanged
        (as a ``Seq``), because no partial deletion is possible.
    """
    if len(gene) <= 1:
        return Seq(gene)
    new_gene = MutableSeq(gene)
    start = random.randint(0, len(gene) - 1)
    # A segment starting at 0 must stop before the last base, otherwise the
    # whole gene would be deleted and the result would be empty.
    max_end = len(gene) if start > 0 else len(gene) - 1
    end = random.randint(start + 1, max_end)
    # Keep the flanks and drop the segment. The previous code returned
    # new_gene[start:end], i.e. kept the segment and discarded the rest.
    new_gene = new_gene[:start] + new_gene[end:]
    return Seq(new_gene)

In [223]:
def random_insertion(gene):
    """Insert a freshly generated random segment at a random position.

    Args:
        gene: Sequence to mutate. Must be non-empty: the segment length is
            drawn with ``randint(1, len(gene))``.

    Returns:
        A ``Seq`` made of ``gene`` with a random segment of 1 to
        ``len(gene)`` bases spliced in.
    """
    host = MutableSeq(gene)
    # Inclusive upper bound: the segment may be appended after the last base.
    insert_at = random.randint(0, len(gene))
    insert_len = random.randint(1, len(gene))
    payload = generate_gene(insert_len)
    head = host[:insert_at]
    tail = host[insert_at:]
    return Seq(head + payload + tail)

In [182]:
def partial_duplication(gene):
    """Duplicate a random segment in place (tandem duplication).

    Args:
        gene: Sequence to mutate. Must be non-empty.

    Returns:
        A ``Seq`` in which a copy of the segment starting at a random
        position is inserted directly in front of that segment.
    """
    source = MutableSeq(gene)
    start = random.randint(0, len(gene) - 1)
    span = random.randint(1, len(gene))
    # Slicing clips at the end of the sequence, so the copied segment can be
    # shorter than `span` when start + span runs past the last base.
    copied = source[start:start + span]
    return Seq(source[:start] + copied + source[start:])

In [206]:
def circular_permutation(gene):
    """Rotate the sequence around a random cut point.

    Args:
        gene: Sequence to mutate. Needs at least two bases; for shorter
            input ``randint(1, len(gene) - 1)`` has an empty range and raises
            ``ValueError``.

    Returns:
        A ``Seq`` made of the part after the cut followed by the part
        before it.
    """
    ring = MutableSeq(gene)
    # The cut is never 0 or len(gene), so the result always differs in
    # arrangement from a plain copy (both pieces are non-empty).
    cut = random.randint(1, len(gene) - 1)
    after_cut = ring[cut:]
    before_cut = ring[:cut]
    return Seq(after_cut + before_cut)

In [66]:
def full_duplication(gene):
    """Return the sequence concatenated with itself.

    Args:
        gene: Sequence to duplicate.

    Returns:
        A ``Seq`` twice as long as ``gene``.
    """
    single_copy = MutableSeq(gene)
    return Seq(single_copy + single_copy)

In [67]:
def to_stop(gene):
    """Truncate a sequence at its first ``*`` character.

    Args:
        gene: Sequence to truncate.

    Returns:
        A ``Seq`` holding everything before the first ``*``; the whole
        sequence when it contains no ``*``.
    """
    working = MutableSeq(gene)
    first_stop = working.find('*')
    if first_stop == -1:
        # No '*' present: return a copy of the full sequence.
        return Seq(working)
    return Seq(working[:first_stop])

In [224]:
# Mutation operators to apply, in the order they are run.
# Each one takes a sequence and returns a mutated copy.
all_mutations = [
    # single-base edits
    point_insertion, point_deletion, point_mutation,
    # segment-level edits
    partial_deletion, random_insertion, partial_duplication,
    # whole-gene rearrangements
    circular_permutation, full_duplication,
]

In [239]:
# Ten independent mutants per operator, keyed by the operator's function name.
# NOTE(review): `gene` is not assigned in any visible cell -- confirm it is
# defined before this cell runs.
new_genes = {}
for mutation in all_mutations:
    mutants = [mutation(gene) for _ in range(10)]
    new_genes[mutation.__name__] = mutants

In [None]:
# Apply a single point mutation and list the protein positions whose residue
# differs between the original and the mutated translation.
# NOTE(review): `gene` is not assigned in any visible cell -- confirm it is
# defined before this cell runs.
new_gene = point_mutation(gene)  # was `point_mutate`, which is not defined
protein = gene.translate(stop_symbol="")
new_protein = new_gene.translate(stop_symbol="")
# With stop_symbol="" a stop codon contributes no character, so a mutation
# that creates or removes a stop codon changes the protein length. zip() stops
# at the shorter protein instead of raising IndexError; positions after such
# a change are compared with a one-residue offset.
[
    i
    for i, (old_residue, new_residue) in enumerate(zip(protein, new_protein))
    if old_residue != new_residue
]

In [None]:
# Wrap both translations as records and write each to its own FASTA file
# under data/.
orig_record = SeqRecord(
    seq=protein,
    id='original_lacz_protein',
    description='original lacz protein',
)
mutated_record = SeqRecord(
    seq=new_protein,
    id='mutated_lacz_protein',
    description='mutated lacz protein',
)

SeqIO.write(orig_record, 'data/original_lacz_protein.fasta', format='fasta')
SeqIO.write(mutated_record, 'data/mutated_lacz_protein.fasta', format='fasta')

1

In [None]:
# Settings for the ColabFold batch run below. The trailing `#@param` and the
# `#@markdown` lines are Colab form annotations; keep them attached to their
# assignments.

# input_dir is passed to get_queries(); result_dir receives the log file and
# is passed to run() as the output directory.
input_dir = 'data' #@param {type:"string"}
result_dir = 'output' #@param {type:"string"}

#@markdown ---
#@markdown ### Advanced settings
msa_mode = "single_sequence" #@param ["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)","single_sequence","custom"]
# Number of models to use.
num_models = 1 #@param [1,2,3,4,5] {type:"raw"}
num_recycles = 3 #@param [1,3,6,12,24,48] {type:"raw"}
stop_at_score = 100 #@param {type:"string"}
#@markdown - early stop computing models once score > threshold (avg. plddt for "structures" and ptmscore for "complexes")
# Not referenced by any of the visible cells.
use_custom_msa = False
num_relax = 0 #@param [0, 1, 5] {type:"raw"}
# Derived flag: true whenever at least one relaxation is requested.
use_amber = num_relax > 0
relax_max_iterations = 200 #@param [0,200,2000] {type:"raw"}
use_templates = False #@param {type:"boolean"}
# Forwarded to run() as keep_existing_results.
do_not_overwrite_results = True #@param {type:"boolean"}
zip_results = False #@param {type:"boolean"}


In [None]:
import sys

from colabfold.batch import get_queries, run
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path

# Per the original note, the system site-packages directory has to be on
# sys.path for pdbfixer to import; this only matters when use_amber is set.
# `python_version` is not defined in any visible cell, so it is derived from
# the running interpreter here -- otherwise enabling relaxation
# (num_relax > 0) raises NameError on the line below.
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
site_packages_dir = f"/usr/local/lib/python{python_version}/site-packages/"
if use_amber and site_packages_dir not in sys.path:
    sys.path.insert(0, site_packages_dir)

# Write the run log to <result_dir>/log.txt.
setup_logging(Path(result_dir).joinpath("log.txt"))

In [None]:
# Collect the input sequences from input_dir, then launch the prediction
# with the settings chosen in the configuration cell.
queries, is_complex = get_queries(input_dir)

run_options = dict(
    queries=queries,
    result_dir=result_dir,
    use_templates=use_templates,
    num_relax=num_relax,
    relax_max_iterations=relax_max_iterations,
    msa_mode=msa_mode,
    model_type="auto",
    num_models=num_models,
    num_recycles=num_recycles,
    model_order=[1, 2, 3, 4, 5],
    is_complex=is_complex,
    data_dir=default_data_dir,
    keep_existing_results=do_not_overwrite_results,
    rank_by="auto",
    pair_mode="unpaired+paired",
    stop_at_score=stop_at_score,
    zip_results=zip_results,
    user_agent="colabfold/google-colab-batch",
)
run(**run_options)