In [7]:
import sys
import subprocess
from typing import List

sequence = "MGLSDGEWQLVLNVWGKVEADVAGHGQ"
label = "eukaryotic"
input_locs = None
masked_locs = [(0,5), (10,15), 21, 23]
length = len(sequence)
add_bos = True
add_eos = True
output = "samples_cond.fasta"

masked_sequence = sequence
for il in masked_locs:
    if type(il) == tuple:
        masked_sequence = masked_sequence[:il[0]] + "_"*(il[1]-il[0]) + masked_sequence[il[1]:]
    elif type(il) == int:
        masked_sequence = masked_sequence[:il] + "_" + masked_sequence[il+1:]
if add_bos:
    masked_sequence = "[" + masked_sequence
    sequence = "[" + sequence
    length += 1
if add_eos:
    masked_sequence = masked_sequence + "]"
    sequence = sequence + "]"
    length += 1
print("sequence before and after masking:")
print(sequence)
print(masked_sequence)
print("length of sequence after preprocessing:", length)

sequence before and after masking:
[MGLSDGEWQLVLNVWGKVEADVAGHGQ]
[_____GEWQL_____GKVEAD_A_HGQ]
length of sequence after preprocessing: 29


In [8]:
print(masked_sequence)

# preprocess to get the correct input format
input_locs = []
for i, c in enumerate(masked_sequence):
    if c != "_":
        input_locs.append(i)
input_sequence = "".join([masked_sequence[i] for i in input_locs])

print("input_locs:", input_locs)
print("input_sequence:", input_sequence)

[_____GEWQL_____GKVEAD_A_HGQ]
input_locs: [0, 6, 7, 8, 9, 10, 16, 17, 18, 19, 20, 21, 23, 25, 26, 27, 28]
input_sequence: [GEWQLGKVEADAHGQ]


In [9]:
from sample_cond import sample_conditional
from transformers import PreTrainedTokenizerFast
# run the axolotl model

tokenizer = PreTrainedTokenizerFast.from_pretrained('/home/kkj/axolotl/tokenizer/tokenizer_absorb')


sample_conditional(
    model_path = "/home/kkj/axolotl/pretrained/IPR036736_90_grouped/2025.01.29/143108",
    tokenizer = tokenizer,
    input = input_sequence,
    input_locations = input_locs,
    length = length,
    batch_size = 1,
    steps = 128,
    cfg_w = 0.75,
    label = "eukaryotic",
    output = output,
)

config.json not found in /home/kkj/axolotl/pretrained/IPR036736_90_grouped/2025.01.29/143108
Sampling: 100%|██████████| 128/128 [00:01<00:00, 69.53it/s]

Writing samples to samples_cond.fasta





In [None]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
import os
import warnings

# make a temp folder to store the input for boltz-1
# this is because boltz-1 requires the input to be in separate files
# convert to valid fasta format

folding_model = "chai-1" # one of "boltz-1" and "chai-1"
temp_folder = "temp"
os.makedirs(temp_folder, exist_ok=True)

for i, record in enumerate(SeqIO.parse(output, "fasta")):

    sequence = str(record.seq)
    id = record.id

    if not ("[" in sequence and "]" in sequence):
        warnings.warn(f"Sequence {i} does not have start and end tokens")

    sequence = sequence.replace("[", "").replace("]", "").replace("?", "X")

    with open(f"{temp_folder}/{i}.fasta", "w") as f:
        if folding_model == "boltz-1":
            f.write(f">{i}|protein|empty\n") # If you wish to explicitly run single sequence mode (which is generally advised against as it will hurt model performance), you may do so by using the special keyword empty for that protein (ex: >A|protein|empty)
        elif folding_model == "chai-1":
            f.write(f">protein|name={id}")
        f.write(str(sequence) + "\n")

# run through boltz-1 to fold
import subprocess
num_workers = 2
output_format = "pdb" # "mmcif" or "pdb"
subprocess.run(["boltz", "predict", temp_folder, "--use_msa_server", "--accelerator", "gpu", "--num_workers", str(num_workers), "--output_format", output_format, "--out_dir", f"{temp_folder}_boltz_output"],
    check=True, capture_output=True)


CalledProcessError: Command '['boltz', 'predict', 'temp', '--use_msa_server', '--accelerator', 'gpu', '--num_workers', '2', '--output_format', 'pdb', '--out_dir', 'temp_boltz_output']' died with <Signals.SIGKILL: 9>.

In [None]:
# visualize using pymol