<a href="https://colab.research.google.com/github/jtrinquier/SoftAlign/blob/main/Foldseek_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook automates the discovery of distant structural relatives through a three-stage pipeline:

    Search & Filter: Queries the AlphaFold Database via Foldseek to identify initial candidates.

    Reconstruct: Converts Cα traces into high-fidelity all-atom backbones using Pulchra.

    Align: Employs SoftAlign deep learning to perform geometric alignment and rerank hits.

In [1]:
# @title Libraries
%%capture
# NOTE: %%capture hides everything this cell prints, including pip/git error
# messages, so a failed install only surfaces later as an ImportError.
# Install the third-party packages used by the notebook.
! pip install py3Dmol
# NOTE(review): `Bio` is not imported anywhere in this notebook; presumably it
# is needed by the SoftAlign modules imported below — TODO confirm.
! pip install Bio
import jax
# dm-haiku is installed from GitHub right before its first import.
!  pip install git+https://github.com/deepmind/dm-haiku
import haiku as hk
import jax.numpy as jnp
from jax import vmap
import numpy as np
import time
import numpy as np
import time
import os
# Fetch the SoftAlign repository into the current working directory.
! git clone https://github.com/jtrinquier/SoftAlign.git
import sys
# Move the `pulchra` file shipped in the repo into the working directory.
# After this, ./SoftAlign/softalign/pulchra no longer exists.
! mv ./SoftAlign/softalign/pulchra ./
softalign_path = os.path.join(os.getcwd(), 'SoftAlign')

# Add SoftAlign directory to sys.path if it's not already there
if softalign_path not in sys.path:
    sys.path.append(softalign_path)
# Also expose SoftAlign/softalign/colab; presumably this is where the
# ENCODING, utils, search and Input_MPNN modules imported below live — TODO confirm.
softalign_code_path = os.path.join(softalign_path, 'softalign/colab')
if softalign_code_path not in sys.path:
    sys.path.append(softalign_code_path)


import os, io, time, gzip, tarfile, shutil
import pandas as pd
import requests
from google.colab import files

import os, sys, json, shutil, glob, pickle
import numpy as np
import jax, jax.numpy as jnp
import haiku as hk


# Copy the Foldseek client module next to the notebook (when the repo provides
# it) so that the plain `from foldseek_client import ...` below can resolve it.
if os.path.exists("./SoftAlign/softalign/foldseek_client.py"):
    shutil.copy("./SoftAlign/softalign/foldseek_client.py", "./foldseek_client.py")

from foldseek_client import FoldseekClient

import os, json, subprocess, shutil, stat
import numpy as np
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor


# Project-local modules; these imports rely on the sys.path entries added above.
import ENCODING as enco
import utils
import search
import Input_MPNN as inp

In [2]:
# @title 1. Setup & Configuration
import os, shutil
from google.colab import files

# --- CONFIG ---
# Working directories: the uploaded query goes to QUERY_DIR, rebuilt all-atom
# hit structures go to PDB_OUT.
BASE_DIR = "/content"
QUERY_DIR = f"{BASE_DIR}/query"
PDB_OUT = f"{BASE_DIR}/rebuilt_pdbs"

for d in [QUERY_DIR, PDB_OUT]:
    os.makedirs(d, exist_ok=True)

# --- USER INPUTS ---
print("üìÇ Upload your Query PDB file:")
uploaded = files.upload()
# An empty upload would otherwise fail below with an unhelpful IndexError.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select a query PDB file.")
# Only the first uploaded file is used as the query.
query_name = list(uploaded.keys())[0]
# files.upload() saved the file into the working directory; move it to QUERY_DIR.
shutil.move(query_name, f"{QUERY_DIR}/{query_name}")

db_choice = "afdb50" # @param ["afdb50", "afdb-swissprot", "afdb-proteome"]
model_type = "Softmax" # @param ["Softmax", "Smith-Waterman"]

print(f"\n‚úÖ Ready to process {query_name}")

üìÇ Upload your Query PDB file:


Saving AF-A0A058ZRF3-F1-model_v6.pdb to AF-A0A058ZRF3-F1-model_v6.pdb

‚úÖ Ready to process AF-A0A058ZRF3-F1-model_v6.pdb


In [3]:
# @title 2. Run Search & Rebuilding (Silent)
%%capture
import foldseek_client
# --- 1. FOLDSEEK SEARCH ---
# Submit the uploaded query against the selected database and wait for the job.
client = foldseek_client.FoldseekClient()
job = client.submit_search(f"{QUERY_DIR}/{query_name}", [db_choice])
job_data = client.wait_for_job(job.get("id") or job.get("ticket"))

# Download and Decompress Results
url = f"https://search.foldseek.com/api/result/download/{job_data['id']}"
# A timeout prevents the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=300)
# Fail here on an HTTP error instead of trying to open an error page as a tar archive.
r.raise_for_status()
data = r.content
# 0x1f 0x8b is the gzip magic number: unwrap the archive if it is still compressed.
if data.startswith(b"\x1f\x8b"):
    data = gzip.decompress(data)

with tarfile.open(fileobj=io.BytesIO(data), mode="r:*") as tar:
    # Pick the first .m8 hit table, skipping any member whose name contains "report".
    m8_member = next(
        (m for m in tar.getmembers() if m.name.endswith(".m8") and "report" not in m.name),
        None,
    )
    if m8_member is None:
        raise RuntimeError("No .m8 result table found in the Foldseek result archive.")
    df = pd.read_csv(io.StringIO(tar.extractfile(m8_member).read().decode()), sep="\t", header=None)

# --- 2. PULCHRA SETUP ---
PULCHRA_BIN = "/content/pulchra"
# Use the copy from the repository clone when it is still there.
if os.path.exists("/content/SoftAlign/softalign/pulchra"):
    shutil.copy("/content/SoftAlign/softalign/pulchra", PULCHRA_BIN)
if not os.path.exists(PULCHRA_BIN):
    raise FileNotFoundError(f"Pulchra binary not found at {PULCHRA_BIN}; re-run the Libraries cell.")
# Make sure the binary is executable by its owner.
os.chmod(PULCHRA_BIN, os.stat(PULCHRA_BIN).st_mode | stat.S_IEXEC)

def rebuild_single_protein(row_data):
    """Rebuild one hit from its CA trace with Pulchra.

    Args:
        row_data: ``(row_idx, row)`` pair as produced by ``DataFrame.iterrows()``.
            Column 1 of the row is used to name the hit and column 17 is read
            as a comma-separated list of CA coordinates (x, y, z triplets).

    Returns:
        The file name of the rebuilt PDB inside ``PDB_OUT``, or ``None`` when
        the row has no coordinate string or the rebuild failed. Failures are
        deliberately non-fatal so a single bad hit does not abort the batch.
    """
    row_idx, row = row_data
    val, coords_str = row.iloc[1], row.iloc[17]
    if not isinstance(coords_str, str):
        return None

    identifier = val.split()[0] if isinstance(val, str) and val.startswith("AF-") else f"row_{row_idx}"
    ca_name = f"{identifier}_{row_idx}.pdb"
    ca_path = f"/dev/shm/{ca_name}"
    rebuilt_src = ca_path.replace(".pdb", ".rebuilt.pdb")

    try:
        # Fast CA-only write: every residue is written as ALA on chain A.
        coords = np.fromstring(coords_str, sep=',').reshape(-1, 3)
        with open(ca_path, "w") as f:
            for i, c in enumerate(coords, 1):
                f.write(f"ATOM  {i:5d}  CA  ALA A{i:4d}    {c[0]:8.3f}{c[1]:8.3f}{c[2]:8.3f}  1.00  0.00           C\n")
            f.write("END\n")

        # Run Pulchra inside /dev/shm so its output lands next to the input.
        subprocess.run([PULCHRA_BIN, ca_name], cwd="/dev/shm", stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        if not os.path.exists(rebuilt_src):
            return None
        final_name = ca_name  # Keep name consistent for softalign
        shutil.move(rebuilt_src, f"{PDB_OUT}/{final_name}")
        return final_name
    except Exception:
        # Best effort: skip this hit, but let KeyboardInterrupt/SystemExit through.
        return None
    finally:
        # Always clear temporary files, so failed rebuilds do not pile up in /dev/shm.
        for leftover in (ca_path, rebuilt_src):
            try:
                if os.path.exists(leftover):
                    os.remove(leftover)
            except OSError:
                pass

# --- 3. EXECUTE REBUILD ---
# Rebuild every hit in parallel; entries are None for hits that could not be rebuilt.
rows_list = list(df.iterrows())
with ProcessPoolExecutor() as executor:
    rebuilt_names = list(tqdm(executor.map(rebuild_single_protein, rows_list), total=len(rows_list), desc="üß¨ Rebuilding"))

# --- 4. PREPARE SOFTALIGN DIRECTORIES ---
DST_DIR = "/content/softalign_pdbs"
SOFTALIGN_PATH = "/content/SoftAlign"
os.makedirs(DST_DIR, exist_ok=True)

# Ensure query is included and moved to the softalign work dir
shutil.copy(f"{QUERY_DIR}/{query_name}", f"{DST_DIR}/{query_name}")
# The query is listed first, followed by every successfully rebuilt hit.
clean_ids = [query_name]

for r_name in rebuilt_names:
    if r_name:
        shutil.copy(f"{PDB_OUT}/{r_name}", f"{DST_DIR}/{r_name}")
        clean_ids.append(r_name)

# --- 5. SOFTALIGN INPUT LOADING ---
# Parse each structure with the MPNN input parser; files that yield a falsy
# result are skipped.
dicti_inputs = {}
for f in clean_ids:
    out = inp.get_inputs_mpnn(f"{DST_DIR}/{f}")
    if out:
        dicti_inputs[f] = out

# --- 6. SOFTALIGN CORE (UNTOUCHED) ---
ckpt = "CONT_SFT_06_T_3_1" if model_type == "Softmax" else "CONT_SW_05_T_3_1"
# NOTE: pickle can execute arbitrary code on load; only load checkpoints that
# come from the trusted SoftAlign clone. The file handle is closed explicitly.
with open(f"{SOFTALIGN_PATH}/models/{ckpt}", "rb") as ckpt_file:
    params = pickle.load(ckpt_file)
print(f"‚úÖ Loaded checkpoint: {ckpt}")

def enco_fn(x):
    """Build the SoftAlign encoder module and apply it to `x`."""
    encoder = enco.ENCODING(64, 64, 64, 3, 64)
    return encoder(x)


# Haiku-transformed encoder: exposes `apply(params, rng, x)`.
ENCO = hk.transform(enco_fn)


@jax.jit
def enco_fast(p, k, x):
    """JIT-compiled forward pass of the encoder with parameters `p` and PRNG key `k`."""
    encoded = ENCO.apply(p, k, x)
    return encoded

# Encode
# Without this check, a query that failed to parse would be silently replaced
# by the first hit, which would then be scored as if it were the query.
if query_name not in dicti_inputs:
    raise RuntimeError(f"Query structure {query_name} could not be parsed; cannot run SoftAlign.")

key = jax.random.PRNGKey(0)
ids_sa, encs, lengths = [], [], []
# Longest structure (axis 1 of the first parsed array); everything is padded to it.
max_len = max(v[0].shape[1] for v in dicti_inputs.values())

for k,(X,m,c,r) in dicti_inputs.items():
    # pad_ takes two sets of inputs; the same structure is passed twice and only
    # the second padded set is used — NOTE(review): confirm against utils.pad_.
    _,_,_,_,Xp,mp,rp,cp,_ = utils.pad_([X[0]],[m[0]],[r[0]],[c[0]],
                                      [X[0]],[m[0]],[r[0]],[c[0]],
                                      max_len)
    enc = enco_fast(params, key, (Xp,mp,rp,cp))[0]
    ids_sa.append(k)
    # Drop the padded positions so each encoding has the structure's true length.
    encs.append(enc[:X.shape[1]])
    lengths.append(X.shape[1])

dicti_encodings = dict(zip(ids_sa, encs))

# Search
thresholds = np.arange(100, max_len+100, 100)
target_data = search.setup_target_data(dicti_encodings, dicti_inputs, thresholds)

# Name the query explicitly instead of relying on it being the first parsed entry.
query_id = query_name
search.compute_scores_for_query(
    query_id=query_id,
    target_data=target_data,
    model_type=model_type,
    l_query_pad=dicti_encodings[query_id].shape[0]
)

print(f"‚úÖ SoftAlign done with model = {model_type}")

In [4]:
# @title 3. Visualize Results (Query vs. Top Hit)
import py3Dmol
import pandas as pd
import os
import glob

# --- 1. DYNAMICALLY FIND SCORES ---
# Read the score table named after the query from /content/output.
csv_filename = f"scores_sorted_{query_name}.csv"
scores_csv = os.path.join("/content/output", csv_filename)

if not os.path.exists(scores_csv):
    print(f"‚ùå Target CSV not found: {scores_csv}")
else:
    res_df = pd.read_csv(scores_csv)
    # Give the first two columns readable names; any extra columns keep theirs.
    if res_df.shape[1] >= 2:
        res_df.columns = ["PDB ID", "SoftAlign Score"] + list(res_df.columns[2:])

    print(f"üèÜ Top Structural Matches for {query_name}:")
    display(res_df.head(10))

    # --- 2. SMART PDB LOCATOR (Checks Query and Rebuilt folders) ---
    def find_pdb_smart(pdb_id):
        """Return the path of the PDB file matching `pdb_id`, or None.

        Each search directory is checked for the exact name, then the name
        with a `.pdb` suffix, and finally any `.pdb` file starting with the
        extension-stripped name.
        """
        pdb_id = str(pdb_id)
        base = pdb_id.replace(".pdb", "").replace(".rebuilt", "")

        # Search in both possible locations
        search_dirs = ["/content/query", "/content/rebuilt_pdbs"]

        for d in search_dirs:
            # Exact names are tested directly, so glob metacharacters in a file
            # name cannot be misread as a pattern.
            for candidate in (os.path.join(d, pdb_id), os.path.join(d, f"{pdb_id}.pdb")):
                if os.path.exists(candidate):
                    return candidate
            # Prefix fallback: escape the literal parts and sort the matches so
            # the chosen file does not depend on directory listing order.
            pattern = os.path.join(glob.escape(d), f"{glob.escape(base)}*.pdb")
            matches = sorted(glob.glob(pattern))
            if matches:
                return matches[0]
        return None

    # --- 3. 3D VISUALIZATION ---
    # Row 0 is treated as the query and row 1 as the best hit.
    if len(res_df) > 1:
        query_id = res_df.iloc[0]["PDB ID"]
        hit_id = res_df.iloc[1]["PDB ID"]

        q_path = find_pdb_smart(query_id)
        h_path = find_pdb_smart(hit_id)

        if q_path and h_path:
            print(f"‚úÖ Found Query at: {q_path}")
            print(f"‚úÖ Found Hit at: {h_path}")

            view = py3Dmol.view(width=800, height=600)

            # Load Query (Cyan)
            with open(q_path, 'r') as f:
                view.addModel(f.read(), 'pdb')
            view.setStyle({'model': 0}, {'cartoon': {'color': 'cyan'}})

            # Load Top Hit (Magenta)
            with open(h_path, 'r') as f:
                view.addModel(f.read(), 'pdb')
            view.setStyle({'model': 1}, {'cartoon': {'color': 'magenta', 'opacity': 0.8}})

            view.zoomTo()
            view.show()
        else:
            if not q_path: print(f"‚ùå Still could not locate Query: {query_id}")
            if not h_path: print(f"‚ùå Still could not locate Hit: {hit_id}")
    else:
        # Tell the user why no viewer appears instead of finishing silently.
        print("No hits besides the query were found; nothing to visualize.")

üèÜ Top Structural Matches for AF-A0A058ZRF3-F1-model_v6.pdb:


Unnamed: 0,PDB ID,SoftAlign Score
0,AF-A0A058ZRF3-F1-model_v6.pdb,1.0
1,AF-A0A4T0FCH3-F1-model_v6_267.pdb,0.957755
2,AF-A0A671MT66-F1-model_v6_233.pdb,0.957023
3,AF-A0A8B9X9G0-F1-model_v6_232.pdb,0.95556
4,AF-A0A3B5AV90-F1-model_v6_134.pdb,0.955011
5,AF-A0A0U5KT64-F1-model_v6_2.pdb,0.954279
6,AF-A0A3B4F831-F1-model_v6_122.pdb,0.953545
7,AF-A0A2V0PB33-F1-model_v6_79.pdb,0.953365
8,AF-A0A812CD68-F1-model_v6_246.pdb,0.95334
9,AF-A0A498SAU7-F1-model_v6_324.pdb,0.952999


‚úÖ Found Query at: /content/query/AF-A0A058ZRF3-F1-model_v6.pdb
‚úÖ Found Hit at: /content/rebuilt_pdbs/AF-A0A4T0FCH3-F1-model_v6_267.pdb
