In [None]:
#@title Install dependencies
%%bash -s $use_amber $use_templates $python_version

# Prepares the runtime: installs ColabFold once, downloads the model weights,
# and optionally installs conda plus the template-search / Amber tool chains.
# Each stage leaves a *_READY marker file so it is skipped on later runs.
#
# NOTE(review): use_amber / use_templates are assigned in a later cell of this
# notebook and python_version is not assigned in any visible cell -- confirm
# these names exist in the kernel before this cell is executed.

set -e

# Positional arguments forwarded by the %%bash line above. They are quoted so
# that an empty or space-containing value cannot break the tests further down.
USE_AMBER="$1"
USE_TEMPLATES="$2"
PYTHON_VERSION="$3"

if [ ! -f COLABFOLD_READY ]; then
  # install dependencies
  # We have to use "--no-warn-conflicts" because colab already has a lot preinstalled with requirements different to ours
  pip install -q --no-warn-conflicts "colabfold[alphafold-minus-jax] @ git+https://github.com/sokrypton/ColabFold"
  if [ -n "${TPU_NAME}" ]; then
    pip install -q --no-warn-conflicts -U dm-haiku==0.0.10 jax==0.3.25
  fi
  # -f/-n replace a link left behind by an interrupted earlier run; a plain
  # `ln -s` would fail on it and, under `set -e`, abort before the marker is written.
  ln -sfn /usr/local/lib/python3.*/dist-packages/colabfold colabfold
  ln -sfn /usr/local/lib/python3.*/dist-packages/alphafold alphafold
  touch COLABFOLD_READY
fi

# Download params (~1min)
python -m colabfold.download

# setup conda (only needed when relaxation or template search is requested)
if [ "${USE_AMBER}" == "True" ] || [ "${USE_TEMPLATES}" == "True" ]; then
  if [ ! -f CONDA_READY ]; then
    wget -qnc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
    # `2>&1 1>/dev/null`: stderr goes to the cell output, stdout is discarded
    bash Miniconda3-latest-Linux-x86_64.sh -bfp /usr/local 2>&1 1>/dev/null
    rm Miniconda3-latest-Linux-x86_64.sh
    conda config --set auto_update_conda false
    touch CONDA_READY
  fi
fi
# setup template search
if [ "${USE_TEMPLATES}" == "True" ] && [ ! -f HH_READY ]; then
  conda install -y -q -c conda-forge -c bioconda kalign2=2.04 hhsuite=3.3.0 python="${PYTHON_VERSION}" 2>&1 1>/dev/null
  touch HH_READY
fi
# setup openmm for amber refinement
if [ "${USE_AMBER}" == "True" ] && [ ! -f AMBER_READY ]; then
  conda install -y -q -c conda-forge openmm=8.2.0 python="${PYTHON_VERSION}" pdbfixer 2>&1 1>/dev/null
  touch AMBER_READY
fi

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.2/242.2 kB 7.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 61.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 373.9/373.9 kB 34.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 251.8/251.8 MB 10.0 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.7/4.7 MB 120.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.5/5.5 MB 112.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 76.7/76.7 kB 7.8 MB/s eta 0:00:00


Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   0%|          | 0/4099624960 [00:00<?, ?it/s]Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   1%|          | 23.9M/3.82G [00:00<00:16, 251MB/s]Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   1%|          | 47.9M/3.82G [00:00<00:16, 241MB/s]Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   2%|▏         | 70.9M/3.82G [00:00<00:17, 233MB/s]Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   2%|▏         | 93.1M/3.82G [00:00<00:17, 231MB/s]Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   3%|▎         | 115M/3.82G [00:00<00:18, 215MB/s] Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   4%|▎         | 141M/3.82G [00:00<00:17, 231MB/s]Downloading alphafold2_multimer_v3 weights to /root/.cache/colabfold:   4%|▍         | 166M/3.82G [00:00<00:16, 243MB/s]Downloading alphafold2_multime

In [None]:
!pip install biopython
# ──────────────────────────────────────────────────────────────────────────────
# CONFIGURACIÓN
# ──────────────────────────────────────────────────────────────────────────────
from pathlib import Path
from Bio import SeqIO
from colabfold.batch import get_queries, run
from colabfold.utils import setup_logging
from colabfold.download import default_data_dir

multi_fasta   = "filtrado_plddt0.8.fasta"     # multifasta de entrada
input_dir     = Path("single_fastas")        # nueva carpeta con un FASTA por secuencia
result_dir    = Path("predicciones_colabfold")

# ──────────────────────────────────────────────────────────────────────────────
# 1) GENERAR FASTA INDIVIDUALES
# ──────────────────────────────────────────────────────────────────────────────
input_dir.mkdir(exist_ok=True)
total, escritos = 0, 0

for rec in SeqIO.parse(multi_fasta, "fasta"):
    total += 1
    # nombre de archivo seguro: id limpio + índice para evitar colisiones
    safe_id = "".join(c if c.isalnum() or c in "-_." else "_" for c in rec.id.split()[0])
    fasta_path = input_dir / f"{safe_id}.fasta"
    # evita sobrescribir si ya existe (p. ej., ids duplicados)
    if fasta_path.exists():
        # añade sufijo incremental hasta que el nombre quede libre
        i = 1
        while (input_dir / f"{safe_id}_{i}.fasta").exists():
            i += 1
        fasta_path = input_dir / f"{safe_id}_{i}.fasta"
    SeqIO.write(rec, fasta_path, "fasta")
    escritos += 1

print(f"Multifasta dividido: {escritos}/{total} archivos creados en {input_dir.resolve()}")

# ──────────────────────────────────────────────────────────────────────────────
# 2) PREPARAR LOG Y QUERIES PARA COLABFOLD
# ──────────────────────────────────────────────────────────────────────────────
setup_logging(result_dir / "log.txt")
queries, is_complex = get_queries(str(input_dir))   # ahora igual que el pipeline original
print(f"Secuencias detectadas por ColabFold: {len(queries)}")

Multifasta dividido: 62/62 archivos creados en /content/single_fastas
Secuencias detectadas por ColabFold: 62


In [None]:
#@title Input protein sequence, then hit `Runtime` -> `Run all`

# Settings consumed by the "Run Prediction" cell. The `#@param` / `#@markdown`
# annotations are Colab form annotations and are kept exactly as written.
input_dir = '/content/single_fastas' #@param {type:"string"}
# Output folder for predictions and log; relies on `Path` having been imported
# by an earlier cell.
result_dir    = Path("predicciones_colabfold")

# number of models to use
#@markdown ---
#@markdown ### Advanced settings
msa_mode = "single_sequence" #@param ["MMseqs2 (UniRef+Environmental)", "MMseqs2 (UniRef only)","single_sequence","custom"]
num_models = 5 #@param [1,2,3,4,5] {type:"raw"}
num_recycles = 3 #@param [1,3,6,12,24,48] {type:"raw"}
# NOTE(review): the form field is declared as a string although the default is
# the integer 100 -- confirm the value stays numeric after editing the form.
stop_at_score = 100 #@param {type:"string"}
#@markdown - early stop computing models once score > threshold (avg. plddt for "structures" and ptmscore for "complexes")
# Not referenced by any other visible cell.
use_custom_msa = False
num_relax = 0 #@param [0, 1, 5] {type:"raw"}
# Derived flag: True as soon as at least one structure is to be relaxed.
use_amber = num_relax > 0
relax_max_iterations = 200 #@param [0,200,2000] {type:"raw"}
use_templates = False #@param {type:"boolean"}
# Passed to run() as keep_existing_results by the prediction cell.
do_not_overwrite_results = True #@param {type:"boolean"}
zip_results = True #@param {type:"boolean"}


In [None]:
#@title Run Prediction

import sys

from colabfold.batch import get_queries, run
from colabfold.download import default_data_dir
from colabfold.utils import setup_logging
from pathlib import Path

# Runs ColabFold over every query found in `input_dir` and writes structures,
# optional zips and log.txt into `result_dir`. The settings (msa_mode,
# num_models, num_relax, ...) are expected to be defined by an earlier cell.
result_dir    = Path("predicciones_colabfold")

# `python_version` is not assigned by any visible earlier cell, so enabling
# relaxation used to raise NameError below. Keep an existing value if the
# kernel has one; otherwise use the running interpreter's "major.minor".
try:
    python_version
except NameError:
    python_version = f"{sys.version_info.major}.{sys.version_info.minor}"

# For some reason we need that to get pdbfixer to import
conda_site_packages = f"/usr/local/lib/python{python_version}/site-packages/"
if use_amber and conda_site_packages not in sys.path:
    sys.path.insert(0, conda_site_packages)

setup_logging(result_dir.joinpath("log.txt"))

queries, is_complex = get_queries(input_dir)
# Left as the last expression so the value returned by run() is shown as the
# cell output.
run(
    queries=queries,
    result_dir=result_dir,
    use_templates=use_templates,
    num_relax=num_relax,
    relax_max_iterations=relax_max_iterations,
    msa_mode=msa_mode,
    model_type="auto",
    num_models=num_models,
    num_recycles=num_recycles,
    model_order=[1, 2, 3, 4, 5],
    is_complex=is_complex,
    data_dir=default_data_dir,
    keep_existing_results=do_not_overwrite_results,
    rank_by="auto",
    pair_mode="unpaired+paired",
    # The settings form declares this field as a string; make sure run()
    # always receives a number.
    stop_at_score=float(stop_at_score),
    zip_results=zip_results,
    user_agent="colabfold/google-colab-batch",
)

2025-05-26 18:13:03,638 Running on GPU
2025-05-26 18:13:03,920 Found 2 citations for tools or databases
2025-05-26 18:13:03,921 Query 1/62: seq_1100 (length 99)
2025-05-26 18:14:16,256 alphafold2_ptm_model_1_seed_000 recycle=0 pLDDT=93.4 pTM=0.552
2025-05-26 18:14:16,482 alphafold2_ptm_model_1_seed_000 recycle=1 pLDDT=94.3 pTM=0.551 tol=0.185
2025-05-26 18:14:16,707 alphafold2_ptm_model_1_seed_000 recycle=2 pLDDT=94.3 pTM=0.541 tol=0.165
2025-05-26 18:14:16,931 alphafold2_ptm_model_1_seed_000 recycle=3 pLDDT=94.4 pTM=0.542 tol=0.0611
2025-05-26 18:14:16,931 alphafold2_ptm_model_1_seed_000 took 60.0s (3 recycles)
2025-05-26 18:14:17,174 alphafold2_ptm_model_2_seed_000 recycle=0 pLDDT=93.4 pTM=0.518
2025-05-26 18:14:17,400 alphafold2_ptm_model_2_seed_000 recycle=1 pLDDT=95 pTM=0.539 tol=0.46
2025-05-26 18:14:17,626 alphafold2_ptm_model_2_seed_000 recycle=2 pLDDT=95.1 pTM=0.541 tol=0.0468
2025-05-26 18:14:17,852 alphafold2_ptm_model_2_seed_000 recycle=3 pLDDT=95.1 pTM=0.546 tol=0.0352
202

{'rank': [['rank_001_alphafold2_ptm_model_3_seed_000',
   'rank_002_alphafold2_ptm_model_5_seed_000',
   'rank_003_alphafold2_ptm_model_2_seed_000',
   'rank_004_alphafold2_ptm_model_1_seed_000',
   'rank_005_alphafold2_ptm_model_4_seed_000'],
  ['rank_001_alphafold2_ptm_model_5_seed_000',
   'rank_002_alphafold2_ptm_model_2_seed_000',
   'rank_003_alphafold2_ptm_model_4_seed_000',
   'rank_004_alphafold2_ptm_model_3_seed_000',
   'rank_005_alphafold2_ptm_model_1_seed_000'],
  ['rank_001_alphafold2_ptm_model_5_seed_000',
   'rank_002_alphafold2_ptm_model_3_seed_000',
   'rank_003_alphafold2_ptm_model_2_seed_000',
   'rank_004_alphafold2_ptm_model_4_seed_000',
   'rank_005_alphafold2_ptm_model_1_seed_000'],
  ['rank_001_alphafold2_ptm_model_5_seed_000',
   'rank_002_alphafold2_ptm_model_2_seed_000',
   'rank_003_alphafold2_ptm_model_3_seed_000',
   'rank_004_alphafold2_ptm_model_1_seed_000',
   'rank_005_alphafold2_ptm_model_4_seed_000'],
  ['rank_001_alphafold2_ptm_model_5_seed_000',
 

In [None]:
# Ejemplo real
!zip -r -q PROTGPT_AF3.zip predicciones_colabfold/
from google.colab import files
files.download('PROTGPT_AF3.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Descomprimir con shell
!unzip -q PROTGPT_AF3.zip -d /content/descomprimido
print("✔️ Descomprimido en /content/descomprimido")
import re
from pathlib import Path
import statistics

# Path to the extracted log (adjust to your working folder)
log_path = Path("/content/descomprimido/predicciones_colabfold/log.txt")

# Regular expressions for the two kinds of log line that matter here
query_pattern = re.compile(r'Query \d+/\d+: (\S+)')
rank_pattern = re.compile(r'rank_\d+_.* pLDDT=([\d.]+) pTM=([\d.]+)')


def parse_colabfold_log(path):
    """Summarise the ranked-model metrics of every query found in a log file.

    A ``Query i/n: <name>`` line starts a new sequence. After a line containing
    ``reranking models by``, each consecutive ``rank_...`` line contributes one
    pLDDT / pTM pair; the first line that is not a rank line ends the run.

    Args:
        path: location of the log file to read.

    Returns:
        A list of ``(sequence, mean_pLDDT, mean_pTM)`` tuples in log order.
        Sequences without any rank line are omitted.
    """
    summary = []
    current_seq = None
    collecting_ranks = False
    plddts, ptms = [], []

    def close_current():
        """Record the means of the sequence being read, then reset the buffers."""
        if current_seq is not None and plddts:
            summary.append((current_seq, sum(plddts) / len(plddts), sum(ptms) / len(ptms)))
        # Always reset: metrics read before the first Query line must not be
        # averaged into the sequence that follows.
        plddts.clear()
        ptms.clear()

    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            query_match = query_pattern.search(line)
            if query_match:
                # A new query closes the previous one.
                close_current()
                current_seq = query_match.group(1)
                collecting_ranks = False
            elif 'reranking models by' in line:
                # The per-model rank lines follow this marker.
                collecting_ranks = True
            elif collecting_ranks:
                rank_match = rank_pattern.search(line)
                if rank_match:
                    plddts.append(float(rank_match.group(1)))
                    ptms.append(float(rank_match.group(2)))
                else:
                    # First non-rank line ends the block of rank lines.
                    collecting_ranks = False

    # The last sequence has no following Query line to close it.
    close_current()
    return summary


results = parse_colabfold_log(log_path)

# Show the results
for seq_id, avg_plddt, avg_ptm in results:
    print(f"Secuencia: {seq_id}")
    print(f"  Mean pLDDT: {avg_plddt:.2f}")
    print(f"  Mean pTM: {avg_ptm:.3f}\n")

# ─── 2) Save the per-sequence means as CSV ─────────────────────────────────────
import csv

csv_path = Path("/content/query_metrics.csv")
header = ["sequence", "mean_pLDDT", "mean_pTM"]
# Same rounding as the printed report: two decimals for pLDDT, three for pTM.
formatted_rows = [
    [name, f"{plddt:.2f}", f"{ptm:.3f}"]
    for name, plddt, ptm in results
]
with csv_path.open("w", newline="") as handle:
    table = csv.writer(handle)
    table.writerow(header)
    table.writerows(formatted_rows)
print(f"✔️ CSV guardado en {csv_path}")

# ─── 3) Copy the ZIPs that meet the pLDDT and pTM thresholds ───────────────────
import shutil

# Thresholds
TH_P_LDDT = 85.0
TH_P_TM   = 0.50

# Source and destination folders for the ZIPs
zips_dir      = Path("/content/descomprimido/predicciones_colabfold")
filtered_dir  = Path("/content/filtered_zips")
filtered_dir.mkdir(parents=True, exist_ok=True)

# Sequences whose mean pLDDT and mean pTM both reach the thresholds
passed_seqs = {
    seq_id
    for seq_id, avg_plddt, avg_ptm in results
    if avg_plddt >= TH_P_LDDT and avg_ptm >= TH_P_TM
}

# Archives are assumed to be named "<job>.result.zip" -- TODO confirm against
# the ColabFold version in use. Any other "*.zip" falls back to its plain stem.
RESULT_ZIP_SUFFIX = ".result.zip"

# Copy every ZIP whose job name is exactly one of the passing sequences. An
# exact comparison is required: with a substring test a passing "seq_110"
# would also copy the archive of "seq_1100".
total=0
pasan=0
for zip_file in sorted(zips_dir.glob("*.zip")):
    total+=1
    file_name = zip_file.name
    if file_name.endswith(RESULT_ZIP_SUFFIX):
        job_name = file_name[:-len(RESULT_ZIP_SUFFIX)]
    else:
        job_name = zip_file.stem
    if job_name in passed_seqs:
        pasan+=1
        shutil.copy2(zip_file, filtered_dir / zip_file.name)
        print(f"✔️ Copiado: {zip_file.name}")
print(f"total de secuencias: {total}")
print(f"secuencias pasadas: {pasan}")

✔️ Descomprimido en /content/descomprimido
Secuencia: seq_1100
  Mean pLDDT: 94.88
  Mean pTM: 0.548

Secuencia: seq_11207
  Mean pLDDT: 93.04
  Mean pTM: 0.455

Secuencia: seq_11643
  Mean pLDDT: 97.24
  Mean pTM: 0.582

Secuencia: seq_12861
  Mean pLDDT: 97.24
  Mean pTM: 0.635

Secuencia: seq_12911
  Mean pLDDT: 97.22
  Mean pTM: 0.614

Secuencia: seq_14194
  Mean pLDDT: 95.42
  Mean pTM: 0.531

Secuencia: seq_14869
  Mean pLDDT: 95.00
  Mean pTM: 0.535

Secuencia: seq_15208
  Mean pLDDT: 95.92
  Mean pTM: 0.546

Secuencia: seq_15662
  Mean pLDDT: 97.08
  Mean pTM: 0.628

Secuencia: seq_16213
  Mean pLDDT: 85.00
  Mean pTM: 0.450

Secuencia: seq_16389
  Mean pLDDT: 93.52
  Mean pTM: 0.474

Secuencia: seq_19095
  Mean pLDDT: 95.92
  Mean pTM: 0.571

Secuencia: seq_19169
  Mean pLDDT: 93.78
  Mean pTM: 0.518

Secuencia: seq_21861
  Mean pLDDT: 85.66
  Mean pTM: 0.551

Secuencia: seq_24706
  Mean pLDDT: 97.20
  Mean pTM: 0.636

Secuencia: seq_25195
  Mean pLDDT: 78.64
  Mean pTM: 0.327

In [None]:
!zip -r -q PROTGPT_AF3_filtered.zip filtered_zips/
from google.colab import files
files.download('PROTGPT_AF3_filtered.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>