# 🧬 다중 펩타이드 후보 기반 단백질 결합력 예측 파이프라인
ProtGPT2로 여러 펩타이드 후보를 생성하고, 각 후보에 대해 구조 예측, 도킹, 상호작용 분석 및 결합력 예측(Pafnucy)을 수행합니다.

## 0. Google Drive 마운트

In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')
work_dir = "/content/drive/MyDrive/peptide_docking_pipeline_multi"
os.makedirs(work_dir, exist_ok=True)
os.chdir(work_dir)
print(f"Working directory: {work_dir}")

Mounted at /content/drive
Working directory: /content/drive/MyDrive/peptide_docking_pipeline_multi


## 1. ProtGPT2로 펩타이드 후보 다중 생성

In [2]:
!pip install transformers sentencepiece

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("nferruz/ProtGPT2")
model = AutoModelForCausalLM.from_pretrained("nferruz/ProtGPT2").to("cuda" if torch.cuda.is_available() else "cpu")

N = 5  # 생성할 후보 수
peptides = []

for _ in range(N):
    input_ids = tokenizer("generate:", return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_length=30, num_return_sequences=1, do_sample=True, top_k=950, top_p=0.96)
    sequence = tokenizer.decode(output[0], skip_special_tokens=True).replace("generate:", "").strip()
    peptides.append(sequence)

# 파일 저장
for i, pep in enumerate(peptides):
    with open(f"peptide_{i}.fasta", "w") as f:
        f.write(f">pep{i}\n{pep}\n")

print("✅ 생성된 펩타이드 후보:")
for i, seq in enumerate(peptides):
    print(f"[{i}] {seq}")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/850 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad

✅ 생성된 펩타이드 후보:
[0] GAQSMDGAQSNDGPQSKDGPQTMDGPQSNDGAQSHDGAQSQDGAQSHDGA
[1] VQRVNNSTQVSYAIGVADIGGDQ
[2] QGGPLMHVIAAKAVAFGEAAR
[3] AQ
[4] 


## 2. 단백질 서열 준비

In [3]:
# 사용자가 단백질 서열을 입력해야 합니다.
protein_sequence = "MTMKQLNDLENRLLGFLGNTILADATKSTQAKLEKELLGTTFGAEA"
with open("protein.fasta", "w") as f:
    f.write(">protein\n" + protein_sequence)

## 3. 구조 예측 (ColabFold/AlphaFold-Multimer)

In [4]:
merged_files = []
for i in range(N):
    fname = f"complex_{i}.fasta"
    with open(fname, "w") as out, open("protein.fasta") as pro, open(f"peptide_{i}.fasta") as pep:
        out.writelines(pro.readlines())
        out.writelines(pep.readlines())
    merged_files.append(fname)

print("✅ 복합체 FASTA 파일 생성 완료:")
for f in merged_files:
    print(f"- {f}")

✅ 복합체 FASTA 파일 생성 완료:
- complex_0.fasta
- complex_1.fasta
- complex_2.fasta
- complex_3.fasta
- complex_4.fasta


## 4. 구조 예측 결과 준비 및 평가용 출력

In [6]:
import os

# Ensure merged_files list is available (it should be from the previous cell 184805f7)
# If running this cell independently, you might need to re-run cell 184805f7 first.
# This loop assumes 'merged_files' contains the list of complex fasta files.

for f in merged_files:
    output_dir = f"prediction_{f.split('.')[0]}"
    print(f"Running colabfold_batch for {f} to {output_dir}")
    # Using !colabfold_batch to run the command line tool
    !colabfold_batch {f} {output_dir}

Running colabfold_batch for complex_0.fasta to prediction_complex_0
/bin/bash: line 1: colabfold_batch: command not found
Running colabfold_batch for complex_1.fasta to prediction_complex_1
/bin/bash: line 1: colabfold_batch: command not found
Running colabfold_batch for complex_2.fasta to prediction_complex_2
/bin/bash: line 1: colabfold_batch: command not found
Running colabfold_batch for complex_3.fasta to prediction_complex_3
/bin/bash: line 1: colabfold_batch: command not found
Running colabfold_batch for complex_4.fasta to prediction_complex_4
/bin/bash: line 1: colabfold_batch: command not found


In [5]:
import os

for i in range(N):
    pred_path = f"prediction_complex_{i}/complex_{i}_0.pdb"
    if os.path.exists(pred_path):
        print(f"[{i}] 구조 예측 완료: {pred_path}")
    else:
        print(f"[{i}] ❌ 예측 실패 또는 누락")


[0] ❌ 예측 실패 또는 누락
[1] ❌ 예측 실패 또는 누락
[2] ❌ 예측 실패 또는 누락
[3] ❌ 예측 실패 또는 누락
[4] ❌ 예측 실패 또는 누락


## 5. 도킹, PLIP, Pafnucy를 통한 결합력 평가 및 점수 계산

In [6]:
!apt-get install -y openbabel
!pip install -q plip
!git clone https://github.com/oddt/pafnucy.git
%cd pafnucy
!pip install -q -r requirements.txt
%cd ..

from plip.structure.preparation import PDBComplex
import pandas as pd

results = []

for i in range(N):
    pred_pdb = f"prediction_complex_{i}/complex_{i}_0.pdb"
    if not os.path.exists(pred_pdb):
        continue

    os.system(f"obabel {pred_pdb} -O receptor_{i}.pdbqt")
    os.system(f"cp receptor_{i}.pdbqt ligand_{i}.pdbqt")

    # AutoDock Vina 다운로드 및 실행
    if not os.path.exists("vina_1.2.3_linux_x86_64/vina"):
        !wget -q https://github.com/ccsb-scripps/AutoDock-Vina/releases/download/v1.2.3/vina_1.2.3_linux_x86_64.zip
        !unzip -q vina_1.2.3_linux_x86_64.zip
        !chmod +x vina_1.2.3_linux_x86_64/vina

    vina_cmd = f"./vina_1.2.3_linux_x86_64/vina --receptor receptor_{i}.pdbqt --ligand ligand_{i}.pdbqt --center_x 0 --center_y 0 --center_z 0 --size_x 20 --size_y 20 --size_z 20 --out output_{i}.pdbqt --log log_{i}.txt"
    os.system(vina_cmd)

    vina_score = None
    with open(f"log_{i}.txt") as f:
        for line in f:
            if "REMARK VINA RESULT" in line:
                vina_score = float(line.strip().split()[3])
                break

    # PLIP 상호작용 분석
    structure = PDBComplex()
    structure.load_pdb(f"output_{i}.pdbqt")
    structure.analyze()
    interaction_count = 0
    for ligand in structure.ligands:
        inter = structure.interaction_sets[ligand]
        interaction_count += len(inter.hbonds) + len(inter.hydrophobic_contacts) + len(inter.saltbridge_ligands)

    # Pafnucy 평가
    os.system(f"obabel output_{i}.pdbqt -O complex_final_{i}.pdb")
    os.system(f"python pafnucy/predict.py --pdb complex_final_{i}.pdb --out affinity_{i}.csv")
    pafnucy_df = pd.read_csv(f"affinity_{i}.csv")
    pafnucy_affinity = float(pafnucy_df['predicted_affinity'].iloc[0])

    final_score = (-1 * vina_score) + (-1 * pafnucy_affinity) + (0.5 * interaction_count)
    results.append({
        "index": i,
        "peptide": peptides[i],
        "vina_score": vina_score,
        "pafnucy": pafnucy_affinity,
        "interaction": interaction_count,
        "final_score": final_score
    })

df = pd.DataFrame(results)
df_sorted = df.sort_values("final_score", ascending=False)
df_sorted.to_csv("peptide_binding_rank.csv", index=False)
df_sorted


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libinchi1 libmaeparser1 libopenbabel7
The following NEW packages will be installed:
  libinchi1 libmaeparser1 libopenbabel7 openbabel
0 upgraded, 4 newly installed, 0 to remove and 35 not upgraded.
Need to get 3,903 kB of archives.
After this operation, 16.9 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libinchi1 amd64 1.03+dfsg-4 [455 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libmaeparser1 amd64 1.2.4-1build1 [88.2 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libopenbabel7 amd64 3.1.1+dfsg-6ubuntu5 [3,231 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 openbabel amd64 3.1.1+dfsg-6ubuntu5 [128 kB]
Fetched 3,903 kB in 3s (1,513 kB/s)
Selecting previously unselected package libinchi1.
(Reading database ... 126284 files and dire

ModuleNotFoundError: No module named 'plip'