## Modelling multiple protein structures together


We use a combination of homology models and AlphaFold (AI-predicted) models available in SWISS-MODEL, the automated protein structure homology-modelling server, which is accessible via the Expasy web server.

The workflow is taken directly from:

https://swissmodel.expasy.org/docs/help#modelling_api



Steps:

1. Obtain an API token: download it from your SWISS-MODEL account and store it in the file `~/.config/swissmodel/apikey.txt`.
2. Collect all PDB IDs and download the corresponding FASTA sequences from RCSB.


In [None]:
import requests
from tqdm.notebook import tqdm
import re
pdb_ids = ["7WSM", "7MYJ", "6AE3","2HFP" ,"2M76", "4EYW", "6KSI", "7NYK", "1TNF"]


def parse_fasta_sequences(fasta_text):
    """Return the sequences contained in a multi-record FASTA string.

    A record starts at every line that begins with '>'. The header line is
    discarded and the lines that follow it, up to the next header, are
    concatenated into one sequence string. Only a '>' at the start of a line
    opens a new record, so a '>' inside a header description is harmless.

    Args:
        fasta_text: The raw FASTA text.

    Returns:
        A list with one sequence string per record, in input order. A header
        without sequence lines yields an empty string; text before the first
        header is ignored.
    """
    sequences = []
    chunks = None  # lines of the record being read; None until a header is seen
    for line in fasta_text.splitlines():
        if line.startswith(">"):
            if chunks is not None:
                sequences.append("".join(chunks))
            chunks = []
        elif chunks is not None:
            chunks.append(line.strip())
    if chunks is not None:
        sequences.append("".join(chunks))
    return sequences


# Map each PDB id to the list of sequences of its FASTA entry on RCSB.
pdb_fasta_dict = {}
for pdb_id in tqdm(pdb_ids):
    fasta_url = f"https://www.rcsb.org/fasta/entry/{pdb_id}"
    r = requests.get(fasta_url)
    if r.status_code != 200:
        # Do not parse an error page as if it were sequence data; keep the key
        # so every requested id still has an entry.
        print(f"Failed to download FASTA for {pdb_id}: status code {r.status_code}")
        pdb_fasta_dict[pdb_id] = []
        continue
    pdb_fasta_dict[pdb_id] = parse_fasta_sequences(r.text)

# Write the combined FASTA file once, after all downloads have finished.
# Every sequence of an entry is written under the bare PDB id as its header.
with open("all_sequences.fasta", "w") as f:
    for pdb_id, sequences in pdb_fasta_dict.items():
        for seq in sequences:
            f.write(f">{pdb_id}\n{seq}\n")
    

In [None]:
import os

# Save the PDB-format coordinate file of every entry in `pdb_ids` into
# `output_dir`, one "<id>_coordinates.pdb" file per entry.
output_dir = "downloaded_models"
os.makedirs(output_dir, exist_ok=True)

for pdb in tqdm(pdb_ids, desc="Downloading PDB files"):
    response = requests.get(f"https://files.rcsb.org/download/{pdb}.pdb")

    # Anything other than HTTP 200 is reported and the entry is skipped.
    if response.status_code != 200:
        print(f"Failed to download {pdb}: status code {response.status_code}")
        continue

    destination = os.path.join(output_dir, f"{pdb}_coordinates.pdb")
    with open(destination, "w") as handle:
        handle.write(response.text)

In [15]:
from tabulate import tabulate
import os
import re

results_dir="docking_results_20250501"

# Captures the signed number that follows "VINA RESULT:". Only whitespace is
# skipped before the number, so a leading '-' or '+' stays inside the capture
# group and the sign of the score is taken from the file itself.
VINA_SCORE_PATTERN = re.compile(r'VINA RESULT:\s*([-+]?[0-9]*\.?[0-9]+)')


def read_vina_score(path):
    """Return the score on the first "VINA RESULT" line of a file.

    Args:
        path: Path of the .pdbqt file to read.

    Returns:
        The first number after "VINA RESULT:" as a float, with its sign
        preserved, or None when the file has no "VINA RESULT" line or the
        line carries no parsable number. Only the first such line is read.
    """
    with open(path, 'r') as f:
        for line in f:
            if "VINA RESULT" in line:
                m = VINA_SCORE_PATTERN.search(line)
                return float(m.group(1)) if m else None
    return None


# Collect one (pdb_id, ligand, score) record per .pdbqt file found below
# results_dir.
records = []
for root, _, files in os.walk(results_dir):
    # Only directories whose name starts with a digit are read. Slicing with
    # [:1] keeps the check safe when the basename is empty.
    if not os.path.basename(root)[:1].isdigit():
        continue
    for file in files:
        if not file.endswith('.pdbqt'):
            continue
        # The PDB id is the first four characters of the directory name and
        # the ligand code the first three characters of the file name.
        pdb_id = os.path.basename(root)[:4]
        ligand = file[:3]
        score = read_vina_score(os.path.join(root, file))
        records.append((pdb_id, ligand, score))


# Aggregate scores by PDB id and ligand; a later record for the same pair
# overwrites an earlier one.
score_dict = {}
for pdb, ligand, score in records:
    score_dict.setdefault(pdb, {})[ligand] = score

# Determine unique PDB ids and ligand names
pdb_ids_sorted = sorted(score_dict.keys())
ligands_sorted = sorted({ligand for _, ligand, _ in records})

# Create the table with header and rows formatted for tabulate; a PDB/ligand
# pair without a record is shown as an empty cell.
header = ["PDB ID/LIG"] + ligands_sorted
table = []
for pdb in pdb_ids_sorted:
    row = [pdb] + [score_dict[pdb].get(ligand, "") for ligand in ligands_sorted]
    table.append(row)

# Transpose the table to flip rows and columns: afterwards each row is one
# ligand and each column one PDB id.
table = list(map(list, zip(*([header] + table))))
header = table.pop(0)
print(tabulate(table, headers=header, tablefmt="grid"))


+--------------+---------+---------+---------+---------+--------+--------+--------+---------+
| PDB ID/LIG   |    1TNF |    2HFP |    4EYW |    6AE3 |   6KSI |   7MYJ |   7NYK |    7WSM |
| API          |  -8.717 |  -7.797 |  -8.908 |  -8.758 | -8.921 | -9.088 | -6.749 |  -8.583 |
+--------------+---------+---------+---------+---------+--------+--------+--------+---------+
| BAI          | -10.538 |  -9.794 |  -9.385 |  -9.546 | -9.369 | -9.548 | -7.896 |  -9.517 |
+--------------+---------+---------+---------+---------+--------+--------+--------+---------+
| BER          |  -9.385 |  -9.115 |  -8.384 |  -9.313 | -9.585 | -9.11  | -7.323 |  -9.646 |
+--------------+---------+---------+---------+---------+--------+--------+--------+---------+
| CAT          |  -9.085 |  -7.884 |  -8.749 |  -8.162 | -8.986 | -8.27  | -7.09  |  -8.316 |
+--------------+---------+---------+---------+---------+--------+--------+--------+---------+
| CUR          |  -8.95  |  -8.206 |  -9.925 |  -8.565 | -8.

In [18]:
import pandas as pd
# Export the score table built in the previous cell to an Excel workbook,
# using `header` as the column names and omitting the DataFrame index.
excel_path = "docking_scores.xlsx"
df_scores = pd.DataFrame(data=table, columns=header)
df_scores.to_excel(excel_path, index=False)