In [1]:
from Bio.PDB import PDBList
import os
import requests
import pandas as pd
import subprocess

In [4]:
# Define PDB and Uniprot IDs.
# The first len(pdb_ids) UniProt accessions are the ones paired, in order,
# with the PDB entries; the remaining accessions have no PDB ID.
pdb_ids = ['1ENH', '1YMG', '3HDD']
uniprot_ids = ['Q9N2K0', 'P06624', 'P02836', 'P0A9P0', 'Q9H3D4', 'Q9NR30']
n_paired = len(pdb_ids)

# Rows that carry both a PDB ID and a Uniprot ID (sliced from uniprot_ids
# instead of re-typing the accessions, so the two lists cannot drift apart)
paired = pd.DataFrame({'PDB IDs': pdb_ids, 'Uniprot IDs': uniprot_ids[:n_paired]})

# One row per Uniprot ID, with the 'NaN' placeholder standing in for the PDB ID
unpaired = pd.DataFrame({'PDB IDs': ['NaN'] * len(uniprot_ids), 'Uniprot IDs': uniprot_ids})

df_test = (
    pd.concat([paired, unpaired], ignore_index=True)
    .replace('NaN', pd.NA)
    # The first n_paired placeholder rows repeat accessions that already have a
    # PDB ID; after ignore_index they sit at labels n_paired .. 2*n_paired - 1.
    .drop(index=range(n_paired, 2 * n_paired))
)
print(df_test)

  PDB IDs Uniprot IDs
0    1ENH      Q9N2K0
1    1YMG      P06624
2    3HDD      P02836
6    <NA>      P0A9P0
7    <NA>      Q9H3D4
8    <NA>      Q9NR30


In [5]:
# Problem: input is made up, need to test with real input, also need to change pdb_dir
def download_structures(df, pdb_dir='checking', timeout=30):
    """Download one structure file per row of ``df`` into ``pdb_dir``.

    For each row, a non-missing value in the 'PDB IDs' column is fetched with
    Biopython's ``PDBList`` and stored as ``<PDB ID>.pdb``. Otherwise, if the
    'Uniprot IDs' value is a string, the AlphaFold model
    ``AF-<Uniprot ID>-F1-model_v4.pdb`` is fetched over HTTP and stored as
    ``<Uniprot ID>.pdb``. Rows with neither ID are reported and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'PDB IDs' and 'Uniprot IDs'.
    pdb_dir : str
        Output directory; created if it does not exist.
    timeout : float
        Seconds to wait for the AlphaFold server before giving up, so that an
        unresponsive server cannot hang the notebook.

    Returns
    -------
    None
        Progress and failures are reported with ``print``.
    """
    pdbl = PDBList()
    os.makedirs(pdb_dir, exist_ok=True)

    for i, row in df.iterrows():
        pdb_id = row['PDB IDs']
        uniprot_id = row['Uniprot IDs']
        if not pd.isna(pdb_id):  # check for NaN value in PDB IDs column
            # Biopython names the file after the PDB code (e.g. 'pdb1enh.ent'),
            # not after the DataFrame row index, so use the path it returns.
            downloaded = pdbl.retrieve_pdb_file(pdb_id, pdir=pdb_dir, file_format='pdb')
            if downloaded and os.path.isfile(downloaded):
                # os.replace overwrites a file left by a previous run
                os.replace(downloaded, os.path.join(pdb_dir, f'{pdb_id}.pdb'))
            else:
                print(f"Failed to download PDB structure for {pdb_id}")
        elif isinstance(uniprot_id, str):  # download structure using UniProt ID
            url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
            response = requests.get(url, timeout=timeout)
            if response.ok:
                filename = f'{pdb_dir}/{uniprot_id}.pdb'
                with open(filename, 'wb') as f:
                    f.write(response.content)
                print(f"Downloaded file for {uniprot_id} to {filename}")
            else:
                print(f"Failed to download file for {uniprot_id}: {response.status_code} - {response.reason}")
        else:
            print(f"No PDB ID or UniProt ID available for index {i}")

In [6]:
# Fetch every structure listed in df_test into the local 'checking' folder.
download_structures(df_test, pdb_dir='checking')

Downloading PDB structure '1enh'...
Downloading PDB structure '1ymg'...
Downloading PDB structure '3hdd'...
Downloaded file for P0A9P0 to checking/P0A9P0.pdb
Downloaded file for Q9H3D4 to checking/Q9H3D4.pdb
Downloaded file for Q9NR30 to checking/Q9NR30.pdb


In [15]:
# Run FATCAT on two of the AlphaFold models that were downloaded into ./checking/.
# Each flag is kept on the same line as its value so the command is easy to edit.
cmd = [
    'FATCAT',
    '-p1', 'P0A9P0.pdb',
    '-p2', 'Q9H3D4.pdb',
    '-i', './checking/',
    '-o', 'outfile',
    '-m',
]
subprocess.run(cmd)

CompletedProcess(args=['FATCAT', '-p1', 'P0A9P0.pdb', '-p2', 'Q9H3D4.pdb', '-i', './checking/', '-o', 'outfile', '-m'], returncode=1)

In [18]:
# Open the .aln file and extract the P-value: the second whitespace-separated
# field of the first line that starts with 'P-value'.
# p_value is reset first so that a file without such a line leaves it as None
# instead of unbound (or silently holding the value from an earlier run).
p_value = None
with open('./positive_ctl/pos_ctl.aln', 'r') as f:
    for line in f:
        if line.startswith('P-value'):
            fields = line.split()
            if len(fields) > 1:  # skip a 'P-value' line that carries no value
                p_value = fields[1]
                break

In [19]:
# Display the P-value parsed from the .aln file (a string, exactly as it appears in the file)
p_value

'4.53e-11'

In [None]:
def perform_flexible_alignment(protein_pairs, structures, chunk_size):
    """Placeholder for flexible structural alignment of protein pairs.

    Intended to align the 3D structures of the protein pairs in chunks. At
    present no alignment is run: every pair gets ``None`` as its metric, and
    ``structures`` and ``chunk_size`` are accepted but not used.

    Parameters
    ----------
    protein_pairs : pandas.DataFrame
        Must contain the columns 'protein_id_1' and 'protein_id_2'.
    structures, chunk_size
        Currently unused.

    Returns
    -------
    pandas.DataFrame
        One row per input pair with the columns 'protein_id_1',
        'protein_id_2' and 'alignment_metric'.
    """
    # TODO: Replace this with the actual subprocess call for structural alignment
    # Example: result = subprocess.run(["your_alignment_tool", "arg1", "arg2"])
    records = [
        {
            "protein_id_1": pair["protein_id_1"],
            "protein_id_2": pair["protein_id_2"],
            "alignment_metric": None,
        }
        for _, pair in protein_pairs.iterrows()
    ]
    return pd.DataFrame(records)

In [9]:
# Fetch the listed entries from the PDB into the 'positive_ctl' folder.
# This route only works with PDB IDs of experimentally resolved structures
# (no AlphaFold structures or Uniprot IDs).
pdbl = PDBList()
PDBList2 = ['1ENH', '3HDD']
for pdb_code in PDBList2:
    pdbl.retrieve_pdb_file(pdb_code, pdir='positive_ctl', file_format='pdb')

TypeError: retrieve_pdb_file() got an unexpected keyword argument 'filename'

In [None]:
# Give the downloaded 1ENH entry a plain '<PDB ID>.pdb' file name.
os.rename(
    os.path.join('./positive_ctl', 'pdb1enh.ent'),
    os.path.join('./positive_ctl', '1ENH.pdb'),
)

In [7]:
uniprot_ids = ['P0A9P0', 'Q9H3D4', 'Q9NR30', 'A0JNW5', 'P0DO02', 'A2RVM0', 'A2AF47', 'A1L1K8', 'A8Y9T9', 'P62968']

# Create a directory named "random_pdb" to store the downloaded files
# (exist_ok=True keeps the cell safe to re-run)
os.makedirs('random_pdb', exist_ok=True)

# Fetch the AlphaFold v4 model of every accession; failures are reported, not raised.
for uniprot_id in uniprot_ids:
    url = f'https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb'
    # A timeout keeps an unresponsive server from hanging the notebook indefinitely
    response = requests.get(url, timeout=30)
    if response.ok:
        filename = f'random_pdb/{uniprot_id}.pdb'
        with open(filename, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded file for {uniprot_id} to {filename}")
    else:
        print(f"Failed to download file for {uniprot_id}: {response.status_code} - {response.reason}")


Downloaded file for P0A9P0 to random_pdb/P0A9P0.pdb
Downloaded file for Q9H3D4 to random_pdb/Q9H3D4.pdb
Downloaded file for Q9NR30 to random_pdb/Q9NR30.pdb
Downloaded file for A0JNW5 to random_pdb/A0JNW5.pdb
Downloaded file for P0DO02 to random_pdb/P0DO02.pdb
Downloaded file for A2RVM0 to random_pdb/A2RVM0.pdb
Downloaded file for A2AF47 to random_pdb/A2AF47.pdb
Downloaded file for A1L1K8 to random_pdb/A1L1K8.pdb
Downloaded file for A8Y9T9 to random_pdb/A8Y9T9.pdb
Failed to download file for P62968: 404 - Not Found


In [None]:
# FATCAT command notes:
# FATCATQue.pl timeused pair.list -q >pair.aln   (only works when the files are in the same directory)
# FATCAT -p1 A0JNW5.pdb -p2 A1L1K8.pdb -o A0JNW5_A1L1K8 -m
# FATCAT -p1 P0A9P0.pdb -p2 Q9H3D4.pdb -i ./checking -o test -m   (-i: directory containing the PDB files; the result file is written to ./)