In [1]:
import sqlite3
import pandas as pd
import FetchAlphaFoldPDBs as FETCH
import os
import shutil

# Path to the local SQLite database of AlphaFold accession IDs.
# Built with os.path.join so it resolves on any OS; on Windows the result is
# identical to the previous hard-coded ".\\..\\AlphaFold\\accession_id_db.db".
ACCESSION_DB_PATH = os.path.join(os.curdir, os.pardir, "AlphaFold", "accession_id_db.db")

# Name of the accession-ID table and (presumably) its column names -- verify
# against the database schema before relying on them.
ACCESSION_ID_TABLE_NAME = "accession_ids"
TABLE_UNIPROT_ID_FEATURE_NAME = "UniProtAccessionID"
TABLE_ALPHAFOLD_DB_ID_FEATURE_NAME = "AlphaFoldDBID"

The AlphaFold Database doesn't seem to have an API; instead, it offers an FTP server with all accession IDs mapped to AlphaFold IDs, available [here](http://ftp.ebi.ac.uk/pub/databases/alphafold/).

[This article](https://www.blopig.com/blog/2022/08/retrieving-alphafold-models-from-alphafolddb/) gives an example of retrieving AlphaFold models using the above; however, it involves loading a 7 GB CSV into memory, which is very slow and leads to memory errors.

So instead let's use a database approach and store accession IDs in a database which can be queried. 

- Note: It is much quicker to search a directory of PDB files; however, this DB approach is exhaustive across all available AlphaFold PDBs (which can subsequently be pulled from AF).

- Note: It is comparatively fast when searching multiple IDs, as the database can be queried for all UniProt IDs at once.

- Note: Some IDs are hyphenated ('-'), e.g. `P10636-8`; these are caught, and the reduced ID (the part before the hyphen, e.g. `P10636`) is searched if no match is found for the hyphenated ID.

#### Set Up Database Connection to Accession Info

Requires a local database of the accession ID files available from [AlphaFold's FTP server](http://ftp.ebi.ac.uk/pub/databases/alphafold/)

In [2]:
# Set Up Connections
# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist, which would only surface later as a confusing query failure.
# Fail early with a clear message instead.
if not os.path.isfile(ACCESSION_DB_PATH):
    raise FileNotFoundError(
        f"Accession ID database not found at '{ACCESSION_DB_PATH}'. "
        "Build it from the AlphaFold FTP accession files before running this notebook."
    )
conn = sqlite3.connect(ACCESSION_DB_PATH)
cur = conn.cursor()  # cursor handed to the FETCH helper for querying

#### Pull list of Uniprot IDs to Check
This is the list of UniProt IDs that need to be queried; here, llps_plus (see [PNAS](https://www.pnas.org/doi/10.1073/pnas.2019053118)) is used as a demo.

In [3]:
# Input demo dataset, and the output path later passed to the AlphaFold info lookup.
# os.path.join keeps both paths portable; on Windows the resulting strings are
# identical to the previous hard-coded ".\\demo_datasets\\..." values.
DEMO_DATA_FILEPATH = os.path.join(os.curdir, "demo_datasets", "demo_llps_plus.csv")
OUTPUT_FILEPATH = os.path.join(os.curdir, "demo_datasets", "demo_llps_plus_AlphaFold_Info.csv")

In [4]:
# Get IDs to Query from demo data
dataset_ID_column_name = 'Uniprot_ID'
llpsPlusData = pd.read_csv(DEMO_DATA_FILEPATH)
# De-duplicate while keeping first-appearance order. list(set(...)) yields a
# different order on every kernel restart (string hash randomisation), which
# made the downstream table/CSV row order non-reproducible.
uniqueUniProtIDs = llpsPlusData[dataset_ID_column_name].drop_duplicates().tolist()
# The dataset loaded here is llps_plus, so the message names it correctly.
print(f'{len(llpsPlusData[dataset_ID_column_name])} IDs in llps plus, {len(uniqueUniProtIDs)} of which are unique')

137 IDs in llps minus, 77 of which are unique


#### Get AF info for each ID

In [5]:
# Resolve each unique UniProt ID to its AlphaFold identifiers via the open
# database cursor; OUTPUT_FILEPATH is handed to the helper (presumably as the
# location where the results are saved -- see FetchAlphaFoldPDBs).
AF_info = FETCH.getAndSaveAFinfoForListOfUniProtIDs(
    cur,
    uniqueUniProtIDs,
    OUTPUT_FILEPATH,
    debug=False,
)

In [6]:
# Display the AlphaFold info table returned by the lookup (last expression => rich output).
AF_info

Unnamed: 0,uniprot_ID_source,uniprot_ID_match,AF_DB_ID,firstResidueIndex,lastResidueIndex,latestVersion
0,P05453,P05453,AF-P05453-F1,1,685,4
1,P35637,P35637,AF-P35637-F1,1,526,4
2,P10636-8,P10636,AF-P10636-F1,1,758,4
3,P19659,P19659,AF-P19659-F1,1,1081,4
4,P91870,P91870,AF-P91870-F1,1,824,4
...,...,...,...,...,...,...
72,P10276,P10276,AF-P10276-F1,1,462,4
73,D6RBZ0,D6RBZ0,AF-D6RBZ0-F1,1,327,4
74,O14979,O14979,AF-O14979-F1,1,420,4
75,Q9UHD9,Q9UHD9,AF-Q9UHD9-F1,1,624,4


#### Check If PDB Present, If Not Fetch It, copying all to a target directory

Returns a dataframe linking original row with path to PDB

(Could be sped up with multithreading but not currently worth the hassle)

In [7]:
# Paths are built with os.path.join so they resolve on any OS; on Windows the
# resulting strings are identical to the previous hard-coded ".\\demo_datasets\\..." values.

# target directory
COLLECTED_PDBS_DIR = os.path.join(os.curdir, 'demo_datasets', 'collected_pdbs')

# List of directories that contain local pdb files
LOCAL_ALPHAFOLD_PDB_DIRECTORIES = [os.path.join(os.curdir, 'demo_datasets', 'local_pdbs')]

# Output path passed to the PDB fetch step below
FINAL_OUTPUT_PATH = os.path.join(os.curdir, "demo_datasets", "demo_llps_plus_AlphaFold_Info_with_PDB_Paths.csv")

In [8]:
# For every row of AF_info, locate the PDB in the local directories or pull it
# from AlphaFold, collecting the files in COLLECTED_PDBS_DIR (per the log
# output of this cell); outputPath is passed through to the helper.
AF_info_with_PDB_paths = FETCH.fetchPDBsFromAlphaFoldInfoDataFrame(
    AF_info,
    COLLECTED_PDBS_DIR,
    LOCAL_ALPHAFOLD_PDB_DIRECTORIES,
    outputPath=FINAL_OUTPUT_PATH,
)

P05453 (AF: AF-P05453-F1-model_v4.pdb) found locally
	copying .\demo_datasets\local_pdbs\AF-P05453-F1-model_v4.pdb to .\demo_datasets\collected_pdbs\AF-P05453-F1-model_v4.pdb
P35637 (AF: AF-P35637-F1-model_v4.pdb) found locally
	copying .\demo_datasets\local_pdbs\AF-P35637-F1-model_v4.pdb to .\demo_datasets\collected_pdbs\AF-P35637-F1-model_v4.pdb
P10636-8 (AF: AF-P10636-F1-model_v4.pdb) not found locally, pulling from AlphaFold
P19659 (AF: AF-P19659-F1-model_v4.pdb) not found locally, pulling from AlphaFold
P91870 (AF: AF-P91870-F1-model_v4.pdb) not found locally, pulling from AlphaFold
Q13151 (AF: AF-Q13151-F1-model_v4.pdb) not found locally, pulling from AlphaFold
Q9JIR4 (AF: AF-Q9JIR4-F1-model_v4.pdb) not found locally, pulling from AlphaFold
P78953 (AF: AF-P78953-F1-model_v4.pdb) not found locally, pulling from AlphaFold
G5EBV6 (AF: AF-G5EBV6-F1-model_v4.pdb) found locally
	copying .\demo_datasets\local_pdbs\AF-G5EBV6-F1-model_v4.pdb to .\demo_datasets\collected_pdbs\AF-G5EBV6-F1-

#### All Available AlphaFold PDBs will now be present in target directory

In [9]:
# Display the table returned by the PDB fetch step (last expression => rich output).
AF_info_with_PDB_paths

Unnamed: 0,uniprot_ID_source,uniprot_ID_match,AF_DB_ID,firstResidueIndex,lastResidueIndex,latestVersion,PDB_path
0,P05453,P05453,AF-P05453-F1,1,685,4,.\demo_datasets\collected_pdbs\AF-P05453-F1-mo...
1,P35637,P35637,AF-P35637-F1,1,526,4,.\demo_datasets\collected_pdbs\AF-P35637-F1-mo...
2,P10636-8,P10636,AF-P10636-F1,1,758,4,.\demo_datasets\collected_pdbs\AF-P10636-F1-mo...
3,P19659,P19659,AF-P19659-F1,1,1081,4,.\demo_datasets\collected_pdbs\AF-P19659-F1-mo...
4,P91870,P91870,AF-P91870-F1,1,824,4,.\demo_datasets\collected_pdbs\AF-P91870-F1-mo...
...,...,...,...,...,...,...,...
72,P10276,P10276,AF-P10276-F1,1,462,4,.\demo_datasets\collected_pdbs\AF-P10276-F1-mo...
73,D6RBZ0,D6RBZ0,AF-D6RBZ0-F1,1,327,4,.\demo_datasets\collected_pdbs\AF-D6RBZ0-F1-mo...
74,O14979,O14979,AF-O14979-F1,1,420,4,.\demo_datasets\collected_pdbs\AF-O14979-F1-mo...
75,Q9UHD9,Q9UHD9,AF-Q9UHD9-F1,1,624,4,.\demo_datasets\collected_pdbs\AF-Q9UHD9-F1-mo...
