### Checking AlphaFold PDB Sequences to Source Sequences

TODO:
- Only checks for an exact match, so matching sub-sequences may be missed in cases where no exact match exists

In [1]:
import CheckAFPDBSequences as CHECK
import pandas as pd
import os
from Bio.PDB import PDBParser
from Bio.SeqUtils import seq1 # convert 3 letter AA seq to 1 letter AA seq

#### Config


- **SOURCE_SEQUENCES_PATH**: _Path to csv containing sequences, with uniprot IDs which were used to query AF PDB database (see CheckAlphaFoldPDBSequences)_
- **ID_FEATURE_NAME**: _Name of the column in above csv specifying the uniprot ID (default: 'Uniprot_ID' creatively...)_
- **SEQUENCE_FEATURE_NAME**: _Name of the column in the above csv specifying the sequence (default: 'Sequence' creatively...)_
- **AF_PDB_DATAFRAME**: _Path to csv containing AlphaFold PDB info, importantly containing path to local AF PDB for each uniprot ID (generated from CheckAlphaFoldPDBSequences)_
- **AF_PDB_ROOT_DIR**: _PDB paths in above csv are relative, need to provide path to PDB root dir for full path to PDB_
- **OUTPUT_SAVEPATH**: _Where to save generated PDB sequence match info_

#### Outputs
- CSV containing match information for sequence to AlphaFold PDB at [OUTPUT_SAVEPATH]

#### Warning

**AF_PDB_DATAFRAME** must be generated from CheckAlphaFoldPDBSequences. If it contains duplicate uniprot IDs or sequences, this check will produce duplicated rows, which will halt the program.


In [2]:
# Notebook settings; each key is described in the Config section above.
CONFIG = dict(
    # CSV of source sequences, plus the column names holding the uniprot ID and the sequence
    SOURCE_SEQUENCES_PATH="./demo_datasets/demo_llps_minus.csv",
    ID_FEATURE_NAME="Uniprot_ID",
    SEQUENCE_FEATURE_NAME="Sequence",

    # CSV linking each uniprot ID to its AlphaFold PDB path (source uniprot ID feature name is standard)
    AF_PDB_DATAFRAME="./demo_datasets/demo_llps_minus_AF_PDB_info.csv",
    # PDB paths in that CSV are relative, so the root directory is supplied here
    AF_PDB_ROOT_DIR="./demo_datasets/",

    # Destination for the sequence match info
    OUTPUT_SAVEPATH="./demo_datasets/demo_llps_minus_AF_PDB_match_info.csv",
)

In [3]:
# Pull the individual settings out of CONFIG into module-level names
(
    SOURCE_SEQUENCES_PATH,
    ID_FEATURE_NAME,
    SEQUENCE_FEATURE_NAME,
    AF_PDB_DATAFRAME,
    AF_PDB_ROOT_DIR,
    OUTPUT_SAVEPATH,
) = CHECK.utility_UnPackConfig(CONFIG)

In [4]:
# Read the source sequences and the AlphaFold PDB info table
sourceProteinData = pd.read_csv(filepath_or_buffer=SOURCE_SEQUENCES_PATH)
afPDBData = pd.read_csv(filepath_or_buffer=AF_PDB_DATAFRAME)

# Biopython PDB parser instance, handed to the sequence check later on
pdbParser = PDBParser()

In [5]:
# Attach the AlphaFold PDB info to every source sequence. The left join keeps
# source rows that have no entry in the AF PDB table (their PDB columns are NaN).
# validate='many_to_one' makes pandas raise a MergeError immediately if the
# AF PDB table contains duplicate uniprot IDs, instead of silently duplicating
# source rows in the merged result.
merged = sourceProteinData.merge(
    afPDBData,
    how='left',
    left_on=ID_FEATURE_NAME,
    right_on='uniprot_ID_source',
    validate='many_to_one',
)

In [6]:
# Preview the first rows of the merged table
merged.head(n=5)

Unnamed: 0,Sequence,Uniprot_ID,uniprot_ID_source,uniprot_ID_match,AF_DB_ID,firstResidueIndex,lastResidueIndex,latestVersion,PDB_path
0,GQNTRWNNLDAPPSRGTSKWENRGARDERIEQELFSGQLSGINFDK...,D0PV95,D0PV95,D0PV95,AF-D0PV95-F1,1.0,708.0,4.0,./llps_minus_PDBs/AF-D0PV95-F1-model_v4.pdb
1,NTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRET...,P35637,P35637,P35637,AF-P35637-F1,1.0,526.0,4.0,./llps_minus_PDBs/AF-P35637-F1-model_v4.pdb
2,MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM...,P09651-2,P09651-2,P09651,AF-P09651-F1,1.0,372.0,4.0,./llps_minus_PDBs/AF-P09651-F1-model_v4.pdb
3,DRRGGRGGYDRGGYRGRGGDRGGFRGGRGGGDRGGFGPGK,P35637,P35637,P35637,AF-P35637-F1,1.0,526.0,4.0,./llps_minus_PDBs/AF-P35637-F1-model_v4.pdb
4,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,P35637,P35637,P35637,AF-P35637-F1,1.0,526.0,4.0,./llps_minus_PDBs/AF-P35637-F1-model_v4.pdb


In [7]:
# Check Sequences
# Run the sequence check over the merged table
matchedDF = CHECK.checkAFPDBSequenceForDataFrame(
    merged,
    AF_PDB_ROOT_DIR,
    pdbParser,
    sequenceFeatureName=SEQUENCE_FEATURE_NAME,
)

In [8]:
# Preview the first rows of the match results
matchedDF.head(n=5)

Unnamed: 0,Sequence,Uniprot_ID,uniprot_ID_source,uniprot_ID_match,AF_DB_ID,firstResidueIndex,lastResidueIndex,latestVersion,PDB_path,NoMatch,ExactMatch,TruncatedMatch,AFTruncated,SourceTruncated,AFStartResidue,AFEndResidue,SourceStartResidue,SourceEndResidue
0,GQNTRWNNLDAPPSRGTSKWENRGARDERIEQELFSGQLSGINFDK...,D0PV95,D0PV95,D0PV95,AF-D0PV95-F1,1.0,708.0,4.0,./llps_minus_PDBs/AF-D0PV95-F1-model_v4.pdb,False,False,True,False,True,1,541,1,541
1,NTIFVQGLGENVTIESVADYFKQIGIIKTNKKTGQPMINLYTDRET...,P35637,P35637,P35637,AF-P35637-F1,1.0,526.0,4.0,./llps_minus_PDBs/AF-P35637-F1-model_v4.pdb,False,False,True,False,True,1,242,1,242
2,MSKSESPKEPEQLRKLFIGGLSFETTDESLRSHFEQWGTLTDCVVM...,P09651-2,P09651-2,P09651,AF-P09651-F1,1.0,372.0,4.0,./llps_minus_PDBs/AF-P09651-F1-model_v4.pdb,True,False,False,False,False,0,0,0,0
3,DRRGGRGGYDRGGYRGRGGDRGGFRGGRGGGDRGGFGPGK,P35637,P35637,P35637,AF-P35637-F1,1.0,526.0,4.0,./llps_minus_PDBs/AF-P35637-F1-model_v4.pdb,False,False,True,False,True,1,40,1,40
4,MASNDYTQQATQSYGAYPTQPGQGYSQQSSQPYGQQSYSGYSQSTD...,P35637,P35637,P35637,AF-P35637-F1,1.0,526.0,4.0,./llps_minus_PDBs/AF-P35637-F1-model_v4.pdb,False,False,True,False,True,1,163,1,163


In [9]:
# Small debug output: summary counts over the match results
numSequences = len(matchedDF)
numExactMatches = matchedDF['ExactMatch'].sum()
numTruncatedMatches = matchedDF['TruncatedMatch'].sum()

# A row has a PDB only when 'PDB_path' holds a string; anything else
# (e.g. the NaN left behind by the left merge) means no PDB for that row.
PDBs = matchedDF['PDB_path']
PDB_mask = [isinstance(path, str) for path in PDBs]
noPDBs = [path for path, hasPDB in zip(PDBs, PDB_mask) if not hasPDB]

# Count "no match" only among rows that actually have a PDB to compare against.
# .loc with a boolean row mask also behaves on an empty frame.
numNoMatchInPDB = matchedDF.loc[PDB_mask, 'NoMatch'].sum()

# The last line reports rows WITHOUT a PDB, so it is labelled "no PDB available"
print(
    f'{numSequences} Sequences ran'
    f'\n\t{numExactMatches} exact sequence matches in PDB'
    f'\n\t{numTruncatedMatches} truncated sequence matches in PDB'
    f'\n\t{numNoMatchInPDB} no sequence match in PDB'
    f'\n\t{len(noPDBs)} no PDB available'
)

84 Sequences ran
	14 exact sequence matches in PDB
	49 truncated sequence matches in PDB
	18 no sequence match in PDB
	3 PDB available
