## A notebook to prepare data related to Surfacome
* Parse data from https://wlab.ethz.ch/cspa/
* map the data to the uniprot
* Move related pdbs from AF human database to a separate folder
* Filter out low plddt regions and save the new pdbs.

In [1]:
surfacome_list = "/work/lpdi/users/khakzad/Surfacome/database/surfaceome_ids.txt"
uniprot_list = "/work/lpdi/users/khakzad/Surfacome/database/uniprot_info.tsv"
pdb_data_chopped = "/work/lpdi/users/khakzad/Surfacome/pdbs/surfacome_chopped/"
pdb_data = "/work/lpdi/users/khakzad/Surfacome/pdbs/surfacome/"

In [2]:
import pandas as pd

In [4]:
targets = pd.read_csv(surfacome_list)
targets

Unnamed: 0,1A01_HUMAN
0,1A02_HUMAN
1,1A03_HUMAN
2,1A11_HUMAN
3,1A23_HUMAN
4,1A24_HUMAN
...,...
2880,ZP1_HUMAN
2881,ZP2_HUMAN
2882,ZP3_HUMAN
2883,ZP4_HUMAN


In [3]:
uniprot = pd.read_csv(uniprot_list, sep="\t")
uniprot

Unnamed: 0,From,Entry,Entry Name,Protein names,Gene Names,Organism,Length,Sequence,3D
0,4F2_HUMAN,P08195,4F2_HUMAN,4F2 cell-surface antigen heavy chain (4F2hc) (...,SLC3A2 MDU1,Homo sapiens (Human),630,MELQPPEASIAVVSIPRQLPGSHSEAGVQGLSAGDDSELGSHCVAQ...,Electron microscopy (8); X-ray crystallography...
1,5HT1A_HUMAN,P08908,5HT1A_HUMAN,5-hydroxytryptamine receptor 1A (5-HT-1A) (5-H...,HTR1A ADRB2RL1 ADRBRL1,Homo sapiens (Human),422,MDVLSPGQGNNTTSPPAPFETGGNTTGISDVTVSYQVITSLLLGTL...,Electron microscopy (3)
2,5HT1B_HUMAN,P28222,5HT1B_HUMAN,5-hydroxytryptamine receptor 1B (5-HT-1B) (5-H...,HTR1B HTR1DB,Homo sapiens (Human),390,MEEPGAQCAPPPPAGSETWVPQANLSSAPSQNCSAKDYIYQDSISL...,Electron microscopy (1); X-ray crystallography...
3,5HT1D_HUMAN,P28221,5HT1D_HUMAN,5-hydroxytryptamine receptor 1D (5-HT-1D) (5-H...,HTR1D HTR1DA HTRL,Homo sapiens (Human),377,MSPLNQSAEGLPQEASNRSLNATETSEAWDPRTLQALKISLAVVLS...,Electron microscopy (1)
4,5HT1E_HUMAN,P28566,5HT1E_HUMAN,5-hydroxytryptamine receptor 1E (5-HT-1E) (5-H...,HTR1E,Homo sapiens (Human),365,MNITNCTTEASMAIRPKTITEKMLICMTLVVITTLTTLLNLAVIMA...,Electron microscopy (1)
...,...,...,...,...,...,...,...,...,...
2708,ZP1_HUMAN,P60852,ZP1_HUMAN,Zona pellucida sperm-binding protein 1 (Zona p...,ZP1,Homo sapiens (Human),638,MAGGSATTWGYPVALLLLVATLGLGRWLQPDPGLPGLRHSYDCGIK...,
2709,ZP2_HUMAN,Q05996,ZP2_HUMAN,Zona pellucida sperm-binding protein 2 (Zona p...,ZP2 ZPA,Homo sapiens (Human),745,MACRQRGGSWSPSGWFNAGWSTYRSISLFFALVTSGNSIDVSQLVN...,
2710,ZP3_HUMAN,P21754,ZP3_HUMAN,Zona pellucida sperm-binding protein 3 (Sperm ...,ZP3 ZP3A ZP3B ZPC,Homo sapiens (Human),424,MELSYRLFICLLLWGSTELCYPQPLWLLQGGASHPETSVQPVLVEC...,
2711,ZP4_HUMAN,Q12836,ZP4_HUMAN,Zona pellucida sperm-binding protein 4 (Zona p...,ZP4 ZPB,Homo sapiens (Human),540,MWLLRCVLLCVSLSLAVSGQHKPEAPDYSSVLHCGPWSFQFAVNLN...,


In [4]:
uniprot_ids = uniprot['Entry'].unique()
uniprot_ids

array(['P08195', 'P08908', 'P28222', ..., 'P21754', 'Q12836', 'Q8TCW7'],
      dtype=object)

In [10]:
## Writing a csv file
df = uniprot.drop(uniprot.columns[[2, 3, 4, 5, 6, 7, 8]], axis=1)
df.rename(columns = {'Entry':'acc'}, inplace = True)
df.to_csv("/work/lpdi/users/khakzad/Surfacome/database/dataset_all.csv", sep=";")

In [8]:
import os
import shutil

In [11]:
AF_db = "/work/lpdi/users/khakzad/phospho_db/AF_dbs/Human_pdbs/"
target = '/work/lpdi/users/khakzad/Surfacome/pdbs/surfacome/'
# os.makedirs(target, exist_ok=True)

for index, row in uniprot.iterrows():
    try:
        acc = row['Entry']
    
        target_name = f"AF-{acc}-F1-model_v2.pdb.gz"
        pdb_to_move = os.path.join(AF_db, target_name)
        shutil.copyfile(pdb_to_move, f'{target}{acc}.pdb.gz')
    except (FileNotFoundError):
        print(acc)

O42043
O71037
P61566
P61570
Q69384
P61567
Q902F8
Q9UKH3
P58400
P58401
Q9HDB5
Q5TFQ8


* The list above which was not detected in AF human db, is added manually from uniprot.

## Filtering based on pLDDT and storing new pdbs in separate directory

In [15]:
import numpy as np
import os
import Bio.PDB as bpdb
from Bio.PDB import PDBParser

def get_bfactors(path):
    parser = PDBParser()
    structure = parser.get_structure('', path)
    residue2bfactor = {}
    for res in structure.get_residues():
        residue2bfactor[res.id[1]] = np.mean([at.bfactor for at in res.get_atoms()])
    return residue2bfactor

In [18]:
import warnings
warnings.filterwarnings('ignore')

acc2residue2bfactor = {}

class ResSelect(bpdb.Select):
    def accept_residue(self, res):
        if res.id[1] in remove_index:
            return False
        else:
            return True

for index, row in uniprot.iterrows():
    pdb = row['Entry']
    path = f'{pdb_data}/{pdb}.pdb'
    residue2bfactor = get_bfactors(path)

    remove_index = []
    for item in residue2bfactor:
        if residue2bfactor[item] < 60:
            remove_index.append(item)
            
    s = bpdb.PDBParser().get_structure('temp', path)
    io = bpdb.PDBIO()
    io.set_structure(s)
    io.save(os.path.join(pdb_data_chopped, pdb+'.pdb'), ResSelect())