In [1]:
from Bio.PDB import *
from Bio import PDB
from torch import nn
import numpy as np
from ase import Atoms, Atom
import dask.dataframe as dd
from ordered_set import OrderedSet
import os

In [2]:
local_folder="/Users/jessihoernschemeyer/pKaSchNet"
pkPDB_CSV = f"{local_folder}/pkas.csv"
#PKAD_CSV = f"{local_folder}/WT_pka.csv"

#make the dask data frame from the PYPKA csv
dk=dd.read_csv(pkPDB_CSV, delimiter=';', na_filter=False, dtype={'idcode':'category', 
                                                                  'residue_number':'uint8',
                                                                  'pk': 'float32',
                                                                  'residue_name':'category',
                                                                  'chain': 'category',
                                                                  'residue_name': 'category'
                                                                  })
                                                           
dk=dk.rename(columns={'idcode': 'PDB ID', 'residue_number': 'Res ID', 'residue_name': 'Res Name', 'residue_number': 'Res ID', 'pk': 'pKa', 'chain' : 'Chain'}) #rename columns to match df from pkad 
dk=dk.sort_values(['PDB ID', 'Res ID'], ascending=[True, True]) #sorts both
dk=dk.compute() #full pypka database
dff = dk.reset_index() #also the full db but with a reset index.

FileNotFoundError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: [Errno 2] No such file or directory: '/Users/jessihoernschemeyer/pKaSchNet/pkas.csv'

In [3]:
#%%capture
def get_cutout(dask_df, distance_cutoff): #"PARENT" FUNCTION
    """gets pdb name from dask_df (PYPKA), downloads, then checks entire protein for undesirable atoms, and skips the entire structure in their presence.
        For those retained, it generates a cutout surrounding the target residue within target protein, and saves it as a PDB.
        inputs  | dask_df (dd):             the full pypka database
                | distance_cutoff (int):    cutoff radius (Å) from the titratable residue's COG for the neighbor search"""
    
    pdb_parser = PDB.PDBParser()
    pdbs = list(OrderedSet(list(dask_df["PDB ID"])))
    
    for i in range(3,4): #will equal len of set of pdbs in pypka, == 121294 
        #get target protein information, check the protein structure for questionable atoms, and gets biopython structure object in their absense. 
        pdbname = pdbs[i]
        Structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(pdbname),obsolete=False, pdir='PDB',file_format = 'pdb'))
        structure = check_atoms_protein(Structure, Structure.get_atoms())
        if structure == 0: #skip entire pdb and all its entries in pypka db if there are undesirables in pdb
            continue

        ns = PDB.NeighborSearch(list(structure.get_atoms())) #set up neighbor search for later execution 

        pdb_df = dask_df[dask_df.iloc[:, 1] == pdbname].drop(columns = ["PDB ID", "pKa"]) #make a subdf containing only residue entries which are in PYPKA (dask_df) TO Save time.

        #for each represented titratable residue in PYPKA, generate a cutout and saves to pdb 
        for j in range(len(pdb_df)): 
            chain=pdb_df.iloc[j]['Chain']
            res_id = int(pdb_df.iloc[j]['Res ID'])
            try:
                residue=structure[0][chain][res_id] 
                center = residue.center_of_mass(geometric=True)
                cutout = ns.search(center, distance_cutoff, "A")

                atoms_to_structure(cutout, f"{pdbname}_{chain}_{res_id}_{pdb_df.iloc[j]['Res Name']}") #save as pdb
            except:
                f"residue not found in pdb {pdbname}, skipping"
        print(time.time() - vor, "time to download and process one pdb")       
        os.remove(f"{local_folder}/PDB/pdb{pdbname}.ent")

def check_atoms_protein(structure, struc_atoms): 
    """internal function. checks every atom in the entire protein for metals, undesirables"""
    for atom in struc_atoms: #check if each atom is ?atoms
        element = atom.element

        if element in ["MG", "MN", "FE", "CO", "NI", "CU", "ZN"]:
            print(f"{element} present, pdb skipped")
            #structure="skip" #there is a "continue" (past entire pdb) if output = skip in parent function
            return 0
        else:
            atomid=atom.get_full_id()
            if atomid[3][0] not in [' '] and atomid[1] != ' ':       #check for hetero residues and "None" residues
                if element == 'S': #means that it is hetero and Sulfur, exclude.
                    print(f"{atomid}, hetero sulfur. pdb skipped ")
                    return 0
                #other salt
                if element in ['CA', 'CL', 'K', 'NA']:
                    for res in structure.get_residues():
                        if res.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR", "MET"]: #MET is NTR. IS CTR EXCLUDED?? CHECK
                            d=np.linalg.norm(res.center_of_mass(geometric=True) - atom.get_coord()) #
                            if d < 3:
                                atom.get_parent().detach_child(atom.get_id())
                                print(f"salt {atom} deleted, {d} from {res}")
    return structure

def atoms_to_structure(cutout, filename): 
    """Internal function, under debugging construction. This one is responsible 
    for making my cutout info into a PDB. Last I checked it was working but not perfectly."""
    chain_dict = {}

    structure = Structure.Structure(filename)
    model = Model.Model(0)
    structure.add(model)

    for atom in cutout:
        res = atom.get_parent()  # a residue obj
        res_id = res.get_id()
        resname = res.get_resname()
        chain_id = res.get_full_id()[2]

        if chain_id not in chain_dict:
            chain = Chain.Chain(chain_id) #make new chain
            chain_dict[chain_id] = chain
            model.add(chain) #add it

        else:
            chain = chain_dict[chain_id]

        if res_id in [r.get_id() for r in chain.get_residues()]:
            residue = [r for r in chain.get_residues() if r.get_id() == res_id][0]
        else:
            residue = Residue.Residue(res_id, resname, '') #make res
            chain.add(residue)

        residue.add(atom)

    # save the pdb
    io = PDBIO()
    io.set_structure(structure)
    io.save(f"{filename}.pdb")

get_cutout(dff,10)


NameError: name 'dff' is not defined

In [75]:
!cat '11as_A_4_NTR.pdb'

ATOM      1  O   ALA A   4      13.261  37.848  26.096  1.00 33.33           O  
ATOM      2  N   ALA A   4      11.746  37.328  28.300  1.00 35.74           N  
ATOM      3  C   ALA A   4      13.388  38.646  27.027  1.00 29.73           C  
ATOM      4  CA  ALA A   4      12.364  38.679  28.168  1.00 34.19           C  
ATOM      5  CB  ALA A   4      13.027  39.086  29.501  1.00 19.69           C  
HETATM    6  O   HOH A 373      10.957  40.899  25.434  1.00 30.23           O  
ATOM      7  CA  TYR A   5      15.490  39.455  26.171  1.00 12.03           C  
ATOM      8  N   TYR A   5      14.341  39.569  27.044  1.00 21.73           N  
TER       9      TYR A   5                                                       
END   


In [None]:
"""pdb_parser = PDB.PDBParser()
struct= pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower('11as'),obsolete=False, pdir='PDB',file_format = 'pdb'))

for atom in struct.get_atoms():
    res=atom.get_parent()""""""

In [82]:
!cat '11as_B_248_GLU.pdb'

ATOM      1  OE2 GLU B 248      43.320  47.360  18.973  1.00 22.83           O  
ATOM      2  OE1 GLU B 248      44.666  48.176  17.431  1.00 30.64           O  
ATOM      3  CD  GLU B 248      44.393  47.919  18.624  1.00 17.46           C  
ATOM      4  CB  GLU B 248      46.729  47.479  19.590  1.00  2.00           C  
ATOM      5  CG  GLU B 248      45.419  48.282  19.697  1.00 26.43           C  
ATOM      6  N   GLU B 248      48.678  46.595  20.837  1.00 16.44           N  
ATOM      7  CA  GLU B 248      47.300  47.074  20.952  1.00  2.77           C  
ATOM      8  O   GLU B 248      46.534  44.819  21.239  1.00  2.00           O  
ATOM      9  C   GLU B 248      46.426  46.000  21.579  1.00  5.74           C  
ATOM     10  NH1 ARG B 299      44.734  44.465  17.380  1.00  9.59           N  
ATOM     11  N   LEU B 249      45.707  46.396  22.621  1.00  2.00           N  
ATOM     12  CA  LEU B 249      44.769  45.524  23.300  1.00  2.28           C  
TER      13      LEU B 249  

1. "highlight" protonatable residues which are in my cutouts but not my dataset. 
After? if I match 

In [None]:
dff

Unnamed: 0,index,PDB ID,Chain,Res Name,Res ID,pKa
0,947317,107l,A,NTR,1,8.12807
1,2525126,107l,A,GLU,5,3.47286
2,1774291,107l,A,ASP,10,1.21981
3,3149365,107l,A,GLU,11,4.16394
4,2152859,107l,A,LYS,16,10.31040
...,...,...,...,...,...,...
12628143,2567963,2n9a,A,NTR,1,7.25285
12628144,2939746,2n9a,A,LYS,8,10.34350
12628145,2646745,2n9a,A,CTR,11,2.79366
12628146,2632960,6uoq,A,NTR,24,7.93290


In [None]:
""""""
def titratable_res_check(neighbors):
    for atom in neighbors:
        parent = atom.get_parent()
        #if parent.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR"]:
        if parent.get_resname() in ["HIS", "CYS", "ASP"]:
            d = np.linalg.norm(parent.center_of_mass() - atom.get_coord()) 
            if d > 1.5:
                #print(parent, d)
                2

            #d = np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ]
            #print([np.mean([np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ])
            #2

    #res_list = [atom.get_parent() for atom in neighbors]
    
    #res_names_list = [residue.get_resname() for residue in res_list]
    #print(res_names)
    #atom.get_parent()

    #for i in res_names_list


def check_atoms(neighbors, residue, center):
    print('check atoms being called')
    for atom in neighbors:
        S = atom.element
        if S not in ['C', 'O', 'N']: 

            if atom.element == 'S' and atom.get_full_id()[3][0] not in [' ']:
                print(np.linalg.norm(atom.get_coord() - center), f"Å {atom} from {atom.get_parent()} to {residue.get_full_id}")
                (titratable_res_check(neighbors))
                
                #print(atom.get_full_id(), "sulfur")
            
            if atom.element in ['CA', 'CL', 'K', 'NA']:
                print(atom.get_full_id(), "investigate")

            if "HETATM" in atom.get_parent():
                print(atom.get_parent())
#next_pdb = input_df.iloc[i+1]['PDB ID']

def create_ASE_objs(residue, neighbors):
    i1=0
    i2=0
    z, n_z, pos, n_pos = [],[], [], []
    #pos = [atom.get_coord() for atom in residue] ##
    #print(len(pos))
    #|n_pos = [atom.get_coord() for atom in neighbors]
    #print(len(n_pos))
    #z

    for atom in residue:
        i1 += 1
        pos.append(atom.get_coord())
        #if atom.element in dictionary.values():
            #print("hi")
            #print([int(d['Atomic_No']) for d in dictionary.values()], "test")
        for d in dictionary.values():
            if d['Element_Symbol'] == atom.element:
                #print(d['Element_Symbol'] != atom.element)
                z.append(int(d['Atomic_No'])) ##
            if d['Element_Symbol'] != atom.element:
                #print(atom.element)
                2
                #z = [atom.get_coord() for atom in neighbors] 
                #print(atom.element)
        for atom in neighbors:
            i2 += 1
            n_pos.append(atom.get_coord())
            for d in dictionary.values():
                if d['Element_Symbol'] == atom.element:
        #print(len(n_pos))
                    n_z.append(int(d['Atomic_No']))
        #n_z = [atom.get_coord() for atom in neighbors]

    print(i1, i2)

    Z = z + n_z
    R = pos + n_pos

    print(len(z), "z") #
    print(len(n_z), "z neighbors")
    print(len(pos), "pos") #i+1
    print(len(n_pos), "pos neighbors")
    #print(z)
    #print(pos)
    atoms = Atoms(z + n_z, pos + n_pos)
    return Z


neighbors, residue = get_neighbors(dff, 20)
Z = create_ASE_objs(residue, neighbors)


##check_atoms(get_neighbors(dff, 90))

#np.array([atom.get_coord() for atom in n])



dictionary = {
    1 : {
        'Atomic_No': 1,
        'Atomic_Mass': 1,
        'Element_Name': 'Hydrogen',
        'Element_Symbol': 'H',
        'Group_No': 1,
        'Period_No': 1,
        'Relative_Atomic_Mass': 1.0079
    },
    2 : {
        'Atomic_No': 2,
        'Atomic_Mass': 4,
        'Element_Name': 'Helium',
        'Element_Symbol': 'He',
        'Group_No': 18,
        'Period_No': 1,
        'Relative_Atomic_Mass': 4.0026
    },
    3 : {
        'Atomic_No': 3,
        'Atomic_Mass': 7,
        'Element_Name': 'Lithium',
        'Element_Symbol': 'Li',
        'Group_No': 1,
        'Period_No': 2,
        'Relative_Atomic_Mass': 6.941
    },
    4 : {
        'Atomic_No': 4,
        'Atomic_Mass': 9,
        'Element_Name': 'Beryllium',
        'Element_Symbol': 'Be',
        'Group_No': 2,
        'Period_No': 2,
        'Relative_Atomic_Mass': 9.0122
    },
    5 : {
        'Atomic_No': 5,
        'Atomic_Mass': 11,
        'Element_Name': 'Boron',
        'Element_Symbol': 'B',
        'Group_No': 13,
        'Period_No': 2,
        'Relative_Atomic_Mass': 10.811
    },
    6 : {
        'Atomic_No': 6,
        'Atomic_Mass': 12,
        'Element_Name': 'Carbon',
        'Element_Symbol': 'C',
        'Group_No': 14,
        'Period_No': 2,
        'Relative_Atomic_Mass': 12.0107
    },
    7 : {
        'Atomic_No': 7,
        'Atomic_Mass': 14,
        'Element_Name': 'Nitrogen',
        'Element_Symbol': 'N',
        'Group_No': 15,
        'Period_No': 2,
        'Relative_Atomic_Mass': 14.0067
    },
    8 : {
        'Atomic_No': 8,
        'Atomic_Mass': 16,
        'Element_Name': 'Oxygen',
        'Element_Symbol': 'O',
        'Group_No': 16,
        'Period_No': 2,
        'Relative_Atomic_Mass': 15.9994
    },
    9 : {
        'Atomic_No': 9,
        'Atomic_Mass': 19,
        'Element_Name': 'Fluorine',
        'Element_Symbol': 'F',
        'Group_No': 17,
        'Period_No': 2,
        'Relative_Atomic_Mass': 18.9984
    },
    10 : {
        'Atomic_No': 10,
        'Atomic_Mass': 20,
        'Element_Name': 'Neon',
        'Element_Symbol': 'Ne',
        'Group_No': 18,
        'Period_No': 2,
        'Relative_Atomic_Mass': 20.1797
    },
  11 : {
        'Atomic_No': 11,
        'Atomic_Mass': 23,
        'Element_Name': 'Sodium',
        'Element_Symbol': 'Na',
        'Group_No': 1,
        'Period_No': 3,
        'Relative_Atomic_Mass': 22.9897
    },
    12 : {
        'Atomic_No': 12,
        'Atomic_Mass': 24,
        'Element_Name': 'Magnesium',
        'Element_Symbol': 'Mg',
        'Group_No': 2,
        'Period_No': 3,
        'Relative_Atomic_Mass': 24.305
    },
    13 : {
        'Atomic_No': 13,
        'Atomic_Mass': 27,
        'Element_Name': 'Aluminium',
        'Element_Symbol': 'Al',
        'Group_No': 13,
        'Period_No': 3,
        'Relative_Atomic_Mass': 26.9815
    },
    14 : {
        'Atomic_No': 14,
        'Atomic_Mass': 28,
        'Element_Name': 'Silicon',
        'Element_Symbol': 'Si',
        'Group_No': 14,
        'Period_No': 3,
        'Relative_Atomic_Mass': 28.0855
    },
    15 : {
        'Atomic_No': 15,
        'Atomic_Mass': 31,
        'Element_Name': 'Phosphorus',
        'Element_Symbol': 'P',
        'Group_No': 15,
        'Period_No': 3,
        'Relative_Atomic_Mass': 30.9738
    },
    16 : {
        'Atomic_No': 16,
        'Atomic_Mass': 32,
        'Element_Name': 'Sulphur',
        'Element_Symbol': 'S',
        'Group_No': 16,
        'Period_No': 3,
        'Relative_Atomic_Mass': 32.065
    },
    17 : {
        'Atomic_No': 17,
        'Atomic_Mass': 35.5,
        'Element_Name': 'Chlorine',
        'Element_Symbol': 'Cl',
        'Group_No': 17,
        'Period_No': 3,
        'Relative_Atomic_Mass': 35.453
    },
    18 : {
        'Atomic_No': 18,
        'Atomic_Mass': 40,
        'Element_Name': 'Argon',
        'Element_Symbol': 'Ar',
        'Group_No': 18,
        'Period_No': 3,
        'Relative_Atomic_Mass': 39.948
    },
    19 : {
        'Atomic_No': 19,
        'Atomic_Mass': 39,
        'Element_Name': 'Potassium',
        'Element_Symbol': 'K',
        'Group_No': 1,
        'Period_No': 4,
        'Relative_Atomic_Mass': 39.0983
    },
    20 : {
        'Atomic_No': 20,
        'Atomic_Mass': 40,
        'Element_Name': 'Calcium',
        'Element_Symbol': 'Ca',
        'Group_No': 2,
        'Period_No': 4,
        'Relative_Atomic_Mass': 40.078
    },
    21 : {
        'Atomic_No': 21,
        'Atomic_Mass': 45,
        'Element_Name': 'scandium',
        'Element_Symbol': 'Sn',
        'Group_No': 3,
        'Period_No': 4,
        'Relative_Atomic_Mass': 44.956
    },
    22 : {
        'Atomic_No': 22,
        'Atomic_Mass': 48,
        'Element_Name': 'Titanium',
        'Element_Symbol': 'Ti',
        'Group_No': 4,
        'Period_No': 4,
        'Relative_Atomic_Mass': 47.867
    },
    23 : {
        'Atomic_No': 23,
        'Atomic_Mass': 51,
        'Element_Name': 'Vanadium',
        'Element_Symbol': 'V',
        'Group_No': 5,
        'Period_No': 4,
        'Relative_Atomic_Mass': 50.942
    },
    24 : {
        'Atomic_No': 24,
        'Atomic_Mass': 52,
        'Element_Name': 'Chiromium',
        'Element_Symbol': 'Cr',
        'Group_No': 6,
        'Period_No': 4,
        'Relative_Atomic_Mass': 51.996
    },
    25 : {
        'Atomic_No': 25,
        'Atomic_Mass': 55,
        'Element_Name': 'Manganese',
        'Element_Symbol': 'Mn',
        'Group_No': 7,
        'Period_No': 4,
        'Relative_Atomic_Mass': 54.938
    },
    26 : {
        'Atomic_No': 26,
        'Atomic_Mass': 56,
        'Element_Name': 'Iron',
        'Element_Symbol': 'Fe',
        'Group_No': 8,
        'Period_No': 4,
        'Relative_Atomic_Mass': 55.845
    },
    27 : {
        'Atomic_No': 27,
        'Atomic_Mass': 59,
        'Element_Name': 'Cobalt',
        'Element_Symbol': 'Co',
        'Group_No': 9,
        'Period_No': 4,
        'Relative_Atomic_Mass': 58.933
    },
    28 : {
        'Atomic_No': 28,
        'Atomic_Mass': 59,
        'Element_Name': 'Nickel',
        'Element_Symbol': 'Ni',
        'Group_No': 10,
        'Period_No': 4,
        'Relative_Atomic_Mass': 58.693
    },
    29 : {
        'Atomic_No': 29,
        'Atomic_Mass': 64,
        'Element_Name': 'Copper',
        'Element_Symbol': 'Cu',
        'Group_No': 11,
        'Period_No': 4,
        'Relative_Atomic_Mass': 63.546
    },
    30 : {
        'Atomic_No': 30,
        'Atomic_Mass': 65,
        'Element_Name': 'Zinc',
        'Element_Symbol': 'Zn',
        'Group_No': 12,
        'Period_No': 4,
        'Relative_Atomic_Mass': 65.38
    },
    31 : {
        'Atomic_No': 31,
        'Atomic_Mass': 70,
        'Element_Name': 'Gallium',
        'Element_Symbol': 'Ga',
        'Group_No': 13,
        'Period_No': 4,
        'Relative_Atomic_Mass': 69.723
    },
    32 : {
        'Atomic_No': 32,
        'Atomic_Mass': 73,
        'Element_Name': 'Germanium',
        'Element_Symbol': 'Ge',
        'Group_No': 14,
        'Period_No': 4,
        'Relative_Atomic_Mass': 72.64
    },
    33 : {
        'Atomic_No': 33,
        'Atomic_Mass': 74.922,
        'Element_Name': 'Arsenic',
        'Element_Symbol': 'As',
        'Group_No': 15,
        'Period_No': 4,
        'Relative_Atomic_Mass': 74.922
    },
    34 : {
        'Atomic_No': 34,
        'Atomic_Mass': 79,
        'Element_Name': 'Selenium',
        'Element_Symbol': 'Se',
        'Group_No': 16,
        'Period_No': 4,
        'Relative_Atomic_Mass': 78.96
    },
    35 : {
        'Atomic_No': 35,
        'Atomic_Mass': 80,
        'Element_Name': 'Bromine',
        'Element_Symbol': 'Br',
        'Group_No': 17,
        'Period_No': 4,
        'Relative_Atomic_Mass': 80
    },
    36 : {
        'Atomic_No': 36,
        'Atomic_Mass': 84,
        'Element_Name': 'Krypton',
        'Element_Symbol': 'Kr',
        'Group_No': 18,
        'Period_No': 4,
        'Relative_Atomic_Mass': 83.798
    },
    37 : {
        'Atomic_No': 37,
        'Atomic_Mass': 85,
        'Element_Name': 'Rubidium',
        'Element_Symbol': 'Rb',
        'Group_No': 1,
        'Period_No': 5,
        'Relative_Atomic_Mass': 85.468
    },
    38 : {
        'Atomic_No': 38,
        'Atomic_Mass': 88,
        'Element_Name': 'Strontium',
        'Element_Symbol': 'Sr',
        'Group_No': 2,
        'Period_No': 5,
        'Relative_Atomic_Mass': 87.62
    },
    39 : {
        'Atomic_No': 39,
        'Atomic_Mass': 88.906,
        'Element_Name': 'Yttrium',
        'Element_Symbol': 'Y',
        'Group_No': 3,
        'Period_No': 5,
        'Relative_Atomic_Mass': 88.906
    },
    40 : {
        'Atomic_No': 40,
        'Atomic_Mass': 91,
        'Element_Name': 'Zirconium',
        'Element_Symbol': 'Zr',
        'Group_No': 4,
        'Period_No': 5,
        'Relative_Atomic_Mass': 91.224
    }
}""""""

Structure exists: 'PDB/pdb107l.ent' 
check atoms being called
11.287019 Å <Atom S2> from <Residue BME het=H_BME resseq=901 icode= > to <bound method Entity.get_full_id of <Residue GLU het=  resseq=5 icode= >>
10.745317 Å <Atom S2> from <Residue BME het=H_BME resseq=902 icode= > to <bound method Entity.get_full_id of <Residue GLU het=  resseq=5 icode= >>
('', 0, 'A', ('H_CL', 173, ' '), ('CL', ' ')) investigate
667 6003
666 z
6003 z neighbors
667 pos
6003 pos neighbors


ValueError: Array "positions" has wrong length: 6670 != 6669.