In [1]:
from Bio.PDB import *
from Bio import PDB
from torch import nn
import numpy as np
from ase import Atoms, Atom
import dask.dataframe as dd
from ordered_set import OrderedSet

In [2]:
local_folder="/Users/jessihoernschemeyer/pKaSchNet"
pkPDB_CSV = f"{local_folder}/pkas.csv"
#PKAD_CSV = f"{local_folder}/WT_pka.csv"

#dataframe
dk=dd.read_csv(pkPDB_CSV, delimiter=';', na_filter=False, dtype={'idcode':'category', 
                                                                  'residue_number':'uint8',
                                                                  'pk': 'float32',
                                                                  'residue_name':'category',
                                                                  'chain': 'category',
                                                                  'residue_name': 'category'
                                                                  })
                                                           
dk=dk.rename(columns={'idcode': 'PDB ID', 'residue_number': 'Res ID', 'residue_name': 'Res Name', 'residue_number': 'Res ID', 'pk': 'pKa', 'chain' : 'Chain'}) #rename columns to match df from pkad 
dk=dk.sort_values(['PDB ID', 'Res ID'], ascending=[True, True]) #sorts both
dk=dk.compute() #full pypka database
dff = dk.reset_index() #also the full db but with a reset index.

In [3]:

#%%capture
def get_neighbors(dask_df, distance_cutoff): #"PARENT" FUNCTION
    """gets pdb name from dask df. checks for undesirable atoms/pdbs. outputs atomic neighbors for a residue"""
    pdb_parser = PDB.PDBParser()
    pdbs = list(OrderedSet(list(dask_df["PDB ID"])))
    for i in range(4): #will equal len of set of pdbs in pypka, == 121294 aka all unique pdb entries #iterates per pdb in pypka!!
        pdbname = pdbs[i]
        structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(pdbname),obsolete=False, pdir='PDB',file_format = 'pdb'))

        #check the protein structure for questionable atoms, returns biopython structure object
        structure = check_atoms_protein(structure, structure.get_atoms())
        if structure == "skip": #skip entire pdb and all its entries in pypka db if there are undesirables/exclusion criteria in pdb
            continue
        
        pdb_df = dask_df[dask_df.iloc[:, 1] == pdbname].drop(columns = ["PDB ID"])
        for j in range(len(pdb_df)):
            try:
                chain=pdb_df.iloc[j]['Chain']
                res_id = int(pdb_df.iloc[j]['Res ID'])
                residue=structure[0][chain][res_id] #model0, chain, res number
                #print('success')

                #neighbor search. gets the atomic neighbors 10å from the center of geometry of target residue.
                ns = PDB.NeighborSearch(list(structure.get_atoms()))
                center = residue.center_of_mass(geometric=True)
                neighbors = ns.search(center, distance_cutoff, "A") 

                #makes the atomic information a pdb
                atoms_to_structure(neighbors, f"{pdbname}_{chain}_{res_id}_{pdb_df.iloc[j]['Res Name']}")
                #print(Struct)
                #print(neighbors)
            except:
                #print(f"skipping {pdb_df.iloc[j]['Res ID']}")
                #residue=
                pass
  
    #atoms_to_structure(pdbname, )

    return neighbors

def check_atoms_protein(structure, struc_atoms): 
    """internal function. checks every atom in the entire protein for metals, undesirables"""
    for atom in struc_atoms: #check if each atom is ?atoms
        element = atom.element

        if element in ["MG", "MN", "FE", "CO", "NI", "CU", "ZN"]:
            print(f"{element} present, pdb skipped")
            structure="skip" #IIRC will "continue" in parent function making the entire PDB skipped, because parent function iterates per pdb.
        else:
            atomid=atom.get_full_id()
            if atomid[3][0] not in [' '] and atomid[1] != ' ':       #check for hetero residues and "None" residues
                if element == 'S': #means that it is hetero and Sulfur, exclude.
                    #print(atom.get_full_id())
                    structure="skip"
                    print(f"{atomid}, hetero sulfur. pdb skipped ")
                
                #other salt
                if element in ['CA', 'CL', 'K', 'NA']:
                    for res in structure.get_residues():
                        if res.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR", "MET"]: #MET is NTR. IS CTR EXCLUDED?? CHECK
                            d=np.linalg.norm(res.center_of_mass(geometric=True) - atom.get_coord()) #
                            if d < 3:
                                #print(atom.get_full_id())
                                atom.get_parent().detach_child(atom.get_id())

                                print(f"salt {atom} deleted, {d} from {res}")
    return structure


def atoms_to_structure(neighbors, filename): 
    "internal function, under debugging construction. this one is responsible for making my cutout info into a pdb. last i checked it was working but not perfectly."
    structuree = Structure.Structure(filename)
    model = Model.Model(0)
    for atom in neighbors:
        res = atom.get_parent() #a residue
        C = res.get_full_id()[2]
        #print(C)
        #print(Residue)
        residue = PDB.Residue.Residue(res.get_id(), C, res.get_resname())
        #print(Residue.get_resname())
        residue.add(atom)
        #print(res.get_id(), residue)

        #

        chain = Chain.Chain(C)
    chain.add(residue)
    model.add(chain)
    structuree.add(model)

        #residue.add(atom)
    #chain.add(residue)
    

        #print(model.get_chains())
    #structuree.add(model)
    #print(structuree)

    #print(structuree.get_chains())
    for chain in structuree.get_chains():
        print(chain)
    #for residue in structuree.get_residues():
        #print(residue)
    #for model in structuree.get_models():
        #print(model)
        #print(residue.get_coord(), "structuree")
    io = PDBIO()
    io.set_structure(structuree)
    io.save(f"{filename}.pdb")
    print(type(structuree))

    # Create a chain object
    #chain = Chain.Chain("A")
    #model.add(chain)
    #chain.add(residue)

    #residue = Residue.Residue((' ', residue_id, ' '), residue_name, "")
    return structuree


#neighbors = get_neighbors(dff, 4)
#cutout = create_ASE_objs(residue, neighbors)


##check_atoms(get_neighbors(dff, 90))

#np.array([atom.get_coord() for atom in n])

In [4]:
n = get_neighbors(dff, 4)



Structure exists: 'PDB/pdb107l.ent' 
('', 0, 'A', ('H_BME', 901, ' '), ('S2', ' ')), skipped 
('', 0, 'A', ('H_BME', 902, ' '), ('S2', ' ')), skipped 
Structure exists: 'PDB/pdb112l.ent' 
('', 0, 'A', ('H_BME', 901, ' '), ('S2', ' ')), skipped 
('', 0, 'A', ('H_BME', 902, ' '), ('S2', ' ')), skipped 
Structure exists: 'PDB/pdb113l.ent' 
('', 0, 'A', ('H_BME', 901, ' '), ('S2', ' ')), skipped 
('', 0, 'A', ('H_BME', 902, ' '), ('S2', ' ')), skipped 
Structure exists: 'PDB/pdb11as.ent' 
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>




<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=A>
<class 'Bio.PDB.Structure.Structure'>
<Chain id=B>
<class 'Bio.PDB.St

In [52]:
n

[<Atom OE2>,
 <Atom NH1>,
 <Atom OE1>,
 <Atom CD>,
 <Atom CB>,
 <Atom CG>,
 <Atom N>,
 <Atom CA>,
 <Atom O>,
 <Atom C>,
 <Atom N>,
 <Atom CA>]

In [258]:
io = PDBIO()
io.set_structure(structure)
io.save("out.pdb")

In [75]:
!cat "11as_A_235_ASP.pdb"

ATOM      1  N     A A 236      20.291  12.217  14.460  1.00  2.00       ILE N  
TER       2        A A 236                                                       
END   


1. "highlight" protonatable residues which are in my cutouts but not my dataset. 
After? if I match 

In [303]:
pdb_parser = PDB.PDBParser()
structure = pdb_parser.get_structure("",  "11as_A_235_ASP.pdb")

ValueError: Empty file.

In [180]:
f = dff[dff.iloc[:, 1] == '107l'].drop(columns = ["PDB ID"])

print(neighbors)

In [None]:
for atom in struc.get_atoms():
    print(2)
    print(atom)

In [241]:
print(type(neighbors))

<class 'Bio.PDB.Residue.Residue'>


In [264]:
print(neighbors)

[<Atom OE2>, <Atom NH1>, <Atom OE1>, <Atom CD>, <Atom CB>, <Atom CG>, <Atom N>, <Atom CA>, <Atom O>, <Atom C>, <Atom N>, <Atom CA>]


In [183]:

def get_neighbors(dask_df, distance_cutoff):
    """gets pdb name from dask df. outputs atomic neighbors for a residue"""
    #nahe_z, _zres =np.zeros(1), np.zeros(1)
    pdb_parser = PDB.PDBParser()
    pdbs = list(OrderedSet(list(dask_df["PDB ID"])))
    for i in range(2): #len of set is 121294 aka all unique pdb entries
        pdbname = pdbs[i]
        structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(pdbname),obsolete=False, pdir='PDB',file_format = 'pdb'))
        struc_atoms = structure.get_atoms()

        #check the protein structure for questionable atoms, returns t
        structure = check_atoms_protein(structure, struc_atoms)
        if structure == "metals present, skipping pdb f{pdbname}":
            continue
        
        pdb_df = dask_df[dask_df.iloc[:, 1] == pdbname].drop(columns = ["PDB ID"])

        for j in range(len(pdb_df)):
            residue=structure[0][(pdb_df.iloc[j]['Chain'])][int(pdb_df.iloc[j]['Res ID'])] #model0, chain, res number

        #neighbor search
        ns = PDB.NeighborSearch(list(structure.get_atoms()))
        center = residue.center_of_mass(geometric=True)
        neighbors = ns.search(center, distance_cutoff, "A") #finds neighbors at level "A" (atomwise) #Bio.PDB.Atom.Atom list
        
        #check_atoms_protein(structure)
        #check_atoms(neighbors, residue, center)
        #titratable_res_check(neighbors)

        #cutout = create_ASE_objs(residue, neighbors)

    return residue, neighbors #list of atoms objects Bio.PDB.Atom.Atom

def check_atoms_protein(structure, struc_atoms):
    #print(structure.get_residues())
    #print(structure.)
    #for res in structure.get_residues():
    #atom.get_full_id()[3][0] not in [' ']
    #salt = ['CA', 'CL', 'K', 'NA']
    #pdb_parser = PDB.PDBParser()
    #structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(pdbname),obsolete=False, pdir='PDB',file_format = 'pdb'))
    #struc_atoms = structure.get_atoms()
    #exclude pdbs based on.. 1) metals anywhere

    
    for atom in struc_atoms: #check if each atom is ?atoms
        element = atom.element
        if element in ["MG", "MN", "FE", "CO", "NI", "CU", "ZN"]:
            print("simulation of deleting pdb")
            structure="error"
        else:
            atomid=atom.get_full_id()
            if atomid[3][0] not in [' '] and atomid[1] != ' ':       #check for hetero residues and "None" residues
                if element == 'S': #means that it is hetero and Sulfur, exclude.
                    #print(atom.get_full_id())
                    print("simulation of deleting pdb")
                
                #other salt
                if element in ['CA', 'CL', 'K', 'NA']:
                    for res in structure.get_residues():
                        if res.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR", "MET"]: #MET is NTR. IS CTR EXCLUDED?? CHECK
                            d=np.linalg.norm(res.center_of_mass(geometric=True) - atom.get_coord()) #
                            if d < 3:
                                #print(atom.get_full_id())
                                atom.get_parent().detach_child(atom.get_id())

                                print(f"{atom} deleted, {d} from {res}")
    return structure

                        #print(res.get_resname())
                    #res.center_of_mass(geometric=True)
                    #d = np.linalg.norm(res.center_of_mass(geometric=True) - atom.get_coord())
                #print(sorted([np.linalg.norm(res.center_of_mass(geometric=True) - atom.get_coord()) for res in structure.get_residues()]))





def titratable_res_check(neighbors):
    for atom in neighbors:
        parent = atom.get_parent()
        if parent.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR"]:

        #if parent.get_resname() in ["HIS", "CYS", "ASP"]:
            d = np.linalg.norm(parent.center_of_mass() - atom.get_coord()) 
            if d > 1.5:
                #print(parent, d)
                2

            #d = np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ]
            #print([np.mean([np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ])
            #2

    #res_list = [atom.get_parent() for atom in neighbors]
    #res_names_list = [residue.get_resname() for residue in res_list]

def check_atoms(neighbors, residue, center):
    print('check atoms being called')
    for atom in neighbors:
        S = atom.element
        if atom.get_full_id()[3][0] != ' ':
            #if S not in ['C', 'O', 'N']: 
            if atom.element == 'S' and atom.get_full_id()[3][0] not in [' ']:
                #print(atom.get_full_id())
                #print(np.linalg.norm(atom.get_coord() - center), f"Å {atom} from {atom.get_parent()} to {residue.get_full_id}")
                (titratable_res_check(neighbors))
                #print(atom.get_full_id(), "sulfur")
            
            if atom.element in ['CA', 'CL', 'K', 'NA']: #delete atom and continue
                #print(np.linalg.norm(atom.get_coord() - center), f"Å {atom} from {atom.get_parent()} to {residue.get_full_id}")
                2
            #if "HETATM" in atom.get_parent():
                #print(atom.get_parent(), "HETATM")




def create_ASE_objs(residue, neighbors):
    z, n_z, pos, n_pos = [],[], [], []
    for atom in residue:
        b=2
        pos.append(atom.get_coord())

        for d in dictionary.values():
                if d['Element_Symbol'].casefold() == atom.element.casefold():
                    z.append(int(d['Atomic_No']))

        for atom in neighbors:
            n_pos.append(atom.get_coord())

            for d in dictionary.values():
                if (d['Element_Symbol']).casefold() == atom.element.casefold():
                    n_z.append(int(d['Atomic_No']))

    cutout = Atoms(z + n_z, pos + n_pos).set_cell([[1,0,0], [0,1,0], [0,0,1]])

    return cutout


neighbors, residue = get_neighbors(dff, 20)
#cutout = create_ASE_objs(residue, neighbors)


##check_atoms(get_neighbors(dff, 90))

#np.array([atom.get_coord() for atom in n])

Structure exists: 'PDB/pdb107l.ent' 
simulation of deleting pdb
simulation of deleting pdb
Structure exists: 'PDB/pdb112l.ent' 
simulation of deleting pdb
simulation of deleting pdb


In [121]:
dff

Unnamed: 0,index,PDB ID,Chain,Res Name,Res ID,pKa
0,947317,107l,A,NTR,1,8.12807
1,2525126,107l,A,GLU,5,3.47286
2,1774291,107l,A,ASP,10,1.21981
3,3149365,107l,A,GLU,11,4.16394
4,2152859,107l,A,LYS,16,10.31040
...,...,...,...,...,...,...
12628143,2567963,2n9a,A,NTR,1,7.25285
12628144,2939746,2n9a,A,LYS,8,10.34350
12628145,2646745,2n9a,A,CTR,11,2.79366
12628146,2632960,6uoq,A,NTR,24,7.93290


In [172]:
from Bio import PDB
def get_neighbors(dask_df, distance_cutoff):
    """gets pdb name from dask df. outputs atomic neighbors for a residue"""
    nahe_z, _zres =np.zeros(1), np.zeros(1)
    pdb_parser = PDB.PDBParser()

    for i in range(2):

        #download the pdb and get it as structure
        if dask_df.iloc[i]['Res Name'] not in ["CTR", "NTR"]: #exclude ctr and ntr
            structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(dask_df.iloc[i]['PDB ID']),obsolete=False, pdir='PDB',file_format = 'pdb'))

            residue=structure[0]['A'][int(dask_df.iloc[i]['Res ID'])] #model0, chain, res number
            center = residue.center_of_mass(geometric=True)

            ns = PDB.NeighborSearch(list(structure.get_atoms()))
            neighbors = ns.search(residue.center_of_mass(geometric=True), distance_cutoff, "A") #finds neighbors at level "A" (atomwise) #Bio.PDB.Atom.Atom list

            check_atoms(neighbors, residue, center)
            titratable_res_check(neighbors)

            #get pos of residue
            for atom in residue.get_atoms(): #calculate the center of the residue. get_atoms() is an iterable 
                pos = [atom.get_coord() for atom in residue.get_atoms()]
            n_pos = np.array([atom.get_coord() for atom in neighbors])

            #get pos of vicinity residues
            for atom in neighbors:
                for d in dictionary.values():
                    if d['Element_Symbol'] == (atom.element).upper:
                        nahe_z=np.append(nahe_z, int(d['Atomic_No']))
            
            cutout = Atoms()


            n_pos = np.array([atom.get_coord() for atom in neighbors])



    return residue, neighbors #list of atoms objects Bio.PDB.Atom.Atom



            

    

def titratable_res_check(neighbors):
    for atom in neighbors:
        parent = atom.get_parent()
        #if parent.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR"]:
        if parent.get_resname() in ["HIS", "CYS", "ASP"]:
            d = np.linalg.norm(parent.center_of_mass() - atom.get_coord()) 
            if d > 1.5:
                #print(parent, d)
                2

            #d = np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ]
            #print([np.mean([np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ])
            #2

    #res_list = [atom.get_parent() for atom in neighbors]
    
    #res_names_list = [residue.get_resname() for residue in res_list]
    #print(res_names)
    #atom.get_parent()

    #for i in res_names_list


def check_atoms(neighbors, residue, center):
    print('check atoms being called')
    for atom in neighbors:
        S = atom.element
        if S not in ['C', 'O', 'N']: 

            if atom.element == 'S' and atom.get_full_id()[3][0] not in [' ']:
                print(np.linalg.norm(atom.get_coord() - center), f"Å {atom} from {atom.get_parent()} to {residue.get_full_id}")
                (titratable_res_check(neighbors))
                
                #print(atom.get_full_id(), "sulfur")
            
            if atom.element in ['CA', 'CL', 'K', 'NA']:
                print(atom.get_full_id(), "investigate")

            if "HETATM" in atom.get_parent():
                print(atom.get_parent())
#next_pdb = input_df.iloc[i+1]['PDB ID']

def create_ASE_objs(residue, neighbors):
    i1=0
    i2=0
    z, n_z, pos, n_pos = [],[], [], []
    #pos = [atom.get_coord() for atom in residue] ##
    #print(len(pos))
    #|n_pos = [atom.get_coord() for atom in neighbors]
    #print(len(n_pos))
    #z

    for atom in residue:
        i1 += 1
        pos.append(atom.get_coord())
        #if atom.element in dictionary.values():
            #print("hi")
            #print([int(d['Atomic_No']) for d in dictionary.values()], "test")
        for d in dictionary.values():
            if d['Element_Symbol'] == atom.element:
                #print(d['Element_Symbol'] != atom.element)
                z.append(int(d['Atomic_No'])) ##
            if d['Element_Symbol'] != atom.element:
                #print(atom.element)
                2
                #z = [atom.get_coord() for atom in neighbors] 
                #print(atom.element)
        for atom in neighbors:
            i2 += 1
            n_pos.append(atom.get_coord())
            for d in dictionary.values():
                if d['Element_Symbol'] == atom.element:
        #print(len(n_pos))
                    n_z.append(int(d['Atomic_No']))
        #n_z = [atom.get_coord() for atom in neighbors]

    print(i1, i2)

    Z = z + n_z
    R = pos + n_pos

    print(len(z), "z") #
    print(len(n_z), "z neighbors")
    print(len(pos), "pos") #i+1
    print(len(n_pos), "pos neighbors")
    #print(z)
    #print(pos)
    atoms = Atoms(z + n_z, pos + n_pos)
    return Z


neighbors, residue = get_neighbors(dff, 20)
Z = create_ASE_objs(residue, neighbors)


##check_atoms(get_neighbors(dff, 90))

#np.array([atom.get_coord() for atom in n])



dictionary = {
    1 : {
        'Atomic_No': 1,
        'Atomic_Mass': 1,
        'Element_Name': 'Hydrogen',
        'Element_Symbol': 'H',
        'Group_No': 1,
        'Period_No': 1,
        'Relative_Atomic_Mass': 1.0079
    },
    2 : {
        'Atomic_No': 2,
        'Atomic_Mass': 4,
        'Element_Name': 'Helium',
        'Element_Symbol': 'He',
        'Group_No': 18,
        'Period_No': 1,
        'Relative_Atomic_Mass': 4.0026
    },
    3 : {
        'Atomic_No': 3,
        'Atomic_Mass': 7,
        'Element_Name': 'Lithium',
        'Element_Symbol': 'Li',
        'Group_No': 1,
        'Period_No': 2,
        'Relative_Atomic_Mass': 6.941
    },
    4 : {
        'Atomic_No': 4,
        'Atomic_Mass': 9,
        'Element_Name': 'Beryllium',
        'Element_Symbol': 'Be',
        'Group_No': 2,
        'Period_No': 2,
        'Relative_Atomic_Mass': 9.0122
    },
    5 : {
        'Atomic_No': 5,
        'Atomic_Mass': 11,
        'Element_Name': 'Boron',
        'Element_Symbol': 'B',
        'Group_No': 13,
        'Period_No': 2,
        'Relative_Atomic_Mass': 10.811
    },
    6 : {
        'Atomic_No': 6,
        'Atomic_Mass': 12,
        'Element_Name': 'Carbon',
        'Element_Symbol': 'C',
        'Group_No': 14,
        'Period_No': 2,
        'Relative_Atomic_Mass': 12.0107
    },
    7 : {
        'Atomic_No': 7,
        'Atomic_Mass': 14,
        'Element_Name': 'Nitrogen',
        'Element_Symbol': 'N',
        'Group_No': 15,
        'Period_No': 2,
        'Relative_Atomic_Mass': 14.0067
    },
    8 : {
        'Atomic_No': 8,
        'Atomic_Mass': 16,
        'Element_Name': 'Oxygen',
        'Element_Symbol': 'O',
        'Group_No': 16,
        'Period_No': 2,
        'Relative_Atomic_Mass': 15.9994
    },
    9 : {
        'Atomic_No': 9,
        'Atomic_Mass': 19,
        'Element_Name': 'Fluorine',
        'Element_Symbol': 'F',
        'Group_No': 17,
        'Period_No': 2,
        'Relative_Atomic_Mass': 18.9984
    },
    10 : {
        'Atomic_No': 10,
        'Atomic_Mass': 20,
        'Element_Name': 'Neon',
        'Element_Symbol': 'Ne',
        'Group_No': 18,
        'Period_No': 2,
        'Relative_Atomic_Mass': 20.1797
    },
  11 : {
        'Atomic_No': 11,
        'Atomic_Mass': 23,
        'Element_Name': 'Sodium',
        'Element_Symbol': 'Na',
        'Group_No': 1,
        'Period_No': 3,
        'Relative_Atomic_Mass': 22.9897
    },
    12 : {
        'Atomic_No': 12,
        'Atomic_Mass': 24,
        'Element_Name': 'Magnesium',
        'Element_Symbol': 'Mg',
        'Group_No': 2,
        'Period_No': 3,
        'Relative_Atomic_Mass': 24.305
    },
    13 : {
        'Atomic_No': 13,
        'Atomic_Mass': 27,
        'Element_Name': 'Aluminium',
        'Element_Symbol': 'Al',
        'Group_No': 13,
        'Period_No': 3,
        'Relative_Atomic_Mass': 26.9815
    },
    14 : {
        'Atomic_No': 14,
        'Atomic_Mass': 28,
        'Element_Name': 'Silicon',
        'Element_Symbol': 'Si',
        'Group_No': 14,
        'Period_No': 3,
        'Relative_Atomic_Mass': 28.0855
    },
    15 : {
        'Atomic_No': 15,
        'Atomic_Mass': 31,
        'Element_Name': 'Phosphorus',
        'Element_Symbol': 'P',
        'Group_No': 15,
        'Period_No': 3,
        'Relative_Atomic_Mass': 30.9738
    },
    16 : {
        'Atomic_No': 16,
        'Atomic_Mass': 32,
        'Element_Name': 'Sulphur',
        'Element_Symbol': 'S',
        'Group_No': 16,
        'Period_No': 3,
        'Relative_Atomic_Mass': 32.065
    },
    17 : {
        'Atomic_No': 17,
        'Atomic_Mass': 35.5,
        'Element_Name': 'Chlorine',
        'Element_Symbol': 'Cl',
        'Group_No': 17,
        'Period_No': 3,
        'Relative_Atomic_Mass': 35.453
    },
    18 : {
        'Atomic_No': 18,
        'Atomic_Mass': 40,
        'Element_Name': 'Argon',
        'Element_Symbol': 'Ar',
        'Group_No': 18,
        'Period_No': 3,
        'Relative_Atomic_Mass': 39.948
    },
    19 : {
        'Atomic_No': 19,
        'Atomic_Mass': 39,
        'Element_Name': 'Potassium',
        'Element_Symbol': 'K',
        'Group_No': 1,
        'Period_No': 4,
        'Relative_Atomic_Mass': 39.0983
    },
    20 : {
        'Atomic_No': 20,
        'Atomic_Mass': 40,
        'Element_Name': 'Calcium',
        'Element_Symbol': 'Ca',
        'Group_No': 2,
        'Period_No': 4,
        'Relative_Atomic_Mass': 40.078
    },
    21 : {
        'Atomic_No': 21,
        'Atomic_Mass': 45,
        'Element_Name': 'scandium',
        'Element_Symbol': 'Sn',
        'Group_No': 3,
        'Period_No': 4,
        'Relative_Atomic_Mass': 44.956
    },
    22 : {
        'Atomic_No': 22,
        'Atomic_Mass': 48,
        'Element_Name': 'Titanium',
        'Element_Symbol': 'Ti',
        'Group_No': 4,
        'Period_No': 4,
        'Relative_Atomic_Mass': 47.867
    },
    23 : {
        'Atomic_No': 23,
        'Atomic_Mass': 51,
        'Element_Name': 'Vanadium',
        'Element_Symbol': 'V',
        'Group_No': 5,
        'Period_No': 4,
        'Relative_Atomic_Mass': 50.942
    },
    24 : {
        'Atomic_No': 24,
        'Atomic_Mass': 52,
        'Element_Name': 'Chiromium',
        'Element_Symbol': 'Cr',
        'Group_No': 6,
        'Period_No': 4,
        'Relative_Atomic_Mass': 51.996
    },
    25 : {
        'Atomic_No': 25,
        'Atomic_Mass': 55,
        'Element_Name': 'Manganese',
        'Element_Symbol': 'Mn',
        'Group_No': 7,
        'Period_No': 4,
        'Relative_Atomic_Mass': 54.938
    },
    26 : {
        'Atomic_No': 26,
        'Atomic_Mass': 56,
        'Element_Name': 'Iron',
        'Element_Symbol': 'Fe',
        'Group_No': 8,
        'Period_No': 4,
        'Relative_Atomic_Mass': 55.845
    },
    27 : {
        'Atomic_No': 27,
        'Atomic_Mass': 59,
        'Element_Name': 'Cobalt',
        'Element_Symbol': 'Co',
        'Group_No': 9,
        'Period_No': 4,
        'Relative_Atomic_Mass': 58.933
    },
    28 : {
        'Atomic_No': 28,
        'Atomic_Mass': 59,
        'Element_Name': 'Nickel',
        'Element_Symbol': 'Ni',
        'Group_No': 10,
        'Period_No': 4,
        'Relative_Atomic_Mass': 58.693
    },
    29 : {
        'Atomic_No': 29,
        'Atomic_Mass': 64,
        'Element_Name': 'Copper',
        'Element_Symbol': 'Cu',
        'Group_No': 11,
        'Period_No': 4,
        'Relative_Atomic_Mass': 63.546
    },
    30 : {
        'Atomic_No': 30,
        'Atomic_Mass': 65,
        'Element_Name': 'Zinc',
        'Element_Symbol': 'Zn',
        'Group_No': 12,
        'Period_No': 4,
        'Relative_Atomic_Mass': 65.38
    },
    31 : {
        'Atomic_No': 31,
        'Atomic_Mass': 70,
        'Element_Name': 'Gallium',
        'Element_Symbol': 'Ga',
        'Group_No': 13,
        'Period_No': 4,
        'Relative_Atomic_Mass': 69.723
    },
    32 : {
        'Atomic_No': 32,
        'Atomic_Mass': 73,
        'Element_Name': 'Germanium',
        'Element_Symbol': 'Ge',
        'Group_No': 14,
        'Period_No': 4,
        'Relative_Atomic_Mass': 72.64
    },
    33 : {
        'Atomic_No': 33,
        'Atomic_Mass': 74.922,
        'Element_Name': 'Arsenic',
        'Element_Symbol': 'As',
        'Group_No': 15,
        'Period_No': 4,
        'Relative_Atomic_Mass': 74.922
    },
    34 : {
        'Atomic_No': 34,
        'Atomic_Mass': 79,
        'Element_Name': 'Selenium',
        'Element_Symbol': 'Se',
        'Group_No': 16,
        'Period_No': 4,
        'Relative_Atomic_Mass': 78.96
    },
    35 : {
        'Atomic_No': 35,
        'Atomic_Mass': 80,
        'Element_Name': 'Bromine',
        'Element_Symbol': 'Br',
        'Group_No': 17,
        'Period_No': 4,
        'Relative_Atomic_Mass': 80
    },
    36 : {
        'Atomic_No': 36,
        'Atomic_Mass': 84,
        'Element_Name': 'Krypton',
        'Element_Symbol': 'Kr',
        'Group_No': 18,
        'Period_No': 4,
        'Relative_Atomic_Mass': 83.798
    },
    37 : {
        'Atomic_No': 37,
        'Atomic_Mass': 85,
        'Element_Name': 'Rubidium',
        'Element_Symbol': 'Rb',
        'Group_No': 1,
        'Period_No': 5,
        'Relative_Atomic_Mass': 85.468
    },
    38 : {
        'Atomic_No': 38,
        'Atomic_Mass': 88,
        'Element_Name': 'Strontium',
        'Element_Symbol': 'Sr',
        'Group_No': 2,
        'Period_No': 5,
        'Relative_Atomic_Mass': 87.62
    },
    39 : {
        'Atomic_No': 39,
        'Atomic_Mass': 88.906,
        'Element_Name': 'Yttrium',
        'Element_Symbol': 'Y',
        'Group_No': 3,
        'Period_No': 5,
        'Relative_Atomic_Mass': 88.906
    },
    40 : {
        'Atomic_No': 40,
        'Atomic_Mass': 91,
        'Element_Name': 'Zirconium',
        'Element_Symbol': 'Zr',
        'Group_No': 4,
        'Period_No': 5,
        'Relative_Atomic_Mass': 91.224
    }
}

Structure exists: 'PDB/pdb107l.ent' 
check atoms being called
11.287019 Å <Atom S2> from <Residue BME het=H_BME resseq=901 icode= > to <bound method Entity.get_full_id of <Residue GLU het=  resseq=5 icode= >>
10.745317 Å <Atom S2> from <Residue BME het=H_BME resseq=902 icode= > to <bound method Entity.get_full_id of <Residue GLU het=  resseq=5 icode= >>
('', 0, 'A', ('H_CL', 173, ' '), ('CL', ' ')) investigate
667 6003
666 z
6003 z neighbors
667 pos
6003 pos neighbors


ValueError: Array "positions" has wrong length: 6670 != 6669.

In [100]:
dff

Unnamed: 0,index,PDB ID,Chain,Res Name,Res ID,pKa
0,947317,107l,A,NTR,1,8.12807
1,2525126,107l,A,GLU,5,3.47286
2,1774291,107l,A,ASP,10,1.21981
3,3149365,107l,A,GLU,11,4.16394
4,2152859,107l,A,LYS,16,10.31040
...,...,...,...,...,...,...
12628143,2567963,2n9a,A,NTR,1,7.25285
12628144,2939746,2n9a,A,LYS,8,10.34350
12628145,2646745,2n9a,A,CTR,11,2.79366
12628146,2632960,6uoq,A,NTR,24,7.93290


In [48]:
residue

NameError: name 'residue' is not defined

In [43]:
"""(atom) __sub__(self, other)
 |      Calculate distance between two atoms.
 |      
 |      :param other: the other atom
 |      :type other: L{Atom}
 
 (Residue) blank means  not part of hetero-residue
 |      (or a water)"""

NameError: name 'atom' is not defined

In [39]:
from Bio import PDB#debug cell
def get_neighbors(dask_df, distance_cutoff):
    """gets pdb name from dask df. outputs atomic neighbors for a residue"""
    nahe_z, _zres =np.zeros(1), np.zeros(1)

    for i in range(1):

        #download the pdb and get it as structure
        pdb_parser = PDB.PDBParser()
        structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(dask_df.iloc[i]['PDB ID']),obsolete=False, pdir='PDB',file_format = 'pdb'))

        residue=structure[0]['A'][int(dask_df.iloc[i]['Res ID'])] #model0, chain, res number

        for atom in residue.get_atoms(): #calculate the center of the residue. get_atoms() is an iterable 
            pos = np.array([atom.get_coord() for atom in residue.get_atoms()]) 
            center = np.mean(pos, axis=0) #returns the center of the res

        ns = PDB.NeighborSearch(list(structure.get_atoms()))
        neighbors = ns.search(center, distance_cutoff, "A") #finds neighbors at level "A" (atomwise) #Bio.PDB.Atom.Atom list
        
        check_atoms(neighbors, residue, center)

        for atom in neighbors:
            S = atom.element
            
            for d in dictionary.values():
                if d['Element_Symbol'] == S:
                    nahe_z=np.append(nahe_z, int(d['Atomic_No']))


        n_pos = np.array([atom.get_coord() for atom in neighbors])


    return neighbors, center #list of atoms objects Bio.PDB.Atom.Atom








def check_atoms(neighbors, residue, center):
    for atom in neighbors:
        S = atom.element
        if S not in ['C', 'O', 'N']: 

            if atom.element == 'S' and atom.get_full_id()[3][0] not in [' ']:
                print(np.linalg.norm(atom.get_coord() - center), f"Å {atom} from {atom.get_parent()} to {residue.get_full_id}")
                #print(atom.get_full_id(), "sulfur")
            
            if atom.element in ['Ca', 'Cl', 'K', 'Na']:
                print(atom.get_full_id(), "investigate")
#next_pdb = input_df.iloc[i+1]['PDB ID']
                
type(get_neighbors(dff, 90)[0])
##check_atoms(get_neighbors(dff, 90))

#np.array([atom.get_coord() for atom in n])


def prot_res_check(vicinity):
    if vicinity_res in ["HIS", "LYS", "ARG", "ASP", "GLU"]:
        print(2)



Structure exists: 'PDB/pdb107l.ent' 
10.947641 Å <Atom S2> from <Residue BME het=H_BME resseq=901 icode= > to <bound method Entity.get_full_id of <Residue MET het=  resseq=1 icode= >>
10.019097 Å <Atom S2> from <Residue BME het=H_BME resseq=902 icode= > to <bound method Entity.get_full_id of <Residue MET het=  resseq=1 icode= >>


"Look for nearby residues that may be coordinating to the metal ion. In many cases, metal ions in protein structures are coordinated by specific amino acid residues, such as histidine, cysteine, or aspartate. If you find coordinating residues nearby, it's likely that the metal ion is part of a ligand molecule.

"In the context of a Protein Data Bank (PDB) file, a "LINK" record is used to specify a covalent bond between two atoms in the structure that are not part of the standard amino acids or nucleic acids. This can include bonds between ligands, cofactors, metal ions, or other non-standard residues."

from pdb 107l: "REMARK 500 DISTANCE CUTOFF:                                                     
REMARK 500 2.2 ANGSTROMS FOR CONTACTS NOT INVOLVING HYDROGEN ATOMS              
REMARK 500 1.6 ANGSTROMS FOR CONTACTS INVOLVING HYDROGEN ATOMS"

In [1]:


dictionary = {
    1 : {
        'Atomic_No': 1,
        'Atomic_Mass': 1,
        'Element_Name': 'Hydrogen',
        'Element_Symbol': 'H',
        'Group_No': 1,
        'Period_No': 1,
        'Relative_Atomic_Mass': 1.0079
    },
    2 : {
        'Atomic_No': 2,
        'Atomic_Mass': 4,
        'Element_Name': 'Helium',
        'Element_Symbol': 'He',
        'Group_No': 18,
        'Period_No': 1,
        'Relative_Atomic_Mass': 4.0026
    },
    3 : {
        'Atomic_No': 3,
        'Atomic_Mass': 7,
        'Element_Name': 'Lithium',
        'Element_Symbol': 'Li',
        'Group_No': 1,
        'Period_No': 2,
        'Relative_Atomic_Mass': 6.941
    },
    4 : {
        'Atomic_No': 4,
        'Atomic_Mass': 9,
        'Element_Name': 'Beryllium',
        'Element_Symbol': 'Be',
        'Group_No': 2,
        'Period_No': 2,
        'Relative_Atomic_Mass': 9.0122
    },
    5 : {
        'Atomic_No': 5,
        'Atomic_Mass': 11,
        'Element_Name': 'Boron',
        'Element_Symbol': 'B',
        'Group_No': 13,
        'Period_No': 2,
        'Relative_Atomic_Mass': 10.811
    },
    6 : {
        'Atomic_No': 6,
        'Atomic_Mass': 12,
        'Element_Name': 'Carbon',
        'Element_Symbol': 'C',
        'Group_No': 14,
        'Period_No': 2,
        'Relative_Atomic_Mass': 12.0107
    },
    7 : {
        'Atomic_No': 7,
        'Atomic_Mass': 14,
        'Element_Name': 'Nitrogen',
        'Element_Symbol': 'N',
        'Group_No': 15,
        'Period_No': 2,
        'Relative_Atomic_Mass': 14.0067
    },
    8 : {
        'Atomic_No': 8,
        'Atomic_Mass': 16,
        'Element_Name': 'Oxygen',
        'Element_Symbol': 'O',
        'Group_No': 16,
        'Period_No': 2,
        'Relative_Atomic_Mass': 15.9994
    },
    9 : {
        'Atomic_No': 9,
        'Atomic_Mass': 19,
        'Element_Name': 'Fluorine',
        'Element_Symbol': 'F',
        'Group_No': 17,
        'Period_No': 2,
        'Relative_Atomic_Mass': 18.9984
    },
    10 : {
        'Atomic_No': 10,
        'Atomic_Mass': 20,
        'Element_Name': 'Neon',
        'Element_Symbol': 'Ne',
        'Group_No': 18,
        'Period_No': 2,
        'Relative_Atomic_Mass': 20.1797
    },
  11 : {
        'Atomic_No': 11,
        'Atomic_Mass': 23,
        'Element_Name': 'Sodium',
        'Element_Symbol': 'Na',
        'Group_No': 1,
        'Period_No': 3,
        'Relative_Atomic_Mass': 22.9897
    },
    12 : {
        'Atomic_No': 12,
        'Atomic_Mass': 24,
        'Element_Name': 'Magnesium',
        'Element_Symbol': 'Mg',
        'Group_No': 2,
        'Period_No': 3,
        'Relative_Atomic_Mass': 24.305
    },
    13 : {
        'Atomic_No': 13,
        'Atomic_Mass': 27,
        'Element_Name': 'Aluminium',
        'Element_Symbol': 'Al',
        'Group_No': 13,
        'Period_No': 3,
        'Relative_Atomic_Mass': 26.9815
    },
    14 : {
        'Atomic_No': 14,
        'Atomic_Mass': 28,
        'Element_Name': 'Silicon',
        'Element_Symbol': 'Si',
        'Group_No': 14,
        'Period_No': 3,
        'Relative_Atomic_Mass': 28.0855
    },
    15 : {
        'Atomic_No': 15,
        'Atomic_Mass': 31,
        'Element_Name': 'Phosphorus',
        'Element_Symbol': 'P',
        'Group_No': 15,
        'Period_No': 3,
        'Relative_Atomic_Mass': 30.9738
    },
    16 : {
        'Atomic_No': 16,
        'Atomic_Mass': 32,
        'Element_Name': 'Sulphur',
        'Element_Symbol': 'S',
        'Group_No': 16,
        'Period_No': 3,
        'Relative_Atomic_Mass': 32.065
    },
    17 : {
        'Atomic_No': 17,
        'Atomic_Mass': 35.5,
        'Element_Name': 'Chlorine',
        'Element_Symbol': 'Cl',
        'Group_No': 17,
        'Period_No': 3,
        'Relative_Atomic_Mass': 35.453
    },
    18 : {
        'Atomic_No': 18,
        'Atomic_Mass': 40,
        'Element_Name': 'Argon',
        'Element_Symbol': 'Ar',
        'Group_No': 18,
        'Period_No': 3,
        'Relative_Atomic_Mass': 39.948
    },
    19 : {
        'Atomic_No': 19,
        'Atomic_Mass': 39,
        'Element_Name': 'Potassium',
        'Element_Symbol': 'K',
        'Group_No': 1,
        'Period_No': 4,
        'Relative_Atomic_Mass': 39.0983
    },
    20 : {
        'Atomic_No': 20,
        'Atomic_Mass': 40,
        'Element_Name': 'Calcium',
        'Element_Symbol': 'Ca',
        'Group_No': 2,
        'Period_No': 4,
        'Relative_Atomic_Mass': 40.078
    },
    21 : {
        'Atomic_No': 21,
        'Atomic_Mass': 45,
        'Element_Name': 'scandium',
        'Element_Symbol': 'Sn',
        'Group_No': 3,
        'Period_No': 4,
        'Relative_Atomic_Mass': 44.956
    },
    22 : {
        'Atomic_No': 22,
        'Atomic_Mass': 48,
        'Element_Name': 'Titanium',
        'Element_Symbol': 'Ti',
        'Group_No': 4,
        'Period_No': 4,
        'Relative_Atomic_Mass': 47.867
    },
    23 : {
        'Atomic_No': 23,
        'Atomic_Mass': 51,
        'Element_Name': 'Vanadium',
        'Element_Symbol': 'V',
        'Group_No': 5,
        'Period_No': 4,
        'Relative_Atomic_Mass': 50.942
    },
    24 : {
        'Atomic_No': 24,
        'Atomic_Mass': 52,
        'Element_Name': 'Chiromium',
        'Element_Symbol': 'Cr',
        'Group_No': 6,
        'Period_No': 4,
        'Relative_Atomic_Mass': 51.996
    },
    25 : {
        'Atomic_No': 25,
        'Atomic_Mass': 55,
        'Element_Name': 'Manganese',
        'Element_Symbol': 'Mn',
        'Group_No': 7,
        'Period_No': 4,
        'Relative_Atomic_Mass': 54.938
    },
    26 : {
        'Atomic_No': 26,
        'Atomic_Mass': 56,
        'Element_Name': 'Iron',
        'Element_Symbol': 'Fe',
        'Group_No': 8,
        'Period_No': 4,
        'Relative_Atomic_Mass': 55.845
    },
    27 : {
        'Atomic_No': 27,
        'Atomic_Mass': 59,
        'Element_Name': 'Cobalt',
        'Element_Symbol': 'Co',
        'Group_No': 9,
        'Period_No': 4,
        'Relative_Atomic_Mass': 58.933
    },
    28 : {
        'Atomic_No': 28,
        'Atomic_Mass': 59,
        'Element_Name': 'Nickel',
        'Element_Symbol': 'Ni',
        'Group_No': 10,
        'Period_No': 4,
        'Relative_Atomic_Mass': 58.693
    },
    29 : {
        'Atomic_No': 29,
        'Atomic_Mass': 64,
        'Element_Name': 'Copper',
        'Element_Symbol': 'Cu',
        'Group_No': 11,
        'Period_No': 4,
        'Relative_Atomic_Mass': 63.546
    },
    30 : {
        'Atomic_No': 30,
        'Atomic_Mass': 65,
        'Element_Name': 'Zinc',
        'Element_Symbol': 'Zn',
        'Group_No': 12,
        'Period_No': 4,
        'Relative_Atomic_Mass': 65.38
    },
    31 : {
        'Atomic_No': 31,
        'Atomic_Mass': 70,
        'Element_Name': 'Gallium',
        'Element_Symbol': 'Ga',
        'Group_No': 13,
        'Period_No': 4,
        'Relative_Atomic_Mass': 69.723
    },
    32 : {
        'Atomic_No': 32,
        'Atomic_Mass': 73,
        'Element_Name': 'Germanium',
        'Element_Symbol': 'Ge',
        'Group_No': 14,
        'Period_No': 4,
        'Relative_Atomic_Mass': 72.64
    },
    33 : {
        'Atomic_No': 33,
        'Atomic_Mass': 74.922,
        'Element_Name': 'Arsenic',
        'Element_Symbol': 'As',
        'Group_No': 15,
        'Period_No': 4,
        'Relative_Atomic_Mass': 74.922
    },
    34 : {
        'Atomic_No': 34,
        'Atomic_Mass': 79,
        'Element_Name': 'Selenium',
        'Element_Symbol': 'Se',
        'Group_No': 16,
        'Period_No': 4,
        'Relative_Atomic_Mass': 78.96
    },
    35 : {
        'Atomic_No': 35,
        'Atomic_Mass': 80,
        'Element_Name': 'Bromine',
        'Element_Symbol': 'Br',
        'Group_No': 17,
        'Period_No': 4,
        'Relative_Atomic_Mass': 80
    },
    36 : {
        'Atomic_No': 36,
        'Atomic_Mass': 84,
        'Element_Name': 'Krypton',
        'Element_Symbol': 'Kr',
        'Group_No': 18,
        'Period_No': 4,
        'Relative_Atomic_Mass': 83.798
    },
    37 : {
        'Atomic_No': 37,
        'Atomic_Mass': 85,
        'Element_Name': 'Rubidium',
        'Element_Symbol': 'Rb',
        'Group_No': 1,
        'Period_No': 5,
        'Relative_Atomic_Mass': 85.468
    },
    38 : {
        'Atomic_No': 38,
        'Atomic_Mass': 88,
        'Element_Name': 'Strontium',
        'Element_Symbol': 'Sr',
        'Group_No': 2,
        'Period_No': 5,
        'Relative_Atomic_Mass': 87.62
    },
    39 : {
        'Atomic_No': 39,
        'Atomic_Mass': 88.906,
        'Element_Name': 'Yttrium',
        'Element_Symbol': 'Y',
        'Group_No': 3,
        'Period_No': 5,
        'Relative_Atomic_Mass': 88.906
    },
    40 : {
        'Atomic_No': 40,
        'Atomic_Mass': 91,
        'Element_Name': 'Zirconium',
        'Element_Symbol': 'Zr',
        'Group_No': 4,
        'Period_No': 5,
        'Relative_Atomic_Mass': 91.224
    }
}

In [12]:
L = [d['Element_Symbol'] for d in dictionary.values()]
print(L)
a = [x.upper() for x in L]
print(a)


['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sn', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr']
['H', 'HE', 'LI', 'BE', 'B', 'C', 'N', 'O', 'F', 'NE', 'NA', 'MG', 'AL', 'SI', 'P', 'S', 'CL', 'AR', 'K', 'CA', 'SN', 'TI', 'V', 'CR', 'MN', 'FE', 'CO', 'NI', 'CU', 'ZN', 'GA', 'GE', 'AS', 'SE', 'BR', 'KR', 'RB', 'SR', 'Y', 'ZR']


In [None]:
from Bio import PDB
def get_neighbors(dask_df, distance_cutoff):
    """gets pdb name from dask df. outputs atomic neighbors for a residue"""
    nahe_z, _zres =np.zeros(1), np.zeros(1)
    pdb_parser = PDB.PDBParser()

    for i in range(2):

        #download the pdb and get it as stru∏cture
        if dask_df.iloc[i]['Res Name'] not in ["CTR", "NTR"]: #exclude ctr and ntr
            structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(dask_df.iloc[i]['PDB ID']),obsolete=False, pdir='PDB',file_format = 'pdb'))

            residue=structure[0]['A'][int(dask_df.iloc[i]['Res ID'])] #model0, chain, res number
            center = residue.center_of_mass(geometric=True)

            ns = PDB.NeighborSearch(list(structure.get_atoms()))
            neighbors = ns.search(residue.center_of_mass(geometric=True), distance_cutoff, "A") #finds neighbors at level "A" (atomwise) #Bio.PDB.Atom.Atom list

            check_atoms(neighbors, residue, center)
            titratable_res_check(neighbors)

            #get pos of residue

            #for atom in residue.get_atoms(): #calculate the center of the residue. get_atoms() is an iterable 
                #pos = [atom.get_coord() for atom in residue.get_atoms()]
            #n_pos = np.array([atom.get_coord() for atom in neighbors])

            #get pos of vicinity residues
            #for atom in neighbors:
                #for d in dictionary.values():
                    #if d['Element_Symbol'] == (atom.element).upper:
                        #nahe_z=np.append(nahe_z, int(d['Atomic_No']))
            
            cutout = Atoms()


            #n_pos = np.array([atom.get_coord() for atom in neighbors])



    return residue, neighbors #list of atoms objects Bio.PDB.Atom.Atom



            

    

def titratable_res_check(neighbors):
    for atom in neighbors:
        parent = atom.get_parent()
        #if parent.get_resname() in ["HIS", "CYS", "LYS", "ARG", "ASP", "GLU", "TYR"]:
        if parent.get_resname() in ["HIS", "CYS", "ASP"]:
            d = np.linalg.norm(parent.center_of_mass() - atom.get_coord()) 
            if d > 1.5:
                #print(parent, d)
                2

            #d = np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ]
            #print([np.mean([np.linalg.norm(parent.center_of_mass() - atom.get_coord())) for ])
            #2

    #res_list = [atom.get_parent() for atom in neighbors]
    
    #res_names_list = [residue.get_resname() for residue in res_list]
    #print(res_names)
    #atom.get_parent()

    #for i in res_names_list


def check_atoms(neighbors, residue, center):
    print('check atoms being called')
    for atom in neighbors:
        S = atom.element
        if S not in ['C', 'O', 'N']: 

            if atom.element == 'S' and atom.get_full_id()[3][0] not in [' ']:
                print(np.linalg.norm(atom.get_coord() - center), f"Å {atom} from {atom.get_parent()} to {residue.get_full_id}")
                (titratable_res_check(neighbors))
                
                #print(atom.get_full_id(), "sulfur")
            
            if atom.element in ['CA', 'CL', 'K', 'NA']:
                print(atom.get_full_id(), "investigate")

            if "HETATM" in atom.get_parent():
                print(atom.get_parent())
#next_pdb = input_df.iloc[i+1]['PDB ID']

def create_ASE_objs(residue, neighbors):
    i1=[]
    i2=[]
    z, n_z, pos, n_pos = [],[], [], []
    b=2
    
    #pos = [atom.get_coord() for atom in residue] ##
    #print(len(pos))
    #|n_pos = [atom.get_coord() for atom in neighbors]
    #print(len(n_pos))
    #z
    z, n_z, pos, n_pos = [],[], [], []
    print(z, n_z, pos, n_pos)
    for atom in residue:
        b=2
        pos.append(atom.get_coord())

        for d in dictionary.values():
                if d['Element_Symbol'].casefold() == atom.element.casefold():
        #print(len(n_pos))
                    i1.append(b)
                    z.append(int(d['Atomic_No']))

        #if atom.element in dictionary.values():
            #print("hi")
            #print([int(d['Atomic_No']) for d in dictionary.values()], "test")
        #for d in dictionary.values():
            #dic = list(d['Element_Symbol'].upper())
            #print(dic)
            #if (atom.element) not in a:
                #print(atom.element)
            



        for atom in neighbors:
            n_pos.append(atom.get_coord())

            for d in dictionary.values():
                if (d['Element_Symbol']).casefold() == atom.element.casefold():
                    i2.append(b)
        #print(len(n_pos))
                    n_z.append(int(d['Atomic_No']))
        #n_z = [atom.get_coord() for atom in neighbors]

    print(len(i1), len(i2))

    #Z = z + n_z    
    #R = pos + n_pos

    print(len(z), "z") #
    print(z)
    print(len(n_z), "z neighbors")
    print(len(pos), "pos") #i+1
    print(len(n_pos), "pos neighbors")
    #print(z)
    #print(pos)
    atoms = Atoms(z + n_z, pos + n_pos)
    Z=[]
    return Z


neighbors, residue = get_neighbors(dff, 20)
Z = create_ASE_objs(residue, neighbors)


##check_atoms(get_neighbors(dff, 90))

#np.array([atom.get_coord() for atom in n])