In [3]:
from Bio.PDB import *
from Bio import PDB
from torch import nn
import numpy as np
from ase import Atoms, Atom
import dask.dataframe as dd
from ordered_set import OrderedSet
import os


local_folder="/Users/jessihoernschemeyer/pKaSchNet"
pkPDB_CSV = f"{local_folder}/pkas.csv"
def read_database(path):
    """csv --> dask df"""
    #make the dask data frame from the PYPKA csv
    dk=dd.read_csv(path, delimiter=';', na_filter=False, dtype={'idcode':'category', 
                                                                    'residue_number':'uint8',
                                                                    'pk': 'float32',
                                                                    'residue_name':'category',
                                                                    'chain': 'category',
                                                                    'residue_name': 'category'
                                                                    })
                                                            
    dk=dk.rename(columns={'idcode': 'PDB ID', 'residue_number': 'Res ID', 'residue_name': 'Res Name', 'residue_number': 'Res ID', 'pk': 'pKa', 'chain' : 'Chain'}) #rename columns to match df from pkad 
    dk=dk.sort_values(['PDB ID', 'Res ID'], ascending=[True, True]) 
    dk=dk.compute() 
    dff = dk.reset_index() 

    return dff

def check_atoms_protein(structure, struc_atoms): 
    """internal function. checks every atom in the entire protein for metals, undesirables"""
    pdb_residues=[]
    for atom in struc_atoms: 
        resname, atomid=atom.get_parent().get_resname(), atom.get_full_id()[2:]
        element=atomid[2][0]

        if element in ["MG", "MN", "FE", "CO", "NI", "CU", "ZN"]:
            return 0#,0#print(f"{element} present, pdb skipped")
        
        else:
            #atomid=atom.get_full_id() #('', 0, 'B', (' ', 177, ' '), ('OH', ' '))
            if atomid[1][0] not in [' ']:
                if element == 'S': #check 4 hetero sulfur, exclude.
                    print(f"{atomid}, hetero sulfur. pdb skipped ")
                    return 0#,0
                
                if element in ['CA', 'CL', 'K', 'NA']: #other salt
                    for res in structure.get_residues():
                        if resname in ["GLU", "HIS", "ASP", "ARG", "TYR", "CYS", "LYS"]: #if the other salt is part of the residue (<3Ã¥ from geometric center), delete atom from residue
                            if np.linalg.norm(res.center_of_mass(geometric=True) - atom.get_coord()) < 3:
                                atom.get_parent().detach_child(atom.get_id()) #print(f"salt {atom} deleted, {d} from {res}")
    
    return structure#, set(pdb_residues) #('', 0, 'B', ('W', 371, ' '), ('O', ' '))

def atoms_to_structure(cutout, filename): 
    """Internal function (or not), cutout --> save to harddrive
    input: cutout: list of biopython atom objects (NOT ASE)"""
    chain_dict = {}

    structure = Structure.Structure(filename)
    model = Model.Model(0)
    structure.add(model)

    for atom in cutout:
        res = atom.get_parent()
        res_id, resname, chain_id = res.get_id(), res.get_resname(), res.get_full_id()[2]

        #make acidic GLH and ASH straight here. so change their name before saving 
        if resname == "GLU":
            resname="GLH"
            
        if resname=="ASP":
            resname="ASH"

        if resname=="HIS":
            resname="HIP"
            
        
        if chain_id not in chain_dict:
            chain = Chain.Chain(chain_id) #make new chain
            chain_dict[chain_id] = chain
            model.add(chain) #add it

        else:
            chain = chain_dict[chain_id]

        if res_id in [res.get_id() for res in chain.get_residues()]:
            residue = [res for res in chain.get_residues() if res.get_id() == res_id][0] 
        else:
            residue = Residue.Residue(res_id, resname, '') #make new res
            chain.add(residue)

        residue.add(atom)
    # save the pdb
    io = PDBIO()
    io.set_structure(structure)
    io.save(f"cuts/{filename}.pdb")

#dask_df = read_database(local_folder + pkPDB_CSV)

todo:(
    make sure changing index in merge or not is correct, as well as the modified fnames.

recut, deprotonate

to do: chek about those stupid indexes

In [4]:
#pdb_parser, pdbs = PDB.PDBParser(), list(OrderedSet(list(dask_df["PDB ID"])))
pdb_parser = PDB.PDBParser()
def generate_cutout_around_protonatable_site(residue, distance_cutoff, ns, counter, resname):
    """Residue wise resolurion. ns is neighbor search set up for the entire protein, residue is the single data point / 1 of several residues in a pdb & in pypka.
    input is one residue. output is the cutout around its titratable site, both of which can be plural e.g. his, mb asp and glu.
    residue (biopython Residue object): a single protonable residue """
    protonatable_sites = {"G":("OE1","OE2"), "A":("OD1","OD2"), "C":"SG", "L":"NZ", "H":("NE2", "ND1"), "T":"OH"}
    cuts = []
    if resname==0:
        #first atom is N and NTR
        #atoms=residue.
        center = residue['N'].get_coord()
        cut = ns.search(center, distance_cutoff, "A")
        cuts.append((counter, center, 'NTR', cut)) #counter is id!
        return cuts
    
    elif resname==1: #CTR
        try:
            center = residue['OXT'].get_coord()
            cut = ns.search(center, distance_cutoff, "A")
            cuts.append((counter, center, 'OX', cut))
        except:
            center = residue['C'].get_coord()
        cut = ns.search(center, distance_cutoff, "A")
        cuts.append((counter, center, 'X', cut)) #counter is id!
        return cuts
 
    else:
        if resname=="G": 
            sites=protonatable_sites[resname]
            atom1,atom2=residue[sites[0]],residue[sites[1]]
            if atom1.is_disordered(): 
                center, resname = atom1.get_coord(), resname + "D"
            elif atom2.is_disordered():
                center, resname = atom2.get_coord(), resname + "D"
            else:
                center=(atom1.get_coord() + atom2.get_coord()) / 2.0
            cut = ns.search(center, distance_cutoff, "A") #put ns search i n below? todo
            cuts.append((counter, center, resname, cut)) #counter is id!
            return cuts
        if resname=="A": 
            sites=protonatable_sites[resname]
            atom1,atom2=residue[sites[0]],residue[sites[1]]
            if atom1.is_disordered(): 
                center, resname = atom1.get_coord(), resname + "D"
                print(1)
            elif atom2.is_disordered():
                center, resname = atom2.get_coord(), resname + "D"
                print(2)
            else:
                center=(atom1.get_coord() + atom2.get_coord()) / 2.0
                #print(3)
            
            cut = ns.search(center, distance_cutoff, "A")
            #print("cut", cut)
            cuts.append((counter, center, resname, cut)) #counter is id!
            return cuts
        if resname=="C": 
            site=residue[protonatable_sites[resname]]
            if site.is_disordered(): 
                resname = resname + "D"
            center =site.get_coord()
            cut = ns.search(center, distance_cutoff, "A")
            cuts.append((counter, center, resname, cut)) #counter is id!
            return cuts
        if resname=="L": 
            site=residue[protonatable_sites[resname]]
            if site.is_disordered(): 
                resname = resname + "D"
            center =site.get_coord()
            cut = ns.search(center, distance_cutoff, "A")
            cuts.append((counter, center, resname, cut)) #counter is id!
            return cuts
        if resname=="T":
            site=residue[protonatable_sites[resname]]
            if site.is_disordered(): 
                resname = resname + "D"
            center =site.get_coord()
            cut = ns.search(center, distance_cutoff, "A")
            cuts.append((counter, center, resname, cut)) #counter is id!
            return cuts
              
        if resname=="H":
            sites=protonatable_sites[resname]
            atom1,atom2=residue[sites[0]],residue[sites[1]]            
            if atom1.is_disordered(): 
                resname = resname + "D"
            if atom2.is_disordered():
                resname = resname + "D"
            center1,center2=atom1.get_coord(), atom2.get_coord()
            cut1 = ns.search(center1, distance_cutoff, "A")
            cut2= ns.search(center2, distance_cutoff, "A")
            cuts.append([(counter+.1, center1, resname, cut1),(counter+.2, center2, resname, cut2)])
            #cuts.append((counter+.2, center2, resname, cut2))


    return cuts #plural because of sites with multiple sites.


In [5]:
#%%capture
import sys
np.set_printoptions(threshold=sys.maxsize)
def merge_or_not_cutouts(cutouts_apdb, distance_cutoff): #TODO: reduce dtypes #PDB WISE!
    """
    in: all of the cutouts from the pdb. returns the merged or solo cutout for each input residue of cutouts_apdb. len in = len out"""
    #protein wise ..
    dp_ids,centers,cuts, Ds_lite, cutouts, resnames,redunant_merged_is, done_pairs =[],[],[],[], [],[],[],[]

    for site in cutouts_apdb:
        if type(site)==tuple:
            #print("a site", site[0])
            dp_ids.append(site[0]) #1,2,3f,4,5,6,7,8,9.1, 9.2....
            centers.append(site[1]) 
            resnames.append(site[2])
            cuts.append(site[3])
        else:
            print("a site", site[0][0], site[1][0])
            #site1,site2=site[0],site[1]
            dp_ids.append(site[0][0]) #1,2,3,4,5,6,7,8,9.1, 9.2....
            dp_ids.append(site[1][0])

            centers.append(site[0][1]) 
            centers.append(site[1][1]) 

            resnames.append(site[0][2])
            resnames.append(site[1][2]) 

            cuts.append(site[0][3])
            cuts.append(site[1][3])
    print(resnames)
    num_residues=len(centers)
    distances = np.zeros((num_residues, num_residues))
    
    for i in range(num_residues):
        for j in range(i + 1, num_residues):
            distance = np.linalg.norm(centers[i] - centers[j]).astype(np.float32)

            if distance < distance_cutoff:
                distances[i, j] = distance.astype(np.float32)
                distances[j, i] = distance.astype(np.float32)

    #Ds lite correctly gets the nonzero entries from column? row? i of distances.
    Ds_lite = [distances[i][distances[i] != 0] for i in range(num_residues)] #nonzero entries for easier searching
    #print(len(Ds_lite))
#residuewise...
    for i in range(len(Ds_lite)): #=len IDs 
        a_residues_distance_array=Ds_lite[i]
        #print(f"residue {i}'s distance array, Ds_lite[i]", a_residues_distance_array)
        if a_residues_distance_array.any(): #if not empty
            index=i
            #tolerance = 1e-1  # Define an appropriate tolerance level
            #print((np.abs(distances[:, int(index)] - np.min(a_residues_distance_array)) < tolerance)[0])
            #closest_cutout_i = int(np.where(np.abs(distances[:, int(index)] - np.min(a_residues_distance_array)) < tolerance)[0])
            #print("where does the row of the residue's distances, equal the min of its D's lite?",distances[:,int(index)] == np.min(a_residues_distance_array))
            #print("distances column", distances[:,int(index)])
            closest_cutout_i = int(np.where((distances[int(index), :])==np.min(a_residues_distance_array))[0]) #int is unneccessary TODO
            
            print(resnames[index])
            print(resnames[closest_cutout_i])
            print(closest_cutout_i)
            print("")
            #print("closest cutout i", closest_cutout_i)
            #print("min of d array", np.min(a_residues_distance_array))
            pair_i = frozenset((index,dp_ids[closest_cutout_i]))#key #frozen set is immutable thus can be used as a dict key #also order doesnt matter, 2-1=1-2

            if not done_pairs: #if there are any yet merged
                cutout = (list(set(cuts[i] + cuts[closest_cutout_i])),resnames[i] + resnames[closest_cutout_i], (centers[i], centers[closest_cutout_i]))
                done_pairs.append(pair_i)
                redunant_merged_is.append(closest_cutout_i)
                
            else: #if there are already some generated
                if pair_i not in done_pairs: #if that mergedcut hasnt yet been made
                    cutout = (list(set(cuts[i] + cuts[closest_cutout_i])),resnames[i] + resnames[closest_cutout_i], (centers[i], centers[closest_cutout_i]))
                    done_pairs.append(pair_i)
                    redunant_merged_is.append(closest_cutout_i)

                else: #null
                    cutout = None

        else: #solo cutout
            cutout = (cuts[i], centers[i])

        cutouts.append(cutout)

    if len(cutouts) != len(dp_ids): #delete?
        return #this will make an exception if something went wrong
    
    return cutouts,redunant_merged_is #merged or solo

def get_cutout(dask_df, distance_cutoff): #"PARENT" FUNCTION
    """for each protein in dask_df (the entire PYPKA database), it iterates residue wise through the 121,294 proteins in PYPKA database and downloads
    the structure from RCSB with biopython. Then, it checks and skips the structure if metals & hetero sulfurs are present, and deletes non-sulfur
    salts from titratable residues.
    Then, for each structure residue represented in PYPKA, generates a cutout for each residue, appends the structure to cutouts_apdb"""
    #pdbname="11as"  #for now #delete
    all_fnames, all_cuts, all_centers = [],[],[]
    for i in range(19,20): #will equal len of set of pdbs in pypka, == 121294 
        cutouts_apdb, fnames, cutouts_1_datapoint, counter, pdbname, newfnames, centers_apdb = [],[], [],0, pdbs[i],[],[]
        Structure = pdb_parser.get_structure("",  PDBList().retrieve_pdb_file(str.lower(pdbname),obsolete=False, pdir='PDB',file_format = 'pdb'))
        structure= check_atoms_protein(Structure, Structure.get_atoms())
        if not structure: #skip entire pdb and all its entries in pypka db if there are undesirables in pdb
            continue
            
        ns = PDB.NeighborSearch(list(structure.get_atoms())) #set up ns , entire protein
        pdb_df = dask_df[dask_df.iloc[:, 1] == pdbname].drop(columns = ["PDB ID", "pKa"]) #make a subdf containing only residue entries which are in PYPKA (dask_df) 
        for j in range(len(pdb_df)):  #go through each residue in a pdb #each j is a datapoint!
            chain, res_id =pdb_df.iloc[j]['Chain'], int(pdb_df.iloc[j]['Res ID'])
            try: 
                residue=structure[0][chain][res_id] #a datapoint #TODO: make ID?
                pypka_resname, PDBresname = pdb_df.iloc[j]['Res Name'], residue.get_resname() #pypka error
                if pypka_resname=='NTR':
                    resname=0
                elif pypka_resname=='CTR':
                    resname,pypka_resname=1,"X" #carboxyl
                elif pypka_resname==PDBresname:
                    resname=pypka_resname[0]
                #pypka error: if not in ntr or == pdbresname, it will error and pass.
                    #elif pypka_resname == PDBresname: #ACHTUNG! this navigates the pyka error. #TODO: mail him #TODO: THIS EXCLUDES NTR AND CTR!

                cutouts_1_datapoint=generate_cutout_around_protonatable_site(residue, distance_cutoff, ns, counter, resname) #can be multiple #returns empty if disordered
                if cutouts_1_datapoint: #cutouts_1_datapoint DNE if titratable site is disordered
                    cutouts_apdb.append(*cutouts_1_datapoint) #append each residue/data point cutouts here #it will error here if disordered
                    #for _ in cutouts_1_datapoint:

                    #fnames.append(f"{pdbname}{chain}{res_id}_{pypka_resname}{counter}") 
                    if resname!="H": #TODO: check if its quicker to do "for cuts in cutouts a pdb" or if resname==H
                        #print(resname, 22)
                        fnames.append(f"{pdbname}{chain}{res_id}_{pypka_resname}{counter}") 
                    else:
                        print(resname, 22)
                        fnames.append(f"{pdbname}{chain}{res_id}_{pypka_resname}{counter + .1}") 
                        fnames.append(f"{pdbname}{chain}{res_id}_{pypka_resname}{counter + .2}") 
                        #print(resname, cutouts_1_datapoint, counter)
                    counter+=1
                else:
                    continue #pypka error

            
            except Exception as e:
                print(f"Exception caught: {e}")
                raise  # 
                #pass #means pypka res not found in PDB
        
        #os.remove(f"{local_folder}/PDB/pdb{pdbname}.ent")  
        if cutouts_apdb:
            merged_and_solos, greaterN_pair_i =merge_or_not_cutouts(cutouts_apdb, distance_cutoff)#make a merged cutout or not based off radius criteria
            print(fnames)
            for cut, fname in zip(merged_and_solos, fnames):
                if not cut:
                    continue
                
                elif len(cut)==3: #means it is a merged cutout. second argument is the pairid #it is still 1-to-1 here but ima destroy it
                    #print(greaterN_pair_i[0])
                    #print(fname, fnames[greaterN_pair_i[0]],cut[1])
                    #print(cut[1])
                    Fname="".join([fname,'_',fnames[greaterN_pair_i[0]],"_",cut[1]]) #cut1 is pairid AT, HH,...
                    newfnames.append(Fname)
                    centers_apdb.append(cut[2])
                    del greaterN_pair_i[0]
                    atoms_to_structure(cut[0], Fname) #save as pdb) #cut 

                else:
                    newfnames.append(fname)
                    centers_apdb.append(cut[1])
                    #print(2,cut)
                    atoms_to_structure(cut[0], fname) 
        all_fnames.append(newfnames)
        all_cuts.append(cutouts_apdb)
        all_centers.append(centers_apdb)

    return all_fnames, all_cuts, all_centers #[c[1] for c in cutouts_apdb] #centers


#fs, all_cuts, all_centers = get_cutout(dask_df, 5)

todo now: find the H's fo CTR and NTR >.<


make sure that TIP3p and ff14sb in same

In [None]:
#%%capture

import time
from collections import Counter
from collections import defaultdict
from itertools import chain, product
#protonate("194l", )
def amber(input_pdb):
    skript = f"""source leaprc.protein.ff14SB
    source leaprc.water.tip3p
    loadOff "/Users/jessihoernschemeyer/miniconda3/envs/cfcnn/dat/leap/lib/amino19.lib"
    mol = loadpdb "/Users/jessihoernschemeyer/pKaSchNet/cuts/{input_pdb}.pdb"
    savepdb mol "/Users/jessihoernschemeyer/pKaSchNet/prot/{input_pdb}.pdb"

    quit"""
    with open("ascript.py","w") as file: 
        file.writelines(skript)
    return


for f in fs[0]:
    amber(f)
    !tleap -s -f /Users/jessihoernschemeyer/pKaSchNet/ascript.py

In [None]:
!cat '/Users/jessihoernschemeyer/pKaSchNet/prot/199lA70_ASP22_199lA31_HIS11.2_AH.pdb'

In [None]:
protonatable_sites = {"G":"HE2", 
                      "C":"HG", 
                      "L":"HZ1", 
                      "A":"HD2",
                      "H": ("HD1","HE2"), #this needs to be fixed 
                      "T": "HH"}


#for fnames_apdb in all_fnames:
    #for fname in fnames_apdb:
        #recut(fname)
def recut(fnames_apdb, centers_apdb, distance_cutoff): #the centers come in #fnames after protonation
        
    for fname, center in zip(fnames_apdb, centers_apdb):
        struct = pdb_parser.get_structure("",  f'/Users/jessihoernschemeyer/pKaSchNet/prot/{fname}.pdb')
        ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
        if type(center)==tuple: #merged
            
            cut = set(ns.search(center[0], distance_cutoff, "A")) | set(ns.search(center[1], distance_cutoff, "A"))
            deprotonate_merged(cut,fname)
            fname.split("_")[4]
            cuts.append((fname.split("_")[4],cut))
        else: #single
            cut = ns.search(center, distance_cutoff, "A")
            deprotonate_singles(cut, fname)
        
        
        cuts.append((fname.split("_")[1], cut))

    return cuts
    
        
#for fname,centers in zip(fs,all_centers):
    
    #recut(fname, centers,5)

#def deprotonate(cut):
     
    #Hatom


protonatable_sites2 = {"G":"GLH", 
                      "C":"CYS", 
                      "L":"LYS", 
                      "A":"ASH",
                      "H": "HIP",
                      "T": "TYR"}

cut = pdb_parser.get_structure("",  local_folder + '/prot/199lA11_GLU3.pdb')

    
def deprotonate_singles(cut, f): #turn one acidic into all its others?
    res = f.split("_")[1][0]
    #print(len([atom for atom in cut.get_atoms()]))
    atom_to_delete = protonatable_sites[res]
    found_atoms = [atom for atom in cut.get_atoms() if atom.get_name() == atom_to_delete and atom.get_parent().get_resname()[0] == res] #to delete
    #dads = str(atom.get_parent()
    #found_atoms_res = [(atom.get_parent().get_resname())[0] for atom in found_atoms_all]# if ] # if atom.get_parent().get_id() == 'GLH']
    for atom in found_atoms:
        atom.get_parent().detach_child(atom_to_delete)
    #print(len([atom for atom in cut.get_atoms()]))
    #atoms_to_structure(cut, 'test')
deprotonate_singles(cut, fs[0])



def deprotonate_merged(cut,fname):
    key=fname.split("_")[4] #AT...
    keys=(key[0],key[1])
    found_atoms = [atom for atom in cut if atom.get_name() in [protonatable_sites[keys[0]],protonatable_sites[keys[1]]]] #gets all the atoms for both
    for atom in found_atoms:
        res=atom.get_parent().get_resname()
        print(2,protonatable_sites2[res[0]])


#cuts=recut(fs[0][1:2],all_centers[0][1:2], 5)
    

In [74]:

protonatable_sites2 = {"G":"GLH", 
                      "C":"CYS", 
                      "L":"LYS", 
                      "A":"ASH",
                      "H": "HIP",
                      "T": "TYR"}

protonatable_sites = {"G":"HE2", 
                      "C":"HG", 
                      "L":"HZ1", 
                      "A":"HD2",
                      "H": ("HD1","HE2"), #this needs to be fixed 
                      "T": "HH"}


cut = pdb_parser.get_structure("",  '/Users/jessihoernschemeyer/pKaSchNet/prot/199lA10_ASP2_199lA161_TYR37_AT.pdb')
#residue1
key='/Users/jessihoernschemeyer/pKaSchNet/prot/199lA10_ASP2_199lA161_TYR37_AT.pdb'.split("_")[4] #AT...
key1,key2=key[0],key[1] #A,T
res1,res2=protonatable_sites2[key1], protonatable_sites2[key2] #ASH, TYR
for atom in cut.get_atoms():
    #for 2 option residues, not HIS
    if protonatable_sites[key1] == atom.get_name(): #if hd2 == atom name
        if protonatable_sites2[key1] == atom.get_parent().get_resname():
            #detach atom
            #get the new cut with first removed
            #save
            
        if protonatable_sites2[key2] == atom.get_parent().get_resname():
    
    #res2

    
    #if atom.get_name() in protonatable_sites
#found_atoms = [atom for atom in cut.get_atoms() if atom.get_name() in [protonatable_sites[keys[0]],protonatable_sites[keys[1]]] and atom.get_parent().get_resname() in (protonatable_sites2[keys[0]], protonatable_sites2[keys[1]])]  #gets all the atoms for both
#for atom in found_atoms:
    #res=atom.get_parent().get_resname()
    #print(atom, atom.get_parent())
#found_atoms

IndentationError: expected an indented block (2145313325.py, line 29)

here

In [102]:
key='/Users/jessihoernschemeyer/pKaSchNet/prot/199lA10_ASP2_199lA161_TYR37_AT.pdb'.split("_")[4] #AT...
found_atoms=[]
key1,key2=key[0],key[1] #A,T
res1,res2, atom1, atom2=protonatable_sites2[key1], protonatable_sites2[key2], protonatable_sites[key1], protonatable_sites[key2]  #ASH, TYR 
if key1=='H':
    if key2=='H':
        print("double H",key1,key2)
    else:
        print(f"{res1} is his. {res2} is else")

elif key2=='H':
    if key1=='H':
        print("double H",key1,key2)
    else:
        print(f"{res2} is his. {res1} is else")
    


else: #not his
    for atom in cut.get_atoms():
        #for 2 option residues, not HIS
        if atom.get_name() in (atom1, atom2): #if hd2 == atom name
            
            parent_res = atom.get_parent()
            parent_res_name = parent_res.get_resname()
            found_atoms.append((parent_res, atom))

    for ion in found_atoms:
            ion_resname, ion_atomname = ion[0].get_resname(), ion[1].get_name()
            if ion_resname==res1:
                if ion_atomname==atom1: #if both are true!!
                    print(f"detach {ion_atomname} {atom1} atom1 {res1}")
                    cutt="im detached cut of resA"
                    #detach atom1
                    #get the new cut with first removed cut_resA_d = 
                    #save
                    atoms_to_iterate_thru = tuple(t[1].get_name() for t in found_atoms)
                    if atom2 in atoms_to_iterate_thru:
                        print(f"detach  {ion_atomname} {atom2} from {cutt}")
                        #save double deprotonated
            elif ion_resname==res2:
                if ion_atomname==atom2:
                    print(f"detach {ion_atomname} {atom2} atom2 {res2}")
                    #save resB_d. name in file   




detach HH HH atom2 TYR
detach HD2 HD2 atom1 ASH
detach  HD2 HH from im detached cut of resA


In [77]:
found_atoms

[]

In [14]:
fs=['199lA11_GLU3', '199lA10_ASP2_199lA161_TYR37_AT', '199lA70_ASP22_199lA31_HIS11.2_AH']
all_centers = [np.array([46.052, 26.737, 23.817]), (np.array([40.873,  9.228,  6.324]), np.array([40.835,  7.343,  4.385])), (np.array([40.873,  9.228,  6.324]), np.array([40.835,  7.343,  4.385]))]


functioniert !!!!! but i just used a merged cutout to engineer the deprotonate single for his, oops

In [None]:
cut = pdb_parser.get_structure("",  '/Users/jessihoernschemeyer/pKaSchNet/prot/199lA70_ASP22_199lA31_HIS11.2_AH.pdb')
cut2 = cut.copy()
print(len([atom for atom in cut.get_atoms()]))
print(len([atom for atom in cut2.get_atoms()]))
hisatoms = protonatable_sites["H"]
found_atoms = [atom for atom in cut.get_atoms() if atom.get_name() in hisatoms and atom.get_parent().get_resname()[0] == 'H']
residue=found_atoms[0].get_parent()
residue.detach_child(hisatoms[0])
print(len([atom for atom in cut.get_atoms()]))
#save --> HIE
residue.detach_child(hisatoms[1])
print(len([atom for atom in cut.get_atoms()]))
#save --> HIS
found_atoms = [atom for atom in cut2.get_atoms() if atom.get_name() == hisatoms[1] and atom.get_parent().get_resname()[0] == 'H'][0]
#print(len([atom for atom in cut2.get_atoms()]))
residue=found_atoms.get_parent()

residue.detach_child(hisatoms[1])
#Save --> HID

print(len([atom for atom in cut2.get_atoms()]))

In [None]:


cut = pdb_parser.get_structure("",  local_folder + '/prot/199lA11_GLU3.pdb')

def deprotonate_singles(cut, f): #turn one acidic into all its others?
    res = f.split("_")[1][0]
    print(len([atom for atom in cut.get_atoms()]))
    found_atoms = [atom for atom in cut.get_atoms() if atom.get_name() == protonatable_sites[res] and atom.get_parent().get_resname()[0] == res] #to delete
    #dads = str(atom.get_parent()
    #found_atoms_res = [(atom.get_parent().get_resname())[0] for atom in found_atoms_all]# if ] # if atom.get_parent().get_id() == 'GLH']
    for atom in found_atoms:
        atom.get_parent().detach_child('HE2')
    #print(len([atom for atom in cut.get_atoms()]))
    if res == 'H':
        hisatoms = protonatable_sites[res]
        found_atoms = [atom for atom in cut.get_atoms() if atom.get_name() in hisatoms and atom.get_parent().get_resname()[0] == res]
        
    return found_atoms
atoms = deprotonate_singles(cut, fs[2])
#a=found_atoms[0]
#a.get_parent().detach_child(a.get_id())
#for a in atoms:
    #print(a.get_parent())
atoms

In [None]:
!cat 'prot/199lA162_LYS39_199lA159_ASP36_LA.pdb'

protonatable_sites2 = {"G":"GLH", 
                      "C":"CYS", 
                      "L":"LYS", 
                      "A":"ASH",
                      "H": "HIP",
                      "T": "TYR"}

we dont take it from the hydrogen atoms for constantness, since those Hs arent in all, dont wanna bias

In [110]:
# Replace with your actual PDB file path
import time
pdb_file = f'{local_folder}/prot/097_1a0f_GLU_B_190+99_1a0f_ASP_B_193~GA.pdb.pdb'

# Parse the structure
parser = PDBParser()
structure = parser.get_structure("example", pdb_file)

# Start timing
start_time = time.time()

# Search for atom by name
found_atoms = [atom for atom in structure.get_atoms() if atom.get_name() == "OE1"]
a=found_atoms[0]
a.get_parent().detach_child(a.get_id())
# End timing
end_time = time.time()

print(f"Biopython search time: {end_time - start_time} seconds")

Biopython search time: 0.0002980232238769531 seconds


In [82]:
recut(fs[0],all_centers[0], 5)

TypeError: unhashable type: 'list'

In [27]:
fname="1a0fB190_GLU97_1a0fB193_ASP99_GA"
key = fname.split("_")[4]
res1,res2=key[0],key[1]

to do move everything to same directory

can delete from cutouts as we go

In [None]:
#acidic
protonatable_sites = ["HE2", "HG", "HZ1", "OD2", "HE2", "HD1", "HH"] # glu cys lys asp hie hid tyr
protonatable_sites = {"G":"HE2", "C":"HG", "L":"HZ1", "A":"HD2","H1": "HD1", "H2":"HE2", "T": "HH"}
in="194l_A_7_GLU-0_0.pdb"

In [None]:
!sed '/HH/d' /Users/jessihoernschemeyer/pKaSchNet/194l_A_53_TYR-9.pdb #

In [11]:
from Bio.PDB import PDBParser

# Create a PDB parser object
parser = PDBParser()

# Load the PDB file with a non-standard extension
structure = parser.get_structure('protein', '/Users/jessihoernschemeyer/pKaSchNet/cuts/og.rtf.HIE')

# Now you can work with the 'structure' object as usual
print(structure)

<Structure id=protein>




!sed '/delete_this/d' file > newfile

if a solo cutout we can use sed

In [58]:
from Bio.PDB import *
from Bio import PDB
from ase import Atoms, Atom
import torch
from matscipy.neighbours import neighbour_list as msp_neighbor_list
pdb_parser = PDB.PDBParser()
def PDB_to_schnet_input_and_names_map(cut,r):

    pos, names, B, a = [],[], [], []
    z_symbol = {'H' : 1,
        'C' : 6,
        'N' : 7,
        'O' : 8,
        'S': 16}
    #struct = pdb_parser.get_structure("",  f'/Users/jessihoernschemeyer/pKaSchNet/{file}')

    for atom in cut:
        id=atom.get_full_id()
        res, name = id[3][1], id[4][0]
        names.append(name)
        pos.append(atom.get_coord())
        a.append(atom)


    z=[z_symbol.get(name[0]) for name in names]
    #Z IS MADE FROM THE NAMES
    atoms = Atoms([z_symbol.get(name[0]) for name in names], pos)
    atoms.set_cell([[1,0,0], [0,1,0], [0,0,1]])

    d, i, j = msp_neighbor_list('dij',  atoms, [r for i in range(len(atoms))])
    inputs = {'Z':torch.tensor(z).long(), 'R':torch.tensor(d).float(), 'idx_i':torch.tensor(i).long(), 'idx_j': torch.tensor(j).long()}

  
    return inputs, [names, a]

In [None]:
struct = pdb_parser.get_structure("",  f'/Users/jessihoernschemeyer/pKaSchNet/prot_194l_A_18_ASP-2.pdb')
ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
cut1 = ns.search(cs[2], 5, "A")
ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
cut2 = ns.search(cs[1], 5, "A")

In [68]:
!cat prot_194l_A_13_LYS-1.pdb

ATOM      1  N   LEU     1     -16.599  20.501   9.740  1.00  0.00
ATOM      2  H   LEU     1     -16.579  19.602  10.200  1.00  0.00
ATOM      3  CA  LEU     1     -16.662  20.530   8.292  1.00  0.00
ATOM      4  HA  LEU     1     -17.566  21.052   7.978  1.00  0.00
ATOM      5  CB  LEU     1     -15.454  21.253   7.705  1.00  0.00
ATOM      6  HB2 LEU     1     -15.429  22.278   8.073  1.00  0.00
ATOM      7  HB3 LEU     1     -14.541  20.737   8.003  1.00  0.00
ATOM      8  CG  LEU     1     -15.558  21.262   6.183  1.00  0.00
ATOM      9  HG  LEU     1     -15.583  20.237   5.814  1.00  0.00
ATOM     10  CD1 LEU     1     -16.835  21.984   5.766  1.00  0.00
ATOM     11 HD11 LEU     1     -16.811  23.010   6.134  1.00  0.00
ATOM     12 HD12 LEU     1     -16.910  21.991   4.678  1.00  0.00
ATOM     13 HD13 LEU     1     -17.699  21.468   6.185  1.00  0.00
ATOM     14  CD2 LEU     1     -14.351  21.985   5.596  1.00  0.00
ATOM     15 HD21 LEU     1     -13.438  21.469   5.894  1.00  

In [70]:
struct = pdb_parser.get_structure("",  f'/Users/jessihoernschemeyer/pKaSchNet/prot_194l_A_18_ASP-2.pdb')
ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
cut1 = ns.search(cs[2], 5, "A")
ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
cut2 = ns.search(cs[1], 5, "A")

array([-17.552,  18.998,  11.717], dtype=float32)

In [108]:
%%capture
!cat 194l_A_18_ASP-2.pdb
!cat 194l_A_13_LYS-1.pdb
struct = pdb_parser.get_structure("",  f'/Users/jessihoernschemeyer/pKaSchNet/prot_194l_A_18_ASP-2.pdb')


ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
cut1 = ns.search(cs[2], 5, "A")
input1, extras1 = PDB_to_schnet_input_and_names_map(cut1,6)

struct = pdb_parser.get_structure("",  f'/Users/jessihoernschemeyer/pKaSchNet/prot_194l_A_13_LYS-1.pdb')

len([atom for atom in struct.get_atoms()])

ns = PDB.NeighborSearch(list(struct.get_atoms())) #set up ns , entire protein
cut2 = ns.search(cs[1], 5, "A")



NameError: name 'cs' is not defined

In [111]:
a=struct.get_residues()
for b in a:
    print(b['N'])

<Atom N>
<Atom N>
<Atom N>
<Atom N>


In [65]:
for atom in cut1:
    print(atom.get_parent())
print("")
for atom in cut2:
    print(atom.get_parent())

<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue LYS het=  resseq=1 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASN het=  resseq=4 icode= >
<Residue ASN het=  resseq=4 icode= >
<Residue ASN het=  resseq=4 icode= >
<Residue ASN het=  resseq=4 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue ASN het=  resseq=4 icode= >
<Residue LEU het=  resseq=2 icode= >
<Residue LEU het=  resseq=2 icode= >
<Residue ASH het=  resseq=3 icode= >
<Residue LEU het=  resseq=2 icode= >
<Residue LEU het=  resseq=2 icode= >
<

In [None]:
len(cut1)

46

In [None]:
len(input1.get('Z'))

46

In [None]:
from schnetpack.representation.schnet import SchNet
from torch.nn import Sequential
from schnetpack.model import NeuralNetworkPotential
from torch import nn
from schnetpack.nn import Dense
from schnetpack.nn.radial import GaussianRBF
from schnetpack.nn.cutoff import CosineCutoff

In [None]:

torch.set_printoptions(profile="full")
weights = torch.load('tensor_dict.pth')
r=10
output_weight = torch.load('output_tensor.pth')

Model = SchNet(n_atom_basis=128, n_interactions=6, radial_basis=GaussianRBF(50, r), cutoff_fn=CosineCutoff(r))
for keys, weight in weights.items():
    left = f"Model.{keys}" 
    right = f"torch.nn.Parameter(torch.{weight})"
    execu = f"{left} = {right}"
    st = execu.replace("\n       ","")
    st2 = st.replace("representation.","")
    try:
        exec(st2)
    finally:
        right2 = f"torch.nn.Parameter({weight})"
        E=f"{left} = {right2}"
        s = execu.replace("\n       ","")
        s2 = s.replace("representation.","")
        exec(s2)

modelll=nn.Sequential(Dense(128,64), Dense(64,1))

with torch.no_grad():
    modelll[0].weight = nn.Parameter(output_weight.get('dense1_weight'))
    modelll[0].bias = nn.Parameter(output_weight.get('dense1_bias'))
    modelll[1].weight = nn.Parameter(output_weight.get('dense2_weight'))
    modelll[1].bias = nn.Parameter(output_weight.get('dense2_bias'))

#input1, extras1 = PDB_to_schnet_input_and_names_map(cut1)
#outputs1 = Model(input1)
#E1 = modelll(outputs1.get('scalar_representation'))
#Eatoms1 = [[atom for atom in extras1[1]], [e for e in E1]]
#Eatoms1_dict = dict(zip([atom for atom in extras1[1]], [e for e in E1]))

NameError: name 'torch' is not defined

In [None]:
outputs1 = Model(input1)

: 

In [None]:
r=10
input1, extras1 = PDB_to_schnet_input_and_names_map(cut1)
len(input1.get('Z'))

62

In [None]:
from schnetpack.representation.schnet import SchNet
from torch.nn import Sequential
from schnetpack.model import NeuralNetworkPotential
from torch import nn
from schnetpack.nn import Dense
from schnetpack.nn.radial import GaussianRBF
from schnetpack.nn.cutoff import CosineCutoff
torch.set_printoptions(profile="full")
weights = torch.load('tensor_dict.pth')
r=10
output_weight = torch.load('output_tensor.pth')

Model = SchNet(n_atom_basis=128, n_interactions=6, radial_basis=GaussianRBF(50, r), cutoff_fn=CosineCutoff(r))
for keys, weight in weights.items():
    left = f"Model.{keys}" 
    right = f"torch.nn.Parameter(torch.{weight})"
    execu = f"{left} = {right}"
    st = execu.replace("\n       ","")
    st2 = st.replace("representation.","")
    try:
        exec(st2)
    finally:
        right2 = f"torch.nn.Parameter({weight})"
        E=f"{left} = {right2}"
        s = execu.replace("\n       ","")
        s2 = s.replace("representation.","")
        exec(s2)

modelll=nn.Sequential(Dense(128,64), Dense(64,1))

with torch.no_grad():
    modelll[0].weight = nn.Parameter(output_weight.get('dense1_weight'))
    modelll[0].bias = nn.Parameter(output_weight.get('dense1_bias'))
    modelll[1].weight = nn.Parameter(output_weight.get('dense2_weight'))
    modelll[1].bias = nn.Parameter(output_weight.get('dense2_bias'))

input1, extras1 = PDB_to_schnet_input_and_names_map(cut1)
outputs1 = Model(input1)
E1 = modelll(outputs1.get('scalar_representation'))
Eatoms1 = [[atom for atom in extras1[1]], [e for e in E1]]
Eatoms1_dict = dict(zip([atom for atom in extras1[1]], [e for e in E1]))

Model = SchNet(n_atom_basis=128, n_interactions=6, radial_basis=GaussianRBF(50, r), cutoff_fn=CosineCutoff(r))
for keys, weight in weights.items():
    left = f"Model.{keys}" 
    right = f"torch.nn.Parameter(torch.{weight})"
    execu = f"{left} = {right}"
    st = execu.replace("\n       ","")
    st2 = st.replace("representation.","")
    try:
        exec(st2)
    finally:
        right2 = f"torch.nn.Parameter({weight})"
        E=f"{left} = {right2}"
        s = execu.replace("\n       ","")
        s2 = s.replace("representation.","")
        exec(s2)

modelll=nn.Sequential(Dense(128,64), Dense(64,1))

with torch.no_grad():
    modelll[0].weight = nn.Parameter(output_weight.get('dense1_weight'))
    modelll[0].bias = nn.Parameter(output_weight.get('dense1_bias'))
    modelll[1].weight = nn.Parameter(output_weight.get('dense2_weight'))
    modelll[1].bias = nn.Parameter(output_weight.get('dense2_bias'))

input2, _ = PDB_to_schnet_input_and_names_map(cut2)
outputs2 = Model(input2)
E2 = modelll(outputs2.get('scalar_representation'))

Eatoms2 = [[atom for atom in _[1]], [e for e in E2]]
Eatoms2_dict = dict(zip([atom for atom in _[1]], [e for e in E2]))

common_entries = set(Eatoms1[0]).intersection(set(Eatoms2[0]))
for x in list(common_entries):
    print(Eatoms2_dict.get(x), Eatoms1_dict.get(x))


NameError: name 'torch' is not defined

In [None]:
print(len(modelll.state_dict()))

NameError: name 'modelll' is not defined

In [None]:
common_entries = set(Eatoms1[0]).intersection(set(Eatoms2[0]))
for x in list(common_entries):
    print(Eatoms2_dict.get(x), Eatoms1_dict.get(x))

In [None]:
type(cut[0].get_full_id())

tuple