In [None]:
from Bio.PDB import PDBList
from Bio.PDB import *
import pandas as pd
import MDAnalysis as mda
import numpy as np
np.set_printoptions(threshold=np.inf) #1000 is default
import random
import math
from IPython.display import display, HTML
import re

In [None]:
#PANDAS DATAFRAME PRODUCTION CELL FOR PKAD

def input_df(external_file_name):
    """Load the PKAD CSV and return one row per titratable residue, plus an ID column.

    Parameters
    ----------
    external_file_name : str or path-like
        Path to the PKAD CSV. It must contain the columns "PDB ID", "Res ID",
        "Res Name" and "Expt. pKa"; every other column is ignored.

    Returns
    -------
    pandas.DataFrame
        Columns "PDB ID", "Res ID", "Res Name", "pKa" (renamed from
        "Expt. pKa") and "ID", on a fresh 0..n-1 index. "ID" has the form
        "<lower-case pdb id>-<res name>-<truncated res id>", e.g. "1a2p-ASP-8".
        Rows with a missing value in any of the four source columns are dropped.
    """
    # Read the PDB ID as text: codes such as "1e12" would otherwise be parsed as floats.
    raw = pd.read_csv(external_file_name, dtype={"PDB ID": str})

    # Keep only the needed columns and drop incomplete rows (e.g. an empty "Res ID" cell).
    # dropna() + reset_index() is used instead of dropna(ignore_index=True) so that
    # pandas versions older than 2.0 work too.
    df = (
        raw[["PDB ID", "Res ID", "Res Name", "Expt. pKa"]]
        .dropna()
        .reset_index(drop=True)
    )

    # Build the identifier column-wise instead of appending to an array row by row.
    pdb = df["PDB ID"].str.lower()
    res = df["Res Name"].astype(str)
    res_n = df["Res ID"].astype(int).astype(str)  # "Res ID" is float when the CSV had gaps
    df["ID"] = pdb + "-" + res + "-" + res_n

    df = df.rename(columns={"Expt. pKa": "pKa"})

    return df
#inputdf=input_df("/Users/jessihoernschemeyer/Desktop/Thesis/WT_pka.csv")
#display(HTML(inputdf.to_html()))

In [None]:
#pypka/pkPDB dataframe production

def input_pkPDB_df(csv_file, nrows=100):
    """Load a pKPDB/PypKa export and shape it like the frame returned for PKAD.

    Parameters
    ----------
    csv_file : str or path-like
        Path to a semicolon-delimited CSV containing the columns "idcode",
        "residue_name", "residue_number" and "pk".
    nrows : int or None, optional
        Number of data rows to read. Defaults to 100, the value that used to be
        hard-coded; pass None to read the whole file.

    Returns
    -------
    pandas.DataFrame
        Columns "PDB ID", "Res Name", "Res ID", "pKa" and "ID", where "ID" is
        "<idcode>-<residue_name>-<residue_number>" (each part converted with
        str, the idcode is not lower-cased here).
    """
    wanted = ["idcode", "residue_name", "residue_number", "pk"]

    # Read idcode as text so codes such as "1e12" are not parsed as floats.
    # .copy() makes the column subset an independent frame before a column is added.
    df = pd.read_csv(csv_file, nrows=nrows, delimiter=";", dtype={"idcode": str})[wanted].copy()

    # Build the identifier column-wise instead of appending to an array row by row.
    df["ID"] = (
        df["idcode"].astype(str)
        + "-"
        + df["residue_name"].astype(str)
        + "-"
        + df["residue_number"].astype(str)
    )

    # Rename columns to match the frame produced from PKAD.
    df = df.rename(columns={
        "idcode": "PDB ID",
        "residue_name": "Res Name",
        "residue_number": "Res ID",
        "pk": "pKa",
    })

    return df

In [None]:
#only run this cell if you want to download pdb files..

def download_pdbs(df):
    """Download one PDB-format file per distinct entry in the first column of ``df``.

    The first column of ``df`` is read as the list of PDB codes. Duplicates are
    removed through a set, so the download order is arbitrary. Each file is
    requested with ``PDBList.retrieve_pdb_file`` using ``pdir='PDB'`` and
    ``file_format='pdb'``. Nothing is returned.
    """
    # collapse the first column to its distinct values (order is not preserved)
    first_column = df.columns[0]
    unique_codes = list(set(df[first_column].tolist()))

    # fetch every distinct code
    fetcher = PDBList()
    for code in unique_codes:
        fetcher.retrieve_pdb_file(code, pdir='PDB', file_format='pdb')


# NOTE(review): this line runs whenever the cell is executed. It loads the PKAD CSV from a
# hard-coded, machine-specific path and downloads every distinct PDB code found in it.
download_pdbs(input_df("/Users/jessihoernschemeyer/Desktop/Thesis/WT_pka.csv"))

In [None]:
def mini_batches(input_df, n_rows=1, pdb_dir="/Users/jessihoernschemeyer/PDB", cutoff=10):
    """Build one atom-level "neighborhood" DataFrame per titratable residue.

    For each processed row of ``input_df`` the matching PDB file is opened with
    MDAnalysis, the atoms within ``cutoff`` of the residue are selected
    (``around <cutoff> resid <n>``), and every atom of every residue touched by
    that selection is written to a new DataFrame together with the PDB name and
    the pKa of the titratable residue.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with the columns "PDB ID", "Res ID", "pKa" and "ID", one row per
        titratable residue.
    n_rows : int or None, optional
        How many leading rows to process. Defaults to 1 (the previous hard-coded
        behaviour); None processes every row. Values larger than the frame are
        clamped, so an empty frame yields an empty result instead of raising.
    pdb_dir : str, optional
        Directory holding the ``pdb<code>.ent`` files. Defaults to the path that
        used to be hard-coded.
    cutoff : number, optional
        Distance passed to the ``around`` selection keyword. Defaults to 10.

    Returns
    -------
    (list of pandas.DataFrame, list)
        The mini batches, one per processed row, and the sorted list of distinct
        residue names seen across all of them. Each mini batch has the columns
        "PDB", "Res_Name", "Res No", "pKa", "Atom Name", "x", "y", "z", is sorted
        by residue name, and carries the row's "ID" as ``df.name`` and as the
        name of its index.
    """
    # NOTE(review): "resid N" is not restricted to a chain/segment, so in a multi-chain
    # file it presumably matches that residue number in every chain -- confirm.

    pdb_res = input_df  # local alias; inside this function the parameter shadows the input_df() loader

    if n_rows is None:
        n_batches = len(pdb_res)
    else:
        n_batches = max(0, min(int(n_rows), len(pdb_res)))

    df_list = []
    seen_resnames = set()

    for i in range(n_batches):
        row = pdb_res.iloc[i]

        # locate and load the structure
        pdbname = str.lower(row['PDB ID'])
        pdb_file = "{}/pdb{}.ent".format(pdb_dir, pdbname)
        u = mda.Universe(pdb_file)

        # PKAD residue numbers arrive as floats and are truncated; pkPDB ones are already
        # integers. The check uses the current row (it used to look at row 1 regardless).
        raw_resid = row['Res ID']
        val = math.trunc(raw_resid) if isinstance(raw_resid, float) else raw_resid
        resid_in = "resid {}".format(val)

        # atoms near the residue, then every atom of the residues those atoms belong to
        nahe = u.select_atoms('around {} {}'.format(cutoff, resid_in))
        atoms = nahe.residues.atoms

        res_name = atoms.resnames
        res_ids = atoms.resids
        atomname = atoms.names

        # split the (n_atoms, 3) coordinate array into float64 columns in one step
        coords = np.asarray(atoms.positions, dtype=float)

        df = pd.DataFrame({"PDB": pdbname,
                           "Res Name": res_name,
                           "Res No": res_ids,
                           "pKa": row['pKa'],
                           "Atom Name": atomname,
                           "x": coords[:, 0],
                           "y": coords[:, 1],
                           "z": coords[:, 2],
                           })

        # sort alphabetically by residue name and renumber the rows 0..n-1
        df = df.sort_values('Res Name')
        df = df.reset_index(drop=True)
        df = df.rename(columns={'Res Name': 'Res_Name'})

        # Label the finished frame with the identifier of the residue it belongs to.
        # This is done last because sort_values/reset_index return new frames that
        # would drop a label set earlier.
        identifier = str(row['ID'])
        df.name = identifier
        df.index.name = identifier

        df_list.append(df)
        seen_resnames.update(res_name)

    #TODO: make a check such that the number of rows in original input data frame = the length of df_list.

    return df_list, sorted(seen_resnames)

In [None]:
#pkPDB
# Load the pKPDB table. NOTE(review): hard-coded, machine-specific path.
input_df_pkPDB=input_pkPDB_df("/Users/jessihoernschemeyer/Downloads/pkas.csv")

#for Jesse (& Jessi) to download 1 pdb
# NOTE(review): the file is saved under the relative directory 'PDB', while mini_batches opens
# files under /Users/jessihoernschemeyer/PDB/, so the two only line up when the working
# directory is that home directory. "2bb7" is presumably the PDB of the first CSV row -- confirm.
pdbl = PDBList()
pdbl.retrieve_pdb_file("2bb7",pdir='PDB',file_format = 'pdb')

#run the function for the data from pkPDB
# returns the list of mini batches and the list of residue names found
df_pkPDB, resnames_pkPDB = mini_batches(input_df_pkPDB) 

#display first (only) mini batch
df_pkPDB[0]

#optional: uncomment the next line to see the full table
#display(HTML(df_pkPDB[0].to_html()))



In [None]:

#get the atom types from charmm topology file
# Path of the CHARMM topology file. NOTE(review): hard-coded, machine-specific path.
rtf = "/Users/jessihoernschemeyer/pKaSchNet/rtf/top_all36_prot.rtf"
# Read the whole file into the module-level string `text`, which is later passed to partial_charges.
with open(rtf, 'r') as file:
    text = file.read()

def _parse_rtf_atoms(text):
    """Map each ``RESI`` name in a CHARMM topology text to its ATOM records.

    Returns a dict ``{residue name: [(atom name, atom type, charge string), ...]}``.
    Text after ``!`` on a line is treated as a comment and ignored. ``PRES`` and
    ``END`` close the current residue. If a residue name is defined more than
    once, only the first definition is kept.
    """
    table = {}
    current = None  # ATOM list of the residue being read, or None when outside a kept RESI
    for line in text.splitlines():
        fields = line.split('!', 1)[0].split()
        if not fields:
            continue
        keyword = fields[0]
        if keyword == 'RESI' and len(fields) > 1:
            if fields[1] in table:
                current = None
            else:
                current = table[fields[1]] = []
        elif keyword in ('RESI', 'PRES', 'END'):
            current = None
        elif keyword == 'ATOM' and current is not None and len(fields) >= 4:
            current.append((fields[1], fields[2], fields[3]))
    return table


def partial_charges(text, df):
    """Add CHARMM atom types and partial charges to a neighborhood DataFrame.

    Parameters
    ----------
    text : str
        Full content of a CHARMM topology (rtf) file.
    df : pandas.DataFrame
        Frame with the columns "Res_Name" and "Atom Name". It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns "Atom Type" and "Partial Charge"
        filled in. Charges are kept as the strings found in the topology file.
        Atoms or residues without a matching topology entry are left as NaN.

    Each distinct residue name is looked up once in its own ``RESI`` record. The
    previous row-offset walk could skip residue types, re-parsed the file once
    per row, and matched the first mention of the name anywhere in the text.
    """
    # NOTE(review): residue and atom names are matched literally, so names that differ
    # between the structure file and the topology file stay NaN -- confirm which aliases
    # beyond CD/CD1 are wanted.

    # make sure both output columns exist (object dtype, because they hold strings)
    for column in ('Atom Type', 'Partial Charge'):
        if column not in df.columns:
            df[column] = pd.Series(np.nan, index=df.index, dtype=object)

    rtf_atoms = _parse_rtf_atoms(text)

    for name in df['Res_Name'].dropna().unique():
        records = rtf_atoms.get(str(name))
        if not records:
            continue  # residue not described in the topology file

        dict_types = {atom: atype for atom, atype, _ in records}
        dict_partials = {atom: charge for atom, _, charge in records}

        # charmm uses CD for the atom the structure files call CD1, so add an alias
        # (without overriding a CD1 the residue defines itself)
        if "CD" in dict_types:
            dict_types.setdefault("CD1", dict_types["CD"])
            dict_partials.setdefault("CD1", dict_partials["CD"])

        # map by atom name, restricted to the rows of this residue type
        mask = df['Res_Name'] == name
        atom_names = df.loc[mask, 'Atom Name']
        df.loc[mask, 'Atom Type'] = atom_names.map(dict_types)
        df.loc[mask, 'Partial Charge'] = atom_names.map(dict_partials)

    return df
# Take the first pkPDB mini batch and add atom types / partial charges to it.
# partial_charges writes its columns into the frame it is given and returns that frame,
# so dff, df_pkPDB[0] and dfpkPDB all refer to the same object.
dff=df_pkPDB[0]
dfpkPDB = partial_charges(text,dff)
# render the complete table as HTML
display(HTML(dfpkPDB.to_html()))

In [None]:
#PKAD
# Load the PKAD table. NOTE(review): hard-coded, machine-specific path.
input_df_PKAD=input_df("/Users/jessihoernschemeyer/Desktop/Thesis/WT_pka.csv")

#for Jesse to download 1 pdb
# NOTE(review): "1a2p" is presumably the PDB of the first PKAD row -- confirm.
pdbl = PDBList()
pdbl.retrieve_pdb_file("1a2p",pdir='PDB',file_format = 'pdb')

# build the mini batches, then add atom types / partial charges to the first one
# (`text` is the topology file content read in an earlier cell)
df_PKAD, resnames_PKAD = mini_batches(input_df_PKAD)
dfff=df_PKAD[0]
dfPKAD = partial_charges(text,dfff)

#df_PKAD[0]
# render the complete table as HTML
display(HTML(dfPKAD.to_html()))



In [None]:
#junk cell / ignore 


#make unique identifier 
    # NOTE(review): leftover scratch code. These lines are indented without an enclosing
    # block and use `pdb_res`, which the visible notebook only defines as a local inside
    # mini_batches, so this cell cannot run as written.
    data=pdb_res.to_numpy() #pandas to numpy
    pdb=[]
    res=[]
    randomm=[]
    rand=random.sample(range(1000,9999), len(data))
    ID=[]

    for i in range(len(data)):
        pdb=np.append(pdb,str(data[i,0]))
        res=np.append(res,str(math.trunc(data[i,1])))
        randomm=np.append(randomm,str(rand[i])) #random 4 digit number 
        ids="-".join([pdb[i],res[i],randomm[i]]) #combine with a hyphen
        ID=np.append(ID,ids) #array with the unique IDs
    
#add the unique ID to the dataframe 
    varA = 'Unique ID' #column name
    pdb_res[varA] = ID #adding the array i wanna add to df
    pdb_res
    
# NOTE(review): `pyth` and `rtf2xml` are not imported in the notebook's import cell, and
# `string` is not defined at module level in the visible notebook.
import pyth
help(pyth)
import rtf2xml
string[10].isnumeric()
def is_number(s):
    """Return True when ``float(s)`` succeeds and False otherwise.

    Anything ``float`` accepts counts as a number, including strings such as
    "1e-3", "nan" and "inf". Values ``float`` rejects with ValueError (e.g.
    "abc") or TypeError (e.g. None, a list) give False instead of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True