In [1]:
import torch
from torch import jit, Tensor
from aimnet.data import SizeGroupedDataset
from aimnet.config import build_module, load_yaml
from aimnet.modules import Forces
import yaml
import math
import numpy as np
from torch.optim.lr_scheduler import StepLR

In [None]:
def fetch_pdb(pdbid):
    """Download a PDB entry and save it as ``<pdbid>.pdb`` in the current directory.

    Parameters
    ----------
    pdbid : str
        Entry identifier. It is interpolated verbatim into the download URL
        and used as the stem of the output file name.

    Returns
    -------
    None
        The downloaded text is written to ``<pdbid>.pdb`` as a side effect.

    Raises
    ------
    Any exception raised by ``urlopen`` for a failed request, or
    ``UnicodeDecodeError`` if the payload is not valid UTF-8.
    """
    url = 'http://www.rcsb.org/pdb/files/%s.pdb' % pdbid
    print("Downloading : %s" % url)

    response = urlopen(url)
    try:
        contents = response.read().decode('utf-8')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()

    outfile = pdbid + ".pdb"
    with open(outfile, 'w') as f2:
        # One write for the whole document; iterating over the string would
        # emit it one character at a time.
        f2.write(contents)

In [None]:
# PDB entry processed by the rest of the notebook; fetch_pdb writes it to
# "<pdbid>.pdb" in the current directory.
pdbid="6XSP"
fetch_pdb(pdbid)

In [None]:
# Echo the active PDB id as the cell output.
pdbid

In [None]:
# Start from an empty "<pdbid>_intermediate_files" directory under workdir:
# any previous copy is removed before the directory is created again.
interdir = Path(workdir) / (pdbid + "_intermediate_files")
if interdir.is_dir():
    shutil.rmtree(interdir)
interdir.mkdir()

In [None]:
## Define the protonation state of amino acids using pkaani

In [None]:
cmd='pkaani -i '+pdbid
_o=!{cmd}

In [None]:
class protonation:
    """Choose non-default Amber residue names from a pkaani pKa log.

    The log is expected to hold one header line followed by rows of the form
    ``RES-NUM  CHAIN  pKa`` (for example ``TYR-3  A  10.85``).
    """

    def __init__(self, pkaanilog):
        """Remember the path of the pKa log; the file is read by ``decision``."""
        self.logfile = pkaanilog

    def get_amber_res_type(self, residue, pka):
        """Return the Amber residue name to use for ``residue`` at this pKa.

        Reference pKa values in water are GLU 4.5, ASP 3.8, HIS 6.5 and
        LYS 10.5. A tolerance of about 0.5 is applied for the acids and HIS,
        and a more relaxed cutoff is used for LYS. TYR (10.0) has no separate
        Amber type and is therefore never renamed.

        Parameters
        ----------
        residue : str
            Three-letter residue name as found in the log.
        pka : float
            Predicted pKa of that residue.

        Returns
        -------
        str
            ``GLH``, ``ASH``, ``HIP`` or ``LYN`` when the corresponding cutoff
            is crossed, otherwise ``residue`` unchanged.
        """
        if residue == 'GLU' and pka > 5.0:
            return 'GLH'
        if residue == 'ASP' and pka > 4.3:
            return 'ASH'
        if residue == 'HIS' and pka > 7.0:
            return 'HIP'
        if residue == 'LYS' and pka < 9.0:
            # Deliberately more relaxed cutoff for LYS.
            return 'LYN'
        return residue

    def decision(self):
        """Parse the log and collect every residue whose Amber name changes.

        Returns
        -------
        dict
            Maps ``"RES-CHAIN-RESNO"`` to ``[amber_name, pka]`` for each row
            whose Amber name differs from the residue name in the log.

        Raises
        ------
        OSError
            If the log file cannot be opened.
        ValueError
            If a pKa column cannot be converted to ``float``.
        """
        newres = dict()
        # Read-only access is enough; 'r+' would fail on write-protected logs.
        with open(self.logfile, 'r') as fd:
            next(fd, None)  # skip the "Residue Chain pKa" header line
            for line in fd:
                row = line.split()
                if len(row) < 3:
                    # Blank or truncated row: nothing to decide.
                    continue

                # Split on the first dash only so that a negative residue
                # number ("ASP--3") keeps its sign.
                resname, dash, resno = row[0].partition('-')
                if not dash:
                    continue
                chain = row[1]
                pka = float(row[2])

                restyp = self.get_amber_res_type(resname, pka)
                if restyp != resname:
                    key = resname + '-' + chain + '-' + resno
                    newres[key] = [restyp, pka]

        return newres

In [None]:
# Decide protonation states from the pkaani log and print them as a table.
decider = protonation('%s_pka.log' % pdbid)
protonations = decider.decision()
print('%-10s %12s %6s'%("Res-Ch-Rno","Amber_name","pka"))
for key, values in protonations.items():
    print('%-10s %10s %8.2f' % (key, values[0], values[1]))

#clean up
# NOTE(review): the existence check / mkdir is relative to the current
# directory while the moves target workdir; this presumably relies on both
# being the same directory -- confirm.
if not os.path.exists('pkaani_results'):
    os.mkdir('pkaani_results')
for _suffix in ('_pka.log', '_pkaani.pdb'):
    srcf = os.path.join(workdir, pdbid + _suffix)
    destf = os.path.join(workdir + "/pkaani_results", pdbid + _suffix)
    os.rename(srcf, destf)

In [None]:
## Check for disulfide bridges

In [None]:
def ssbonds(pdbin):
    """Collect the SSBOND records of a PDB file.

    Records are split on whitespace, e.g.
    ``SSBOND   1 CYS A    6    CYS A  127``.

    Parameters
    ----------
    pdbin : str
        Path of the PDB file to scan.

    Returns
    -------
    dict
        Maps a running index starting at 1 to
        ``[resno1, chain1, resno2, chain2]`` (all strings), one entry per
        SSBOND record. Empty when the file has no such records.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    ssbond = dict()
    # Read-only access is enough; 'r+' would fail on write-protected files.
    with open(pdbin, 'r') as fd:
        for line in fd:
            row = line.split()
            # Keep only lines whose record name is SSBOND (a REMARK that
            # merely mentions it is ignored) and that carry both partners.
            if len(row) < 8 or row[0] != 'SSBOND':
                continue
            ssbond[len(ssbond) + 1] = [row[4], row[3], row[7], row[6]]

    return ssbond

In [None]:
# Scan the downloaded entry for SSBOND records and show what was found.
pdbin = "%s.pdb" % pdbid
ssbond = ssbonds(pdbin)
print(ssbond)

In [None]:
# Renumber the disulfide partners only when SSBOND records were found;
# otherwise ssbond_amber stays None.
ssbond_amber = None
if ssbond:
    ssbond_amber, _outs = get_ssbond_rno(pdbin, ssbond)

    #clean up
    if not os.path.exists('pdb4amber_outs'):
        os.mkdir('pdb4amber_outs')
    # Move every file reported in _outs into workdir/pdb4amber_outs.
    for _o in _outs:
        srcf = os.path.join(workdir, _o)
        destf = os.path.join(workdir + "/pdb4amber_outs", _o)
        os.rename(srcf, destf)

In [None]:
## Clean PDB structures


In [None]:
def clean_pdb(pdbin,keep_res_list,model_no=1,ssbonds=None,protonation=None):
    """Filter a PDB file down to protein (plus selected hetero) atoms of one model.

    The input file is renamed to ``<stem>_original.pdb``; the cleaned
    structure is then written under the original name and the protein-only
    structure to ``<stem>_protein.pdb``. ``<stem>`` is the file name up to its
    first dot, kept in the same directory as ``pdbin``.

    Records are read by fixed PDB columns (record name 1-6, residue name
    18-20, chain 22, residue number 23-26). Whitespace splitting fuses
    ``HETATM`` with five-digit serials and the chain with four-digit residue
    numbers, which silently dropped those atoms.

    Parameters
    ----------
    pdbin : str
        Path of the PDB file to clean.
    keep_res_list : container of str
        Non-protein residue names to keep in the cleaned structure.
    model_no : int, optional
        MODEL to extract; files without MODEL records count as model 1.
    ssbonds : dict or None, optional
        ``{id: [resno1, chain1, resno2, chain2]}``; matching CYS ``ATOM``
        records are renamed to ``CYX``.
    protonation : dict or None, optional
        ``{"RES-CHAIN-RESNO": [amber_name, pka]}``; matching residues are
        renamed to ``amber_name``.

    Returns
    -------
    None

    Notes
    -----
    Protein residues are recognised through the notebook-level
    ``proteinResidues`` collection, which is defined outside this function.
    """
    prot_clean = []
    apoprot = []
    current_model = 1

    # Read-only access is enough; 'r+' would fail on write-protected files.
    with open(pdbin, 'r') as fd:
        for line in fd:
            record = line[0:6].strip()
            if not record:
                # Blank line: nothing to classify.
                continue

            # Multi-model files: track which model the following records belong to.
            if record == 'MODEL':
                current_model = int(float(line.split()[-1]))
                if current_model == model_no:
                    prot_clean.append(line)
                    apoprot.append(line)
                continue

            # END is copied regardless of the model it follows.
            if record == 'END':
                prot_clean.append(line)
                apoprot.append(line)
                continue

            if current_model != model_no:
                continue

            if record in ('TER', 'ENDMDL'):
                prot_clean.append(line)
                apoprot.append(line)
                continue

            if record not in ('ATOM', 'HETATM'):
                continue

            resname = line[17:20].strip()
            chain = line[21:22]
            resseq = line[22:26].strip()

            is_protein = resname in proteinResidues
            if not (is_protein or resname in keep_res_list):
                continue

            new_name = _amber_residue_name(record, resname, chain, resseq,
                                           ssbonds, protonation)
            if new_name is not None:
                # Blank the altLoc column and overwrite the residue name.
                line = line[0:16] + " " + new_name + line[20:]

            # Protein residues and the molecules listed in keep_res_list.
            prot_clean.append(line)
            # Apoprotein: protein residues only.
            if is_protein:
                apoprot.append(line)

    # Take the stem from the file name only, so dots in the directory part
    # (for example "./6XSP.pdb") do not truncate the path.
    folder, fname = os.path.split(pdbin)
    pdbid = os.path.join(folder, fname.split('.')[0])
    orig_pdb = pdbid + "_original.pdb"
    # NOTE(review): a second run replaces the saved original with the
    # already-cleaned file on platforms where rename overwrites -- confirm
    # this is acceptable.
    os.rename(pdbin, orig_pdb)

    with open(pdbin, 'w') as f:
        f.writelines(prot_clean)

    with open(pdbid + "_protein.pdb", 'w') as f:
        f.writelines(apoprot)

    print("OUTPUTS: ")
    print("  Original PDB     : ",pdbid+"_original.pdb")
    print("  Clean PDB        : ",pdbin)
    print("  Apoprotein PDB   : ",pdbid+"_protein.pdb")


def _amber_residue_name(record, resname, chain, resseq, ssbonds, protonation):
    """Return the replacement residue name for one atom record, or None if it keeps its name."""
    new_name = None

    # Disulfide-bonded cysteines (ATOM records only) become CYX.
    if ssbonds is not None and record == 'ATOM' and resname == 'CYS':
        for values in ssbonds.values():
            # e.g. {1: ['30', 'A', '33', 'A'], 2: ['30', 'B', '33', 'B']}
            if ((resseq == values[0] and chain == values[1]) or
                    (resseq == values[2] and chain == values[3])):
                new_name = 'CYX'

    # A protonation entry is applied last, so it wins over CYX.
    if protonation is not None:
        for key, values in protonation.items():
            # key: ASP-A-71, values: ['ASH', pka]; split at most twice so a
            # negative residue number keeps its sign.
            key_split = key.split('-', 2)
            if (key_split[0] == resname and
                    key_split[1] == chain and
                    key_split[2] == resseq):
                new_name = values[0]

    return new_name

In [None]:
## Run antechamber

def run_atechamber(ligpdb,charge,ligname,ff="gaff2"):
    """Run antechamber on a ligand PDB file and log the run.

    The command line and everything antechamber writes to stdout/stderr go
    to ``antechamber_<ligname>.log`` in the current directory. The child
    process output is redirected at the file-descriptor level; reassigning
    ``sys.stdout`` only captures Python-level prints, so it left the log with
    nothing but the command line.

    Parameters
    ----------
    ligpdb : str
        Ligand PDB file passed to ``-i``.
    charge : int
        Net charge passed to ``-nc``.
    ligname : str
        Residue name passed to ``-rn``; also part of the output and log names.
    ff : str, optional
        Atom type set passed to ``-at`` (default ``"gaff2"``).

    Returns
    -------
    str
        Name of the requested mol2 file, ``<stem>_<ligname>.mol2`` where
        ``<stem>`` is ``ligpdb`` up to its first dot. The name is returned
        whether or not antechamber succeeded.
    """
    fname = ligpdb.split(".")[0]
    molout = fname + "_" + ligname + ".mol2"

    cmd = " ".join([
        "antechamber",
        "-i", ligpdb, "-fi", "pdb",
        "-o", molout, "-fo", "mol2",
        "-c", "bcc", "-nc", str(charge),
        "-rn", ligname,
        "-at", ff,
        "-s", "0", "-pf", "y",
    ])

    # The with-block closes the log even if launching the process fails.
    with open('antechamber_' + ligname + ".log", "w") as log_file:
        log_file.write(cmd + "\n")
        # Flush first so the command line precedes the child's own output.
        log_file.flush()
        status = sp.call(cmd, shell=True, stdout=log_file, stderr=sp.STDOUT)
        if status != 0:
            log_file.write("antechamber exited with status %s\n" % status)

    return molout


In [None]:
# Parameterize the ligand only when a "<stem>.nonprotein.pdb" file is present.
_nonprotein_pdb = pdbin.split('.')[0] + ".nonprotein.pdb"
if os.path.exists(_nonprotein_pdb):
    ligpdb = _nonprotein_pdb
    # NOTE(review): the net charge of -1 is hard-coded here -- confirm it
    # matches the ligand being processed.
    ante_molout = run_atechamber(_nonprotein_pdb, -1, ligand, ff="gaff2")

In [None]:
#clean
# Move the sqm scratch files and the antechamber log into
# workdir/antechamber_outs.
if not os.path.exists('antechamber_outs'):
    os.mkdir('antechamber_outs')
for _fname in ('sqm.in', 'sqm.out', 'sqm.pdb', 'antechamber_' + ligand + '.log'):
    srcf = os.path.join(workdir, _fname)
    destf = os.path.join(workdir + "/antechamber_outs", _fname)
    os.rename(srcf, destf)

In [None]:
_oname=ante_molout.split(".")[0]
print(_oname)
frcmodf=_oname+".frcmod"
cmd="parmchk2 -i "+ante_molout+" -f mol2 -o "+frcmodf+" -s gaff2"
print(cmd)
#p = sp.call(cmd, shell=True)
!{cmd}

In [None]:
def save_lib_tleap(molf,frcmodf,libf,topf,rstf):
    """Write ``tleap.in`` for the ligand and run tleap on it.

    The script loads the mol2 file and its frcmod, checks the unit, and saves
    an OFF library plus topology/coordinate files. tleap output is redirected
    to ``tleap.log`` in the current directory.

    Parameters
    ----------
    molf : str
        Ligand mol2 file.
    frcmodf : str
        Additional-parameter (frcmod) file for the ligand.
    libf : str
        OFF library to write (``saveoff``).
    topf, rstf : str
        Topology and coordinate files to write (``saveamberparm``).

    Returns
    -------
    None
    """
    script_lines = [
        'source leaprc.protein.ff14SB',
        'source leaprc.DNA.bsc1',
        'source leaprc.water.tip3p',
        # The ligand is typed with gaff2 (antechamber -at gaff2,
        # parmchk2 -s gaff2), so the matching leaprc is sourced here.
        'source leaprc.gaff2',
        "mol = loadmol2 " + molf,
        # Load the extra parameters before the check so it sees them.
        "loadamberparams " + frcmodf,
        'check mol',
        "saveoff mol " + libf,
        "saveamberparm mol " + topf + " " + rstf,
        'quit',
    ]

    # Truncate instead of append: a leftover tleap.in from an earlier run
    # must not be executed again in front of this script.
    with open('tleap.in', 'w') as in_file:
        in_file.write("\n".join(script_lines) + "\n")

    p = sp.call('tleap -s -f tleap.in > tleap.log', shell=True)

In [None]:
# Build the ligand library, topology and coordinates next to the mol2 file.
save_lib_tleap(
    molf=ante_molout,
    frcmodf=frcmodf,
    libf=_oname + ".lib",
    topf=_oname + ".top",
    rstf=_oname + ".rst",
)

In [None]:
#clean
# Move the ligand artefacts into workdir/ligand_files. The destinations of
# the mol2, frcmod and lib files are collected in ligfiles, in that order.
if not os.path.exists('ligand_files'):
    os.mkdir('ligand_files')

ligfiles = []
for _src_name, _dest_name, _remember in (
    (ante_molout, ante_molout, True),
    (frcmodf, frcmodf, True),
    (_oname + ".lib", _oname + ".lib", True),
    (_oname + ".top", _oname + ".top", False),
    (_oname + ".rst", _oname + ".rst", False),
    ("tleap.in", "tleap_ligand.in", False),
    ("tleap.log", "tleap_ligand.log", False),
):
    srcf = os.path.join(workdir, _src_name)
    destf = os.path.join(workdir + "/ligand_files", _dest_name)
    os.rename(srcf, destf)
    if _remember:
        ligfiles.append(destf)

In [None]:
## Prepare the protein-ligand complex

def prep_complex(pdbin,ligfrcmod,liglib,ligmolf,solvate=False,getvol=False,addions=None):
    """Build the protein-ligand complex with tleap.

    Writes ``complex_tleap.in``, runs tleap with its output redirected to
    ``complex_tleap.log``, and removes ``leap.log`` if present. Output names
    start with ``pdbin`` up to its first underscore.

    Parameters
    ----------
    pdbin : str
        Protein PDB file loaded with ``loadpdb``.
    ligfrcmod, liglib, ligmolf : str
        Ligand frcmod, OFF library and mol2 files.
    solvate : bool, optional
        Add the ``solvatebox``/``setbox`` commands and use the ``_complex_sol``
        output names instead of ``_complex_gas``.
    getvol : bool, optional
        Only run the check (and solvation, if requested) and report volume
        and charge; nothing is saved.
    addions : str or None, optional
        Raw tleap text written verbatim before the solvation commands, used
        only when ``solvate`` is true and ``getvol`` is false. The caller
        must include the trailing newline.

    Returns
    -------
    tuple
        ``(topology, coordinates, pdb)`` file names when ``getvol`` is false.
        Otherwise ``(volume, charge)``: ``volume`` is a float, or ``None``
        when the log has no ``Volume:`` line; ``charge`` is the string taken
        from the log, or ``"0.000000"`` when the log has no charge line.
    """
    stem = pdbin.split("_")[0]
    if solvate:
        COMPLEXTOP = stem + "_complex_sol.prmtop"
        COMPLEXRST = stem + "_complex_sol.inpcrd"
        COMPLEXPDB = stem + "_complex_sol.pdb"
    else:
        COMPLEXTOP = stem + "_complex_gas.top"
        COMPLEXRST = stem + "_complex_gas.rst"
        COMPLEXPDB = stem + "_complex_gas.pdb"

    script = [
        "source leaprc.protein.ff14SB \n",
        "source leaprc.water.tip3p \n",
        "source leaprc.gaff2 \n",
        "loadamberparams " + ligfrcmod + " \n",
        "loadoff " + liglib + " \n",
        "hetmol=loadmol2 " + ligmolf + " \n",
        "protein = loadpdb " + pdbin + " \n",
        "complex = combine {protein hetmol} \n",
        "check complex \n",
    ]

    if not getvol and solvate and addions is not None:
        script.append(addions)

    if solvate:
        script.append("solvatebox complex TIP3PBOX 12.0 \n")
        script.append("setbox complex vdw \n")

    if not getvol:
        script.append("saveamberparm complex " + COMPLEXTOP + " " + COMPLEXRST + " \n")

    script.append("quit \n")

    # write tleap file ('w' replaces any script left by an earlier run)
    with open("complex_tleap.in", 'w') as in_file:
        in_file.writelines(script)

    # run tleap
    p = sp.call('tleap -s -f complex_tleap.in > complex_tleap.log', shell=True)
    if os.path.exists("leap.log"):
        os.remove("leap.log")

    if not getvol:
        cmd = 'ambpdb -p ' + COMPLEXTOP + " -c " + COMPLEXRST + " > " + COMPLEXPDB
        p = sp.call(cmd, shell=True)
        return COMPLEXTOP, COMPLEXRST, COMPLEXPDB

    # compute volume and charge of system
    # Defaults keep the return defined when the log lacks a line.
    # NOTE(review): tleap presumably prints the charge line only for a
    # non-zero charge, hence the zero default -- confirm.
    volume = None
    charge = "0.000000"
    with open("complex_tleap.log", 'r') as fd:
        for line in fd:
            row = line.split()
            if "Volume:" in line:
                volume = float(row[1])
            if "The unperturbed charge of the unit" in line:
                charge = row[6].strip("'").strip("(").strip(")")

    return volume, charge
