In [1]:
import rdkit
import selfies
from rdkit import Chem
import os
import pandas as pd
from io import StringIO
import re

In [2]:
def parse(f):
    file = open(f,'r')
    content = file.readlines()
    cl_no=0
    saved_linenos=list()
    dic=dict()
    for line in content:
        #print(line)
        cline=line.strip()
        if cline == "## Molecule":
            #save next line that contains the SMILES
            saved_linenos.append(cl_no+1)
            curr_mollineno=cl_no+1
        if cline == "## SMILES":
            #save current line plus 4
            toks_clean=[x.strip() for x in content[cl_no+1].strip().split("|")][2:-1]
            attrel_clean=[x.strip() for x in content[cl_no+3].strip().split("|")][2:-1]
            attabs_clean=[x.strip() for x in content[cl_no+4].strip().split("|")][2:-1]
            dic[content[curr_mollineno].strip()]=[toks_clean,attrel_clean,attabs_clean]
        cl_no+=1
        
    for key, value in dic.items():
        #print(key, value)
        assert(len(value[1])==(len(value[0])))
        #for it in value:
            #print("________",it)
    return dic

In [3]:
def parse_plusselfies(f):
    file = open(f,'r')
    content = file.readlines()
    cl_no=0
    saved_linenos=list()
    smiles_dic=dict()
    selfies_dic=dict()
    for line in content:
        #print(line)
        #print("current line number:",cl_no)
        cline=line.strip()
        if cline == "## Molecule":
            #save next line that contains the SMILES
            saved_linenos.append(cl_no+1)
            curr_mollineno=cl_no+1
        if cline == "## SMILES":
            #save current line plus 4
            toks_clean=[x.strip() for x in content[cl_no+1].strip().split("|")][2:-1]
            attrel_clean=[x.strip() for x in content[cl_no+3].strip().split("|")][2:-1]
            attabs_clean=[x.strip() for x in content[cl_no+4].strip().split("|")][2:-1]
            smiles_dic[content[curr_mollineno].strip()]=[toks_clean,attrel_clean,attabs_clean]
        if cline == "## SELFIES":
            #save current line plus 4
            toks_clean=[x.strip() for x in content[cl_no+1].strip().split("|")][2:-1]
            attrel_clean=[x.strip() for x in content[cl_no+3].strip().split("|")][2:-1]
            attabs_clean=[x.strip() for x in content[cl_no+4].strip().split("|")][2:-1]
            selfies_dic[content[curr_mollineno].strip()]=[toks_clean,attrel_clean,attabs_clean]
        cl_no+=1
    #print("saved linenos",saved_linenos)
    for key, value in smiles_dic.items():
        #print(key, value)
        assert(len(value[1])==(len(value[0])))
        #for it in value:
        #    print("________",it)
            
    for key, value in selfies_dic.items():
        #print(key, value)
        assert(len(value[1])==(len(value[0])))
        #for it in value:
         #   print("________",it)
           
    return smiles_dic, selfies_dic

In [4]:
def create_identities(smiles):
    selfies_string = selfies.encoder(smiles)
    remaining_smiles_string, attributions= selfies.decoder(selfies_string, attribute=True)
    result_smiles = ""
    for att in attributions:
        next_token = att.token
        if not any([char.isalpha() for char in next_token]):
            continue
        while remaining_smiles_string:
            if remaining_smiles_string.startswith(next_token):
                remaining_smiles_string = remaining_smiles_string[len(next_token):]
                if len(next_token)==1:
                    idied_token = f"[{next_token}:{att.attribution[-1].index}]"
                else:
                    idied_token = f"[{next_token[1:-1]}:{att.attribution[-1].index}]"
                result_smiles+=idied_token
                break
            else:
                result_smiles+=remaining_smiles_string[0]
                remaining_smiles_string = remaining_smiles_string[1:]
    return result_smiles, selfies_string  

In [5]:
def create_mapping_betweenSMISEL(mapped_smiles,selfies,selfies_val):
    smi_toks = re.findall('\[(.*?)\]', mapped_smiles)
    print("smitoks: ",smi_toks)
    sel_toks = re.findall('\[(.*?)\]', selfies)
    #iterate through smi_toks, extract numbers
    smi_nums = [int((re.search(r':(\d+)', smi_tok)).group(1)) for smi_tok in smi_toks]
    print(smi_nums)
    #select SELFIES tokens accordingly to map output and get rid off branch and overloaded tokens
    selfies_sel = [sel_toks[num] for num in smi_nums]
    atts_rel = selfies_val[1]
    atts_abs = selfies_val[2]
    atts_rel_clean = [atts_rel[num] for num in smi_nums]
    atts_abs_clean = [atts_abs[num] for num in smi_nums]
    assert(len(atts_rel_clean)==(len(atts_abs_clean)))
    assert(len(selfies_sel)==(len(smi_toks)))
    print(selfies_sel)
    print(smi_toks)
    return selfies_sel, atts_rel_clean, atts_abs_clean

In [6]:
def exec_antechamber(inputfile,ftype):
    inputfile_noex=os.path.splitext(inputfile)[0]
    print("inputfile no extension",inputfile_noex)
    if ftype=="pdb":
        os.system(f"antechamber -i {inputfile} -fi pdb -o {inputfile_noex}_ass.mol2 -fo mol2 -at amber")
    elif ftype=="mol2":
        os.system(f"antechamber -i {inputfile} -fi mol2 -o {inputfile_noex}_ass.mol2 -fo mol2 -at amber")
    else:
        print("Execution of antechamber failed. Wrong filetype given. Filetype needs to be pdb or mol2.")
        return None
    return f"{inputfile_noex}_ass.mol2"

In [7]:
def smilestofile(smiles,no,ftype):
    #obabel [-i<input-type>] <infilename> [-o<output-type>] -O<outfilename> [Options]
    if ftype=="pdb":
        os.system(f'obabel -:"{smiles}" -o pdb -O ./mols/mol_{no}.pdb')
        return f"./mols/mol_{no}.pdb"
    elif ftype=="mol2":
        os.system(f'obabel -:"{smiles}" -o mol2 -O ./mols/mol_{no}.mol2 --gen3d')
        return f"./mols/mol_{no}.mol2"
    else:
        print("Execution of obabel failed, no file could be created. Wrong filetype given. Output filetype needs to be pdb or mol2.")
        return None

In [8]:
def getatom_ass(mol2):
    #extract lines between @<TRIPOS>ATOM and @<TRIPOS>BOND to get atom asss
    with open(mol2) as infile:
        lines = infile.read().splitlines()
    start = [i for i, line in enumerate(lines) if line.startswith("@<TRIPOS>ATOM")][0]
    end = [i for i, line in enumerate(lines) if line.startswith("@<TRIPOS>BOND")][0]
    extract="\n".join(lines[start+1:end])
    print("\n_______________________extraction \n", extract)
    pddf = pd.read_csv(StringIO(extract), header=None, delimiter=r"\s+")
    #extract 5th column with atom_asss
    atoms_ass_list = pddf.iloc[:,5].tolist()
    atoms_ass_set = set(atoms_ass_list)
    #print("atoms assigned", atoms_ass_list)
    #print()
    #print("assignments set", atoms_ass_set)
    return atoms_ass_list, atoms_ass_set

In [9]:
def clean_SMILES(SMILES_tok, attention_rel, attention_abs):
    SMILES_tok_prep=list()
    attention_relprep=list()
    attention_absprep=list()
    struc_toks=r"()=-:~1234567890#"
    for i in range(len(SMILES_tok)):
        #when it's an H in the SMILES, ignore, cannot deal
        #print(SMILES_tok[i])
        if SMILES_tok[i]!="H" and not SMILES_tok[i].isdigit():
            if any(elem in struc_toks for elem in SMILES_tok[i])==False:
                SMILES_tok_prep.append(SMILES_tok[i])
                attention_relprep.append(attention_rel[i])
                attention_absprep.append(attention_abs[i])
                #print("not H and not number: ", SMILES_tok[i])
            #if any(elem in struc_toks for elem in SMILES_tok[i])==True:
             #   print("H, structure or number: ", SMILES_tok[i])
    print(SMILES_tok_prep)
    return SMILES_tok_prep,attention_relprep,attention_absprep

In [10]:
def clean_acout(ac_out):
    ac_out_noH=list()
    for j in ac_out:
        #save only when it's mot H
        if not j.startswith('H'):
            ac_out_noH.append(j)
            #print(f"-----------------this is not H, this is: {j}")
    #print("before: ", ac_out)
    #print("after: ", ac_out_noH)
    return ac_out_noH

In [11]:
def check_acVStoks(SMILES_tok_prep,ac_out_noH):
    for k in range(len(ac_out_noH)):
        #compare first letter in 
        #print(ac_out_noH[k][0])
        compSMI=SMILES_tok_prep[k][0]
        if SMILES_tok_prep[k][0]=="[":
            compSMI=SMILES_tok_prep[k][1]
        if (ac_out_noH[k][0].upper()!=compSMI.upper()):
            print(f"ac_out and SMILES not the same: {ac_out_noH[k][0]} vs {compSMI}")
        #else:
        #    print(f"letters same: {ac_out_noH[k][0]} vs {compSMI}")

In [12]:
def link_atomstoatoms(ac_out, SMILES_tok, attention_rel, attention_abs):
    dic=dict()
    SMILES_tok_prep=list()
    attention_relprep=list()
    attention_absprep=list()
    print("len smiles tok",len(SMILES_tok))
    print("smiles tok",SMILES_tok)
    print("len_attention rel",len(attention_rel))
    print("len attention abs", len(attention_abs))
    
    assert(len(SMILES_tok)==(len(attention_rel)))
    assert(len(SMILES_tok)==(len(attention_abs)))
    SMILES_tok_prep, attention_relprep, attention_absprep = clean_SMILES(SMILES_tok,attention_rel,attention_abs)
    ac_out_noH = clean_acout(ac_out)
    
    #compare atoms in SMILES_tok with ac_out_noH
   # for h in ac_out_noH:
    if (len(ac_out_noH)!=(len(SMILES_tok_prep))):
        print("!!!!!!!!!!!!!!!!!!ac_out_noH and SMILES_tok_prep do NOT have same lengths: {} vs {}".format(len(ac_out_noH),len(SMILES_tok_prep)))
        print("!!!!!!!!!!!!!!printing all items in ac_out_noH:")
        for g in ac_out_noH:
            print(f"\t {g}")
        print("!!!!!!!!!!!!!!printing all items in SMILES_tok_prep:")
        for h in SMILES_tok_prep:
            print(f"\t {h}")
    #check whether assignment and SMILES tok have same length
    assert(len(ac_out_noH)==(len(SMILES_tok_prep)))
    #cannot check assignment names against toks since not same beginning letter anymore
    #check_acVStoks(SMILES_tok_prep,ac_out_noH)
    
    return SMILES_tok_prep, ac_out_noH, attention_relprep, attention_absprep
    

In [13]:
def write_outputfile(results,outname):
    f=open(f"{outname}.txt",'w') 
    for key, value in results.items():
        f.write(f"{key}\n")
        f.write("{:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30}\n".format("SMILES_token","antechamber_assignment","attention_relative","attention_absolute","SELFIES_token","SELFIES_attention_relative","SELFIES_attention_absolute")) 
        #f.write("{:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30}\n".format(results[key][0][i],value[1][i],value[2][i],value[3][i],value[4][i],value[5][i],value[6][i]))
        for i in range(len(value[0])):
            #f.write(f"{results[key][0][i]} \t\t\t {value[1][i]} \t\t\t {value[2][i]} \t\t\t {value[3][i]}\n")
            f.write("{:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30} \t {:<30}\n".format(results[key][0][i],value[1][i],value[2][i],value[3][i],value[4][i],value[5][i],value[6][i]))
        f.write("\n")
    
    #f = open(outname.txt,"w")
    #f=open(f"{outname}.txt",'w')  
    ##for key, value in results.items():
      #  f.write(f"{key}\n")
      #  f.write("{SMILES_tok_prep[i]} \t {ac_out_noH[i]} \t {attention_relprep[i]} \t {attention_absprep[i]}\n")   
      #  for i in range(len(value[0])):
      #      f.write(f"{results[key][0][i]}\t \t{value[1][i]}\t \t{value[2][i]}\t \t{value[3][i]}\n")
      #  f.write("\n")
    f.close()

## MAIN

In [14]:
#smi = "cc1cc(=O)[nH]c(=S)[nH]1"
#smi_toks = ['C','c','1','c','c','(','=','O',')','[nH]','c','(','=','S',')','[nH]','1']
#smi_atts = ['-0.012','-0.005','0.016','-0.004','-0.009','-0.007','-0.009','-0.003','-0.003','0.001','-0.011','-0.01','-0.004','0.004','0.039','0.024','-0.008']
#smi = "C1=CC=CN=C1"
#smi_toks= ['C','1','=','C','C','=','C','N','=','C','1']
#smi_atts = [1,2,3,4,5,6,7,8,9,10,11]
results=dict()
#dic=parse('./markdown_bugfix.md')
smiles_dic,selfies_dic = parse_plusselfies('./markdown_bugfix.md')
no=0
failed_smiToFile=0
failed_fileToac=0
pdb_ass_success=0
for (key, value), (key2,value2) in zip(smiles_dic.items(),selfies_dic.items()):
    no+=1
    mapped_smiles, selfies_string  = create_identities(key)
    print(mapped_smiles, selfies_string)
    selfies_sel, sel_atts_rel_clean, sel_atts_abs_clean = create_mapping_betweenSMISEL(mapped_smiles,selfies_string,value2)
    #print(aliste)
    smi=key
    smi_toks=value[0]
    smi_relatts=value[1]
    smi_absatts=value[2]
    print()
    print()
    print(f"{smi}--------------------------------------------------------")
    #mol2_file=smilestomol2(smi,no)
    print("smiles to pdb")
    pdb_file=smilestofile(smi,no, "pdb")
    if os.path.isfile(pdb_file)==False:
        failed_smiToFile+=1
        print("!!!!!!!!!!!!!!!!!!!!!!FAILURE to generate PDB file!!!!!!!!!!!!!!!!!!!")
        break
    #os.system(f"cat {mol2_file}")
    #mol2_ass = exec_antechamber(mol2_file)
    print(".............executing antechamber")
    mol2_ass = exec_antechamber(pdb_file,"pdb")
        
    if os.path.isfile(mol2_ass)==True:
        pdb_ass_success+=1
 #   os.system(f"cat {mol2_ass}")
        atoms_ass_list, atoms_ass_set = getatom_ass(mol2_ass)
        SMILES_tok_prep,ac_out_noH,attention_relprep,attention_absprep=link_atomstoatoms(atoms_ass_list, smi_toks, smi_relatts, smi_absatts)
        assert(len(ac_out_noH)==(len(selfies_sel)))
        results[smi]=[SMILES_tok_prep,ac_out_noH,attention_relprep,attention_absprep,selfies_sel,sel_atts_rel_clean,sel_atts_abs_clean]
    else:
        print("!!!!!!!!!!!!!!!!Atom assignment failed!!!!!!!!!!!!!!!!!")
        failed_fileToac+=1
    

#print("{SMILES_tok_prep[i]} \t {ac_out_noH[i]} \t {attention_relprep[i]} \t {attention_absprep[i]}")   
     #   for i in range(len(SMILES_tok_prep)):
      #      print(f"{SMILES_tok_prep[i]} \t {ac_out_noH[i]} \t {attention_relprep[i]} \t {attention_absprep[i]}")
        

[C:0]1=[C:1][C:2]2=[C:3][C:4]=[C:5]3[C:6]=[C:7][C:8]=[C:9]4[C:10]=[C:11][C:12](=[C:15]1)[C:18]2=[C:21] [C][=C][C][=C][C][=C][C][=C][C][=C][C][=C][C][=Branch1][Ring2][=C][Ring1][=C][C][Ring1][=N][=C][Ring1][O][Ring1][#Branch1]
smitoks:  ['C:0', 'C:1', 'C:2', 'C:3', 'C:4', 'C:5', 'C:6', 'C:7', 'C:8', 'C:9', 'C:10', 'C:11', 'C:12', 'C:15', 'C:18', 'C:21']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 18, 21]
['C', '=C', 'C', '=C', 'C', '=C', 'C', '=C', 'C', '=C', 'C', '=C', 'C', '=C', 'C', '=C']
['C:0', 'C:1', 'C:2', 'C:3', 'C:4', 'C:5', 'C:6', 'C:7', 'C:8', 'C:9', 'C:10', 'C:11', 'C:12', 'C:15', 'C:18', 'C:21']


c1cc2ccc3cccc4ccc(c1)c2c34--------------------------------------------------------
smiles to pdb
.............executing antechamber
inputfile no extension ./mols/mol_1

Welcome to antechamber 22.0: molecular input file processor.

Info: acdoctor mode is on: check and diagnose problems in the input file.
Info: The atom type is set to amber; the options available to the -at flag 

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


.............executing antechamber
inputfile no extension ./mols/mol_5

Welcome to antechamber 22.0: molecular input file processor.

Info: acdoctor mode is on: check and diagnose problems in the input file.
Info: The atom type is set to amber; the options available to the -at flag are
      gaff, gaff2, amber, bcc, and sybyl.

-- Check Format for pdb File --
   Status: pass
-- Check Unusual Elements --
   Status: pass
-- Check Open Valences --
         It is quite possible that there are unfilled valences.
-- Check Geometry --
      for those bonded   
      for those not bonded   
-- Check Weird Bonds --
   Status: pass
-- Check Number of Units --
   Status: pass
acdoctor mode has completed checking the input file.

(1) double check the structure (the connectivity) and/or 
(2) adjust atom valence penalty parameters in APS.DAT, and/or 
(3) increase PSCUTOFF in define.h and recompile bondtype.c
    (be cautious, using a large value of PSCUTOFF (>100) will 
    significantly increase th

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
/home/inken/anaconda3/envs/AmberTools/bin/wrapped_progs/antechamber: Fatal Error!
This molecule may have more than one unit.
       antechamber can only handle one unit.  If the input is a single unit
       then the connectivity is wrong and the geometry may be bad.
       Please convert your molecule to a mol2 file via:
       antechamber -j 5 -at sybyl -dr no 
       And then check your molecule with a visualization program;
       manually add missing bonds or delete unwanted bonds as appropriate.
1 molecule converted
1 molecule converted
1 molecule converted


(1) double check the structure (the connectivity) and/or 
(2) adjust atom valence penalty parameters in APS.DAT, and/or 
(3) increase PSCUTOFF in define.h and recompile bondtype.c
    (be cautious, using a large value of PSCUTOFF (>100) will 
    significantly increase the computation time).


_______________________extraction 
       1 C            0.0000     0.0000     0.0000 DU         1 UNL       0.000000
      2 C1           0.0000     0.0000     0.0000 CT         1 UNL       0.000000
      3 C2           0.0000     0.0000     0.0000 CM         1 UNL       0.000000
      4 C3           0.0000     0.0000     0.0000 CT         1 UNL       0.000000
      5 N            0.0000     0.0000     0.0000 NT         1 UNL       0.000000
      6 C4           0.0000     0.0000     0.0000 CT         1 UNL       0.000000
      7 C5           0.0000     0.0000     0.0000 CT         1 UNL       0.000000
      8 O            0.0000     0.0000     0.0000 OS         1 UNL       0.000000
      9 N1   

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted


In [15]:
print(f"failed translation SMILES to pdb file: {failed_smiToFile} out of {no} times")
print(f"successful pdb assignment with antechamber: {pdb_ass_success} out of {no} times")
#print(f"Failed to translate {failed_smiToFile} SMILES to file")
#print(f"Failed to translate {failed_mol2Toac} mol2 to antechamber atom assignment")

failed translation SMILES to pdb file: 0 out of 20 times
successful pdb assignment with antechamber: 19 out of 20 times


In [16]:
for key, value in results.items():
    print(key)
    print("{SMILES_token} \t {antechamber_assignment} \t {SMILES_attention_relative} \t {SMILES_attention_absolute} \t {SELFIES_token}  \t {SELFIES_attention_relative} \t {SELFIES_attention_absolute}")   
    for i in range(len(value[0])):
        print("{:<30} {:<30} {:<30} {:<30} {:<30} {:<30} {:<30}".format(results[key][0][i],value[1][i],value[2][i],value[3][i],value[4][i],value[5][i],value[6][i]))
        #print(f"{results[key][0][i]} \t \t \t {value[1][i]} \t \t \t {value[2][i]} \t \t \t \t {value[3][i]} \t\t \t \t {value[4][i]} \t \t \t \t {value[5][i]} \t \t \t \t {value[6][i]}")
    print()

c1cc2ccc3cccc4ccc(c1)c2c34
{SMILES_token} 	 {antechamber_assignment} 	 {SMILES_attention_relative} 	 {SMILES_attention_absolute} 	 {SELFIES_token}  	 {SELFIES_attention_relative} 	 {SELFIES_attention_absolute}
c                              CD                             -0.006                         0.032                          C                              -0.017                         0.021                         
c                              CM                             -0.002                         0.036                          =C                             0.003                          0.041                         
c                              CT                             -0.011                         0.027                          C                              -0.015                         0.024                         
c                              CM                             -0.005                         0.033                          =C             

In [17]:
write_outputfile(results,"./mols/results_atomassignment_withselfies_ac_new")

In [47]:
mol12="CN1C(C(=O)Nc2ccccn2)=C(O)c2ccccc2S1(=O)=O"
smilestomol2(mol12,12)


NameError: name 'smilestomol2' is not defined

In [19]:
for key, value in results.items():
    print(key)
    print("{SMILES_token} \t {antechamber_assignment} \t {attention_relative} \t {attention_absolute}")   
    for i in range(len(value[0])):
        print(f"{results[key][0][i]} \t \t \t {value[1][i]} \t \t \t {value[2][i]} \t \t \t \t {value[3][i]} \t")
    print()
    

c1cc2ccc3cccc4ccc(c1)c2c34
{SMILES_token} 	 {antechamber_assignment} 	 {attention_relative} 	 {attention_absolute}
c 	 	 	 CD 	 	 	 -0.006 	 	 	 	 0.032 	
c 	 	 	 CM 	 	 	 -0.002 	 	 	 	 0.036 	
c 	 	 	 CT 	 	 	 -0.011 	 	 	 	 0.027 	
c 	 	 	 CM 	 	 	 -0.005 	 	 	 	 0.033 	
c 	 	 	 CM 	 	 	 -0.014 	 	 	 	 0.025 	
c 	 	 	 CT 	 	 	 -0.008 	 	 	 	 0.031 	
c 	 	 	 CM 	 	 	 -0.012 	 	 	 	 0.027 	
c 	 	 	 CD 	 	 	 -0.011 	 	 	 	 0.027 	
c 	 	 	 CM 	 	 	 -0.008 	 	 	 	 0.03 	
c 	 	 	 CT 	 	 	 -0.015 	 	 	 	 0.023 	
c 	 	 	 CM 	 	 	 -0.012 	 	 	 	 0.026 	
c 	 	 	 CM 	 	 	 -0.012 	 	 	 	 0.027 	
c 	 	 	 CT 	 	 	 -0.013 	 	 	 	 0.025 	
c 	 	 	 CM 	 	 	 -0.014 	 	 	 	 0.024 	
c 	 	 	 CT 	 	 	 -0.016 	 	 	 	 0.023 	
c 	 	 	 CT 	 	 	 -0.012 	 	 	 	 0.027 	

Cc1cc(=O)[nH]c(=S)[nH]1
{SMILES_token} 	 {antechamber_assignment} 	 {attention_relative} 	 {attention_absolute}
C 	 	 	 DU 	 	 	 -0.034 	 	 	 	 0.025 	
c 	 	 	 CT 	 	 	 -0.015 	 	 	 	 0.044 	
c 	 	 	 CM 	 	 	 -0.032 	 	 	 	 0.027 	
c 	 	 	 CT 	 

In [113]:
import numpy as np
def get_mean_median_stddev(liste):
    liste_np=np.array(liste)
    mean=round(np.mean(liste_np),3)
    median=round(np.median(liste_np),3)
    stddev=round(np.std(liste_np),3)
    return {"mean" : mean, "median" : median, "stddev" : stddev}

In [144]:
def aggregate_assignments(results):
    #create a set from value[1]
    generaldict=dict()
    for key,value in results.items():
        print(key,value)
        assignSet = set(value[1])
       # print(assignSet)
        dikt=dict()
        dikt_relatts=dict()
        dikt_absatts=dict()
        #loop through set
        assignstoatoms=dict()
        for ass in assignSet:
            dikt[ass]=0
            dikt_relatts[ass]=dict()
            dikt_absatts[ass]=dict()
            assigns=0
            atoms_sameass=list()
            atts_rel_sameass=list()
            atts_abs_sameass=list()
            for count,val in enumerate(value[1]):
                if val == ass:
                    atoms_sameass.append(value[0][count])
                    atts_rel_sameass.append(float(value[2][count]))
                    atts_abs_sameass.append(float(value[3][count]))
                    dikt[ass]+=float(value[2][count])
                    assigns+=1
            dikt[ass]=dikt[ass]/assigns
            print(ass,atts_rel_sameass,atts_abs_sameass)
            dikt_relatts[ass]=get_mean_median_stddev(atts_rel_sameass)
            dikt_absatts[ass]=get_mean_median_stddev(atts_abs_sameass)
            assignstoatoms[ass]=atoms_sameass
            #print("atomsameass",atoms_sameass)
            #print("diktassings,assigns",dikt[ass],assigns)
            assigns=0
        generaldict[key]=(dikt_relatts,dikt_absatts,assignstoatoms)
        print(generaldict[key])
    return generaldict
                    

In [145]:
def save_aggregated_assignments(agg, outname):
    f=open(f"{outname}.txt",'w') 
    for key, value in agg.items():
        f.write(f"{key}\n")
        for va in value:
            f.write(f"{va}\n")
        f.write("\n")
    f.close()

In [146]:
#aggregate_assignments(results)
testmo={'CC1CO1' : [['C', 'C', 'C', 'O', 'C'], ['DU', 'CM', 'CZ', 'OS', 'DU'], ['-0.035', '-0.024', '-0.019', '-0.012', '0.035'], ['0.131', '0.143', '0.147', '0.155','0.131']], 'CC4CO4' : [['C', 'C', 'C', 'O', 'C'], ['DU', 'CM', 'CZ', 'OS', 'DU'], ['-0.035', '-0.024', '-0.019', '-0.012', '0.035'], ['0.131', '0.143', '0.147', '0.155', '0.131']]}
gendi=aggregate_assignments(testmo)
print(gendi)

CC1CO1 [['C', 'C', 'C', 'O', 'C'], ['DU', 'CM', 'CZ', 'OS', 'DU'], ['-0.035', '-0.024', '-0.019', '-0.012', '0.035'], ['0.131', '0.143', '0.147', '0.155', '0.131']]
DU [-0.035, 0.035] [0.131, 0.131]
CM [-0.024] [0.143]
CZ [-0.019] [0.147]
OS [-0.012] [0.155]
({'DU': {'mean': 0.0, 'median': 0.0, 'stddev': 0.035}, 'CM': {'mean': -0.024, 'median': -0.024, 'stddev': 0.0}, 'CZ': {'mean': -0.019, 'median': -0.019, 'stddev': 0.0}, 'OS': {'mean': -0.012, 'median': -0.012, 'stddev': 0.0}}, {'DU': {'mean': 0.131, 'median': 0.131, 'stddev': 0.0}, 'CM': {'mean': 0.143, 'median': 0.143, 'stddev': 0.0}, 'CZ': {'mean': 0.147, 'median': 0.147, 'stddev': 0.0}, 'OS': {'mean': 0.155, 'median': 0.155, 'stddev': 0.0}}, {'DU': ['C', 'C'], 'CM': ['C'], 'CZ': ['C'], 'OS': ['O']})
CC4CO4 [['C', 'C', 'C', 'O', 'C'], ['DU', 'CM', 'CZ', 'OS', 'DU'], ['-0.035', '-0.024', '-0.019', '-0.012', '0.035'], ['0.131', '0.143', '0.147', '0.155', '0.131']]
DU [-0.035, 0.035] [0.131, 0.131]
CM [-0.024] [0.143]
CZ [-0.019] [0

In [147]:
agg=aggregate_assignments(results)

c1cc2ccc3cccc4ccc(c1)c2c34 [['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], ['CD', 'CM', 'CT', 'CM', 'CM', 'CT', 'CM', 'CD', 'CM', 'CT', 'CM', 'CM', 'CT', 'CM', 'CT', 'CT'], ['-0.006', '-0.002', '-0.011', '-0.005', '-0.014', '-0.008', '-0.012', '-0.011', '-0.008', '-0.015', '-0.012', '-0.012', '-0.013', '-0.014', '-0.016', '-0.012'], ['0.032', '0.036', '0.027', '0.033', '0.025', '0.031', '0.027', '0.027', '0.03', '0.023', '0.026', '0.027', '0.025', '0.024', '0.023', '0.027']]
CT [-0.011, -0.008, -0.015, -0.013, -0.016, -0.012] [0.027, 0.031, 0.023, 0.025, 0.023, 0.027]
CM [-0.002, -0.005, -0.014, -0.012, -0.008, -0.012, -0.012, -0.014] [0.036, 0.033, 0.025, 0.027, 0.03, 0.026, 0.027, 0.024]
CD [-0.006, -0.011] [0.032, 0.027]
({'CT': {'mean': -0.012, 'median': -0.012, 'stddev': 0.003}, 'CM': {'mean': -0.01, 'median': -0.012, 'stddev': 0.004}, 'CD': {'mean': -0.008, 'median': -0.008, 'stddev': 0.002}}, {'CT': {'mean': 0.026, 'median': 0.026, 'stddev': 0.

In [127]:
for key, value in agg.items():
    print(key)
    for va in value:
        print(va)
    print()

c1cc2ccc3cccc4ccc(c1)c2c34
{'CT': {'mean': -0.012, 'median': -0.012, 'stddev': 0.003}, 'CM': {'mean': -0.01, 'median': -0.012, 'stddev': 0.004}, 'CD': {'mean': -0.008, 'median': -0.008, 'stddev': 0.002}}
{'CT': {'mean': 0.026, 'median': 0.026, 'stddev': 0.003}, 'CM': {'mean': 0.028, 'median': 0.027, 'stddev': 0.004}, 'CD': {'mean': 0.03, 'median': 0.03, 'stddev': 0.003}}
{'CT': ['c', 'c', 'c', 'c', 'c', 'c'], 'CM': ['c', 'c', 'c', 'c', 'c', 'c', 'c', 'c'], 'CD': ['c', 'c']}

Cc1cc(=O)[nH]c(=S)[nH]1
{'CM': {'mean': -0.032, 'median': -0.032, 'stddev': 0.0}, 'DU': {'mean': 0.01, 'median': 0.028, 'stddev': 0.031}, 'CT': {'mean': -0.022, 'median': -0.018, 'stddev': 0.008}, 'S': {'mean': 0.092, 'median': 0.092, 'stddev': 0.0}, 'OS': {'mean': -0.001, 'median': -0.001, 'stddev': 0.0}}
{'CM': {'mean': 0.027, 'median': 0.027, 'stddev': 0.0}, 'DU': {'mean': 0.069, 'median': 0.087, 'stddev': 0.031}, 'CT': {'mean': 0.037, 'median': 0.041, 'stddev': 0.008}, 'S': {'mean': 0.151, 'median': 0.151, 'std

In [128]:
save_aggregated_assignments(agg,"./mols/aggregated_assignments_attention")