In [3]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from ase.io import read
from dscribe.descriptors import SOAP
import pubchempy as pcp

In [3]:
# Input/output locations, relative to the notebook's working directory.
# Forward slashes are accepted by Python's file APIs on Windows as well as on
# POSIX systems, whereas backslash separators only resolve on Windows.
path_excel_file = '../Bancos de Dados/qm9.xlsx'
# Keep the trailing separator: creating_mol_files builds each file name by
# appending "<smiles>.mol" directly to this string.
path_mol_files = '../Arquivos Mol/QM9/'

In [81]:
# Load the QM9 spreadsheet into a DataFrame; path_excel_file must already be defined.
df = pd.read_excel(path_excel_file)

In [6]:
def filtering_database(database, colunas):
    """Return the rows of ``database`` whose molecule contains neither N nor F.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.
    colunas : list of str
        Columns of the returned frame. They are passed to
        ``DataFrame.reindex``, so a name absent from ``database`` yields a
        column filled with NaN.

    Returns
    -------
    pandas.DataFrame
        New frame restricted to ``colunas`` and to the kept rows, which retain
        their original index labels. ``database`` itself is not modified. An
        empty ``database`` gives an empty frame.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    coluna = "smiles"
    padroes = (Chem.MolFromSmiles("N"), Chem.MolFromSmiles("F"))
    linhas = len(database[coluna])
    posicoes_mantidas = []

    # Rows are visited by position, not by label, so the function also works
    # when the index is not 0..n-1 (for example after an earlier filtering step).
    for posicao, smile in enumerate(database[coluna]):
        print(f'Progresso: {round(100 * (posicao + 1) / linhas, 3)}%', end='\r')
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES {smile!r} at row position {posicao}"
            )
        if not any(mol.HasSubstructMatch(padrao) for padrao in padroes):
            posicoes_mantidas.append(posicao)

    # A single positional selection replaces one DataFrame.drop (which copies
    # the whole frame) per rejected row.
    return database.reindex(columns=colunas).iloc[posicoes_mantidas]


def getting_canonical_smiles(database):
    """Return a copy of ``database`` with its SMILES rewritten in RDKit canonical form.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.

    Returns
    -------
    pandas.DataFrame
        Copy of ``database`` in which every ``"smiles"`` value has been
        replaced by ``Chem.MolToSmiles(Chem.MolFromSmiles(value))`` and the
        column has been renamed to ``"canonical smiles"``. Row order and index
        labels are unchanged; ``database`` itself is not modified.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    coluna = "smiles"
    nova_coluna = "canonical smiles"
    linhas = len(database[coluna])
    canonicos = []

    # Iterate over the values themselves instead of .loc[i] with i in
    # range(n): label lookups fail as soon as the index is not 0..n-1, which
    # is the case for a frame whose rows were dropped without reset_index.
    for posicao, smile in enumerate(database[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            raise ValueError(
                f"Could not parse SMILES {smile!r} at row position {posicao - 1}"
            )
        canonicos.append(Chem.MolToSmiles(mol))

    database_copy = database.copy()
    # List assignment is positional, so it is independent of the index labels.
    database_copy[coluna] = canonicos
    return database_copy.rename(columns={coluna: nova_coluna})


def chempy_existence(database):
    """Return the rows of ``database`` whose SMILES resolves to a PubChem CID.

    Each value of the ``"smiles"`` column is looked up with
    ``pcp.get_compounds(smile, "smiles")`` (one network request per row).

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows for which the first returned compound has a
        non-``None`` ``cid``. Kept rows retain their original index labels;
        ``database`` itself is not modified. An empty ``database`` gives an
        empty frame.
    """
    coluna = "smiles"
    linhas = len(database[coluna])
    posicoes_mantidas = []

    # Rows are visited by position so a non-0..n-1 index does not matter.
    for posicao, smile in enumerate(database[coluna]):
        print(f'Progresso: {round(100 * (posicao + 1) / linhas, 3)}%', end='\r')
        compostos = pcp.get_compounds(smile, "smiles")
        # Both an empty result list and a first record without a CID are
        # treated as "not found"; indexing [0] unconditionally would raise
        # IndexError on an empty list.
        if compostos and compostos[0].cid is not None:
            posicoes_mantidas.append(posicao)

    # One positional selection instead of one DataFrame.drop per rejected row.
    return database.iloc[posicoes_mantidas].copy()


def creating_mol_files(database):
    """Write one ``.mol`` file with embedded coordinates per SMILES in ``database``.

    For every value of the ``"smiles"`` column the molecule is parsed,
    explicit hydrogens are added, a conformer is generated with
    ``AllChem.EmbedMolecule`` and the result is written to
    ``f"{path_mol_files}{smile}.mol"``. ``path_mol_files`` is a notebook-level
    variable and must end with a path separator.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.

    Returns
    -------
    list of str
        SMILES strings for which no file was written, either because RDKit
        could not parse them or because conformer embedding failed. The list
        is empty when every molecule was written.
    """
    coluna = "smiles"
    linhas = len(database[coluna])
    falhas = []

    for posicao, smile in enumerate(database[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            falhas.append(smile)
            continue
        mol = Chem.AddHs(mol)
        # EmbedMolecule returns the id of the new conformer, or -1 when the
        # embedding fails. Writing the molecule anyway would produce a file
        # without usable coordinates, so the entry is reported instead.
        if AllChem.EmbedMolecule(mol) < 0:
            falhas.append(smile)
            continue
        Chem.MolToMolFile(mol, f"{path_mol_files}{smile}.mol")

    return falhas

In [None]:
# Keep only the identifier, SMILES and h298 columns, drop the molecules that
# filtering_database rejects (those containing N or F), and cache the result
# on disk so later cells can reload it without filtering again.
colunas = ["mol_id", "smiles","h298"]
database_filtrado = filtering_database(df,colunas)
database_filtrado.to_pickle("qm9_filtrada.pickle")

In [4]:
# Reload the filtered table from the pickle cache.
# NOTE(review): the frame displayed later in this notebook has many more
# columns than ["mol_id", "smiles", "h298"], so the pickle on disk was
# presumably written by a different run of the filtering cell — confirm.
df_filtrada = pd.read_pickle("qm9_filtrada.pickle")


In [None]:
# NOTE(review): `df_filtrada_smile_canonico` is never assigned in the visible
# cells, so this raises NameError on a fresh kernel. Presumably a leftover of
# an earlier version of this cell — confirm, then define or remove it.
df_filtrada_smile_canonico

Progresso: 100.0%%

In [None]:
# Shared SOAP descriptor, read as a global by creating_soaps.
# The species list holds H, C and O only; filtering_database removes molecules
# containing N or F (assumes the dataset has no other elements — TODO confirm).
# n_max=1 and l_max=0 are presumably the smallest radial/angular basis DScribe
# accepts — verify against the DScribe SOAP documentation.
soap = SOAP(
    species=['H', 'C', 'O'],
    rbf="gto", 
    r_cut= 6, 
    n_max= 1, 
    l_max= 0, 
    sparse=False
)


def creating_soaps(database, path, method):
    """Build one summed SOAP vector per molecule listed in ``database``.

    For every value of the ``"smiles"`` column, the file ``path + "<smiles>.mol"``
    is read with ``ase.io.read``, passed to the notebook-level ``soap``
    descriptor, and the rows of the resulting array are added together with
    the built-in ``sum`` (presumably one row per atom — confirm against the
    DScribe output shape).

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column; each value names a mol file.
    path : str
        Prefix prepended to ``"<smiles>.mol"``; it is concatenated as-is, so
        it must end with a path separator.
    method : any
        Accepted but not used by this function.

    Returns
    -------
    list
        One summed descriptor per row of ``database``, in row order.
    """
    coluna = "smiles"
    passo = 100/len(database[coluna])
    progresso = 0
    vetores = []
    for smile in database[coluna]:
        progresso += passo
        print(f'Progresso: {round(progresso,3)}%', end='\r')
        estrutura = read(path+f"{smile}.mol")
        descritor = soap.create(estrutura)
        # Built-in sum iterates over the first axis of the descriptor array.
        vetores.append(sum(descritor))

    return vetores

In [None]:
# One summed SOAP vector per molecule of the filtered table.
# The third argument ("a") is accepted by creating_soaps but never used there.
soaps_sum = creating_soaps(df_filtrada,path_mol_files,"a")

Progresso: 100.0%%

In [7]:
# Canonicalise the SMILES of the filtered table; the returned frame carries
# them in a "canonical smiles" column instead of "smiles".
smiles_canonicos = getting_canonical_smiles(df_filtrada)

Progresso: 100.0%%

In [8]:
# Display the canonicalised table (the recorded output is truncated to its first and last rows).
smiles_canonicos

Unnamed: 0,mol_id,canonical smiles,A,B,C,mu,alpha,homo,lumo,gap,...,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,C,157.71180,157.709970,157.706990,0.0000,13.21,-0.3877,0.1171,0.5048,...,0.044749,-40.478930,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.643290,-401.014647,-372.471772
1,gdb_3,O,799.58812,437.903860,282.945450,1.8511,6.31,-0.2928,0.0687,0.3615,...,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
2,gdb_4,C#C,0.00000,35.610036,35.610036,0.0000,16.28,-0.2845,0.0506,0.3351,...,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
3,gdb_6,C=O,285.48839,38.982300,34.298920,2.1089,14.18,-0.2670,-0.0406,0.2263,...,0.026603,-114.483613,-114.480746,-114.479802,-114.505268,6.413,-358.756935,-360.512706,-362.291066,-340.464421
4,gdb_7,CC,80.46225,19.906490,19.906330,0.0000,23.95,-0.3385,0.1041,0.4426,...,0.074542,-79.764152,-79.760666,-79.759722,-79.787269,10.098,-670.788296,-675.710476,-679.860821,-626.927299
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50673,gdb_133870,C1C2OC3C4CC3(O2)C14,3.22990,2.024540,1.960070,1.7397,70.28,-0.2505,0.0710,0.3215,...,0.139275,-421.741500,-421.735945,-421.735001,-421.770998,24.855,-1683.452280,-1695.076257,-1704.559173,-1565.822580
50674,gdb_133871,C1C2OC3C4CC13C4O2,3.23767,2.022870,1.981900,2.1011,70.50,-0.2508,0.0836,0.3344,...,0.139644,-421.754627,-421.749139,-421.748195,-421.784075,24.583,-1691.689590,-1703.355610,-1712.838526,-1574.028515
50675,gdb_133875,C1C2C3C2C2C4OC12C43,3.51898,2.150950,1.876170,2.3177,73.27,-0.2283,0.0744,0.3027,...,0.139289,-384.596376,-384.591020,-384.590076,-384.625639,24.482,-1729.061516,-1740.810995,-1750.293911,-1611.540376
50676,gdb_133877,C1C2C3C2C2C4CC12C43,3.45974,2.118880,1.844500,0.2462,81.37,-0.2499,0.1023,0.3523,...,0.164037,-348.696763,-348.691301,-348.690357,-348.726069,25.376,-1928.388495,-1941.849191,-1952.518099,-1797.779400


In [2]:
# Canonical SMILES from two sources, compared as sets in the following cells:
# smiles_IPIII is the "Smiles-Canonical" column of a pickled table, smiles_IPII
# is the "canonical smiles" column produced by getting_canonical_smiles.
# NOTE(review): the recorded NameError ('pd' is not defined) shows this cell
# was last executed on a kernel where the import cell had not run yet.
smiles_IPIII = pd.read_pickle('dScribeSOAP_1_0 - QM9.pickle')["Smiles-Canonical"].tolist()
smiles_IPII = smiles_canonicos["canonical smiles"].tolist()

NameError: name 'pd' is not defined

In [134]:
# Rebind both names to sets so the next cells can use set difference.
smiles_IPIII = set(smiles_IPIII)
smiles_IPII = set(smiles_IPII)

In [135]:
# SMILES present in smiles_IPIII but missing from smiles_IPII (recorded result: the empty set).
smiles_IPIII - smiles_IPII

set()

In [8]:
# Ad-hoc PubChem lookup of a single SMILES string.
smile = "OCC1(O)C2C3OC2C31"

# NOTE(review): pubchempy is already imported as `pcp` in the first cell; this
# second alias is redundant and scatters imports through the notebook.
import pubchempy as pbp

# `a` receives the cid attribute of the first compound returned for `smile`.
a = pbp.get_compounds(smile, "smiles")[0].cid

In [1]:
# Size of smiles_IPII (a list or a set, depending on which cells have run).
# NOTE(review): the recorded NameError shows this cell was last executed on a
# kernel where smiles_IPII had not been defined yet.
len(smiles_IPII)

NameError: name 'smiles_IPII' is not defined

In [136]:
# Number of SMILES in smiles_IPII that are absent from smiles_IPIII (recorded result: 5437).
len(smiles_IPII - smiles_IPIII)

5437