In [158]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from ase.io import read
from dscribe.descriptors import SOAP
import pubchempy as pcp

In [80]:
# Input/output locations, relative to the notebook's working directory.
# Forward slashes are used because Python accepts them on Windows as well as on
# POSIX systems, whereas the backslash form only resolves on Windows.
path_excel_file = '../Bancos de Dados/qm9.xlsx'
# Must end with a separator: the file name is appended to this string by plain
# concatenation when the .mol files are written and read.
path_mol_files = '../Arquivos Mol/QM9/'

In [81]:
# Load the QM9 spreadsheet into a DataFrame.
df = pd.read_excel(path_excel_file)

In [173]:
def filtering_database(database, colunas):
    """Keep only the requested columns and drop molecules matching N or F.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.
    colunas : list of str
        Columns of the returned frame, in order. They are applied with
        ``reindex``, so a name absent from ``database`` yields an all-NaN
        column.

    Returns
    -------
    pandas.DataFrame
        New frame containing the surviving rows with their original index
        labels. ``database`` itself is not modified.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    coluna = "smiles"
    database_copy = database.reindex(columns=colunas)
    linhas = len(database)
    if linhas == 0:
        # Nothing to filter; also avoids dividing by zero in the progress report.
        return database_copy

    # Substructure patterns for nitrogen and fluorine.
    padroes = (Chem.MolFromSmiles("N"), Chem.MolFromSmiles("F"))

    # Build a positional keep-mask and filter once at the end. Iterating over
    # the values (instead of looking rows up by range(...) labels) works for
    # any index, and a single selection avoids copying the frame on every drop.
    manter = []
    for posicao, smile in enumerate(database[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(
                f"Could not parse SMILES {smile!r} (row position {posicao - 1})"
            )
        manter.append(not any(mol.HasSubstructMatch(padrao) for padrao in padroes))

    return database_copy.loc[manter]


def getting_canonical_smiles(database):
    """Return a copy of ``database`` with its SMILES replaced by RDKit's canonical form.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.

    Returns
    -------
    pandas.DataFrame
        Copy of ``database`` in which the ``"smiles"`` column holds the output
        of ``Chem.MolToSmiles`` and is renamed to ``"canonical smiles"``.
        Row order and index labels are preserved; the input is not modified.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    coluna = "smiles"
    nova_coluna = "canonical smiles"
    database_copy = database.copy()
    linhas = len(database_copy)

    # Iterate over the values rather than over range(...) labels so the
    # function also works when the index is not 0..n-1 (e.g. after filtering).
    canonicos = []
    for posicao, smile in enumerate(database_copy[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(
                f"Could not parse SMILES {smile!r} (row position {posicao - 1})"
            )
        canonicos.append(Chem.MolToSmiles(mol))

    if canonicos:
        # One column assignment instead of one .loc write per row.
        database_copy[coluna] = canonicos

    return database_copy.rename(columns={coluna: nova_coluna})


def chempy_existence(database):
    """Keep only the rows whose SMILES resolves to a PubChem compound id.

    One ``pcp.get_compounds`` lookup is made per row, so the run time grows
    with the number of rows.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.

    Returns
    -------
    pandas.DataFrame
        Copy of ``database`` restricted to the rows for which the first
        compound returned by PubChemPy has a non-``None`` ``cid``. Index
        labels are preserved; the input is not modified.
    """
    coluna = "smiles"
    linhas = len(database)
    if linhas == 0:
        # Nothing to look up; also avoids dividing by zero in the progress report.
        return database.copy()

    # Positional keep-mask: works for any index (the filtered table does not
    # necessarily have 0..n-1 labels) and avoids one frame copy per dropped row.
    existe = []
    for posicao, smile in enumerate(database[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        compostos = pcp.get_compounds(smile, "smiles")
        # An empty result list is treated the same as a compound without a cid.
        existe.append(bool(compostos) and compostos[0].cid is not None)

    return database.loc[existe].copy()


def creating_mol_files(database):
    """Write one ``.mol`` file with embedded 3D coordinates per SMILES.

    Each molecule is parsed, hydrogens are added, a conformer is generated
    with ``AllChem.EmbedMolecule`` and the result is written to
    ``f"{path_mol_files}{smile}.mol"`` (``path_mol_files`` is the module-level
    output prefix).

    Molecules whose embedding fails are skipped and reported at the end
    instead of being written without usable coordinates.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column holding one SMILES string per row.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the SMILES strings.
    """
    coluna = "smiles"
    linhas = len(database[coluna])
    falhas = []

    for posicao, smile in enumerate(database[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        mol = Chem.MolFromSmiles(smile)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(
                f"Could not parse SMILES {smile!r} (row position {posicao - 1})"
            )
        mol = Chem.AddHs(mol)
        # EmbedMolecule returns the id of the new conformer, or -1 on failure.
        if AllChem.EmbedMolecule(mol) < 0:
            falhas.append(smile)
            continue
        Chem.MolToMolFile(mol, f"{path_mol_files}{smile}.mol")

    if falhas:
        print(f"\n{len(falhas)} molecule(s) skipped, 3D embedding failed: {falhas}")

In [None]:
# Keep the id, SMILES and h298 columns, remove the molecules rejected by
# filtering_database, and cache the result on disk as a pickle.
colunas = ["mol_id", "smiles","h298"]
database_filtrado = filtering_database(df,colunas)
database_filtrado.to_pickle("qm9_filtrada.pickle")

In [None]:
# Reload the filtered table from the pickle file "qm9_filtrada.pickle".
df_filtrada = pd.read_pickle("qm9_filtrada.pickle")


Unnamed: 0,mol_id,smiles,h298
0,gdb_1,C,-40.475117
1,gdb_3,O,-76.400922
2,gdb_4,C#C,-77.304583
3,gdb_6,C=O,-114.479802
4,gdb_7,CC,-79.759722
...,...,...,...
50673,gdb_133870,C1C2C3CC4OC2C13O4,-421.735001
50674,gdb_133871,C1C2C3OC4CC13C2O4,-421.748195
50675,gdb_133875,C1C2C3C2C2C4OC12C34,-384.590076
50676,gdb_133877,C1C2C3C4C5CC13C2C45,-348.690357


In [None]:
# NOTE(review): `df_filtrada_smile_canonico` is not assigned in any cell visible
# in this notebook — confirm where it is defined; otherwise this cell raises
# NameError on a fresh kernel.
df_filtrada_smile_canonico

Progresso: 100.0%%

In [None]:
# Module-level SOAP descriptor; creating_soaps reads this global.
# Parameter notes follow the DScribe SOAP documentation — TODO confirm against
# the installed DScribe version.
soap = SOAP(
    species=['H', 'C', 'O'],  # chemical elements the descriptor is built for
    rbf="gto",  # radial basis function type
    r_cut= 6,  # cutoff radius of the local environment
    n_max= 1,  # number of radial basis functions
    l_max= 0,  # maximum degree of the spherical harmonics
    sparse=False  # dense (non-sparse) output
)


def creating_soaps(database, path, method):
    """Compute one summed SOAP vector per molecule from its ``.mol`` file.

    For every SMILES the file ``path + f"{smile}.mol"`` is read with
    ``ase.io.read``, passed to the module-level ``soap`` descriptor, and the
    rows of the result are added together with the built-in ``sum``.

    Parameters
    ----------
    database : pandas.DataFrame
        Table with a ``"smiles"`` column; each value names a ``.mol`` file.
    path : str
        Prefix prepended to the file name by plain concatenation, so it must
        end with a path separator.
    method : object
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    list
        One summed descriptor per row of ``database``, in row order. An empty
        ``database`` yields an empty list.
    """
    coluna = "smiles"
    linhas = len(database[coluna])
    lista_de_soaps = []

    # The percentage is computed inside the loop, so an empty table no longer
    # triggers a division by zero.
    for posicao, smile in enumerate(database[coluna], start=1):
        print(f'Progresso: {round(100 * posicao / linhas, 3)}%', end='\r')
        estrutura = read(path + f"{smile}.mol")
        # presumably one descriptor row per atom — confirm against DScribe docs
        descritores = soap.create(estrutura)
        lista_de_soaps.append(sum(descritores))

    return lista_de_soaps

In [None]:
# Build the summed SOAP vector of every molecule in the filtered table.
# The third argument ("a") fills the `method` parameter, which creating_soaps
# never reads.
soaps_sum = creating_soaps(df_filtrada,path_mol_files,"a")

Progresso: 100.0%%

In [176]:
# Look up every SMILES of the filtered table on PubChem. The return value is
# not assigned, and the stored output shows the run was stopped by a
# KeyboardInterrupt at 0.357 %.
chempy_existence(df_filtrada)

Progresso: 0.357%

KeyboardInterrupt: 

In [131]:
# Copy of the filtered table with the "smiles" column canonicalised and renamed
# to "canonical smiles".
smiles_canonicos = getting_canonical_smiles(df_filtrada)

Progresso: 100.0%%

In [132]:
# Display the canonicalised table.
# NOTE(review): the stored output contains a "soaps sum" column that no visible
# cell adds to df_filtrada — confirm where it comes from.
smiles_canonicos

Unnamed: 0,mol_id,canonical smiles,h298,soaps sum
0,gdb_1,C,-40.475117,"[100.56466134545224, 37.53798233106532, 0.0, 1..."
1,gdb_3,O,-76.400922,"[30.531902836068127, 0.0, 18.083297209623076, ..."
2,gdb_4,C#C,-77.304583,"[18.59233564365028, 21.660815881383385, 0.0, 3..."
3,gdb_6,C=O,-114.479802,"[27.05652741336845, 16.98334077113256, 8.03151..."
4,gdb_7,CC,-79.759722,"[142.68426187070418, 73.51303962197329, 0.0, 3..."
...,...,...,...,...
50673,gdb_133870,C1C2OC3C4CC3(O2)C14,-421.735001,"[163.39081185056708, 104.85995236610283, 17.54..."
50674,gdb_133871,C1C2OC3C4CC13C4O2,-421.748195,"[124.83605102623108, 98.76470366700215, 16.721..."
50675,gdb_133875,C1C2C3C2C2C4OC12C43,-384.590076,"[171.0832450305602, 218.03089998185135, 4.6392..."
50676,gdb_133877,C1C2C3C2C2C4CC12C43,-348.690357,"[197.0329675604557, 243.0867830040825, 0.0, 49..."


In [133]:
# Canonical SMILES stored in the reference pickle (IPIII) and the ones computed
# in this notebook (IPII), both as plain lists.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
smiles_IPIII = pd.read_pickle('dScribeSOAP_1_0 - QM9.pickle')["Smiles-Canonical"].tolist()
smiles_IPII = smiles_canonicos["canonical smiles"].tolist()

In [134]:
# Convert both lists to sets so they can be compared with set difference
# (duplicates are collapsed).
smiles_IPIII = set(smiles_IPIII)
smiles_IPII = set(smiles_IPII)

In [135]:
# SMILES present in the reference pickle but absent from this notebook's table.
smiles_IPIII - smiles_IPII

set()

In [155]:
# Spot check: look up the PubChem CID of a single SMILES string.
smile = "CC1OC23C=CC(C2)C13"

# NOTE(review): pubchempy is already imported as `pcp` in the first cell; this
# second alias is redundant and the import belongs in the imports cell.
import pubchempy as pbp

# Takes the cid of the first compound returned for this SMILES.
a = pbp.get_compounds(smile, "smiles")[0].cid

In [175]:
# True when PubChem returned no CID for the spot-check SMILES. Identity
# comparison is the idiomatic test for None.
a is None

True

In [137]:
# SMILES computed in this notebook that are absent from the reference pickle.
smiles_IPII - smiles_IPIII

{'CC1OC23C=CC(C2)C13',
 'COC12CC3CC1C32O',
 'C#CC12CCC1CC2O',
 'C#CCC12C3CC1C2O3',
 'O=CC12CC3CC(O1)C32',
 'CC1CCC2C3CC12O3',
 'C1COC2C3CC2(C1)O3',
 'C#CC12C3CC1C2O3',
 'COC1CC23CC(O2)C13',
 'CC1C2OC13CC(=O)C23',
 'CC1CC23CCC12CO3',
 'CC12CC1C1(C)OCC21',
 'C1CC2CC3OCC23C1',
 'OC1CC2(O)C(O)CC12',
 'CC1CC2C1OC2(C)C',
 'OCC12OC1C1CCC12',
 'OCC1C2OC3C1C23O',
 'CC1C(C)C2OCC12',
 'COC1C2C3CC2C31C',
 'C1=CC2C3CC24C1OC34',
 'CC12COC1C1CC2O1',
 'CC1OC2CCC12C',
 'CC12CCCC3C1CC32',
 'CC12CC(=O)C1C1OC12',
 'CC1(O)C2OC3C2OC31',
 'C1OC1C12OC3C1CC32',
 'CC1CC2(CO)OCC12',
 'CC1=CC23OC4C2C1C43',
 'CCC1C2CC3C1C23C',
 'CC1C2C3C(C)C2C13',
 'C1C2CC3C4OC(C14)C23',
 'O=C1CC2(C1)C1CC2C1',
 'CC1(O)C2CC3C2CC31',
 'CC12CC3CC1(CO2)C3',
 'OC1C2=CCC3C2C13',
 'CC12CCC1CC21CC1',
 'CC1CC2CC(C)(O)C12',
 'CC12OC1C1OCC12O',
 'O=C1CC2C3CC(C3)C12',
 'C1OC2CC3(COC23)O1',
 'C1=CC2C3C1C1CC3C21',
 'OCC12CC3C(O1)C32',
 'CC1OC2C(C)C(C)C12',
 'O=CC1OC23CC(C2)C13',
 'O=CC1=CC2C3OC2C13',
 'CC1OC2C3CC3C12O',
 'CC1C2COC3CC2C31',
 'OC

In [136]:
# Number of SMILES found only in this notebook's table.
len(smiles_IPII - smiles_IPIII)

5437