In [1]:
import numpy as np
import pandas as pd
from classes.descriptors import Smiles2Descriptors, VectorSmiles2Descriptors
from rdkit import Chem

In [2]:
# Read the source table (component names with their SMILES strings) from CSV.
source_csv = 'data/NTO_smiles_encoded.csv'
teo_data = pd.read_csv(source_csv)

# Bare last expression so the notebook renders the loaded frame.
teo_data

Unnamed: 0,component_name,smiles
0,cf40823a-4b86-415c-b841-a9ead84d9fcf,CCCCC(C)C
1,6c3c9b13-07ee-434a-9d7e-e2e37b83a023,CCCCCCC(C(C(C)C)CCCCCC)CC(C(C)C)CCCCC
2,0c047bc0-0041-45f7-9222-d2ec5fd2df38,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC
3,7bc3fcf4-7f3f-4ff6-9bdd-a62fc39828d3,CCCCCCC(CCCCCCCC)CCCCCCCC
4,240187a5-a150-4a92-9e53-8d964dedf3fb,CC1=CC2=NNN=C2C=C1.C(CO)N(CCO)CCO
5,9c948f48-0165-4291-8db6-bfe73a534785,CCCCCC(C(OC)=O)CCCC
6,a9c1305e-3832-4ddc-a6c4-24e3a649d2e6,CC1CCCCC1C
7,18cc80c2-b2dd-4565-b54c-7b55924b5b43,O=S(C1=CC=C([C18H21])C=C1)(O)=O
8,435358c0-8f5b-4e51-8f44-726273e02756,CCCCCCCCC(C(CC)CCCCC)CC(C)CCCC
9,088905d1-991f-479d-a9d1-fab9abb0b47d,CCCCCCC(C(C)CCC)CC(C)CCCCCC


In [3]:
# Work on an explicit, independent copy of the loaded table. A later cell adds
# descriptor columns to `df`; re-wrapping with pd.DataFrame(teo_data) does not
# guarantee a copy (depending on the pandas version the two frames may share
# data), so those columns could otherwise show up on `teo_data` as well.
df = teo_data.copy()

In [4]:
def calculate_descriptors(smiles):
    """Return the ``(logp, tpsa, molwt)`` descriptors for one SMILES string.

    A ``Smiles2Descriptors`` object is built from ``smiles`` and its ``logp``,
    ``tpsa`` and ``molwt`` attributes are returned as a 3-tuple.

    Failures never propagate to the caller: a ``ValueError`` and any other
    ``Exception`` are each reported with their own printed message, and in
    both cases ``(None, None, None)`` is returned instead.
    """
    # Placeholder result handed back whenever the descriptors cannot be computed.
    no_result = (None, None, None)

    try:
        mol_descriptors = Smiles2Descriptors(smiles)
        values = (
            mol_descriptors.logp,
            mol_descriptors.tpsa,
            mol_descriptors.molwt,
        )
    except ValueError as err:
        print(f"Ошибка при обработке SMILES: {err}")
        return no_result
    except Exception as err:
        # Catch-all so a single bad row does not abort a whole-column apply.
        print(f"Неизвестная ошибка: {err}")
        return no_result

    return values

In [5]:
# Compute one (logp, tpsa, molwt) tuple per SMILES, then transpose the tuples
# into three parallel sequences, one per descriptor column.
descriptor_rows = df["smiles"].apply(calculate_descriptors)
logp_values, tpsa_values, molwt_values = zip(*descriptor_rows)

df["logp"] = logp_values
df["tpsa"] = tpsa_values
df["molwt"] = molwt_values

# Display the result
df

Ошибка при обработке SMILES: Неверный SMILES
Ошибка при обработке SMILES: Неверный SMILES


[11:17:24] SMILES Parse Error: syntax error while parsing: O=S(C1=CC=C([C18H21])C=C1)(O)=O
[11:17:24] SMILES Parse Error: check for mistakes around position 15:
[11:17:24] O=S(C1=CC=C([C18H21])C=C1)(O)=O
[11:17:24] ~~~~~~~~~~~~~~^
[11:17:24] SMILES Parse Error: Failed parsing SMILES 'O=S(C1=CC=C([C18H21])C=C1)(O)=O' for input: 'O=S(C1=CC=C([C18H21])C=C1)(O)=O'
[11:17:24] Explicit valence for atom # 6 C, 5, is greater than permitted


Unnamed: 0,component_name,smiles,logp,tpsa,molwt
0,cf40823a-4b86-415c-b841-a9ead84d9fcf,CCCCC(C)C,2.8326,0.0,100.205
1,6c3c9b13-07ee-434a-9d7e-e2e37b83a023,CCCCCCC(C(C(C)C)CCCCCC)CC(C(C)C)CCCCC,10.0582,0.0,380.745
2,0c047bc0-0041-45f7-9222-d2ec5fd2df38,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC,10.1004,0.0,366.718
3,7bc3fcf4-7f3f-4ff6-9bdd-a62fc39828d3,CCCCCCC(CCCCCCCC)CCCCCCCC,9.0742,0.0,324.637
4,240187a5-a150-4a92-9e53-8d964dedf3fb,CC1=CC2=NNN=C2C=C1.C(CO)N(CCO)CCO,-0.46838,105.5,282.344
5,9c948f48-0165-4291-8db6-bfe73a534785,CCCCCC(C(OC)=O)CCCC,3.5461,26.3,200.322
6,a9c1305e-3832-4ddc-a6c4-24e3a649d2e6,CC1CCCCC1C,2.8326,0.0,112.216
7,18cc80c2-b2dd-4565-b54c-7b55924b5b43,O=S(C1=CC=C([C18H21])C=C1)(O)=O,,,
8,435358c0-8f5b-4e51-8f44-726273e02756,CCCCCCCCC(C(CC)CCCCC)CC(C)CCCC,9.1761,0.0,338.664
9,088905d1-991f-479d-a9d1-fab9abb0b47d,CCCCCCC(C(C)CCC)CC(C)CCCCCC,8.0058,0.0,296.583


In [6]:
# Keep only complete rows: any row holding a missing value in any column is
# dropped (this removes the rows whose descriptors could not be computed).
final = df.dropna(axis=0, how="any")

# Bare last expression so the notebook renders the filtered frame.
final

Unnamed: 0,component_name,smiles,logp,tpsa,molwt
0,cf40823a-4b86-415c-b841-a9ead84d9fcf,CCCCC(C)C,2.8326,0.0,100.205
1,6c3c9b13-07ee-434a-9d7e-e2e37b83a023,CCCCCCC(C(C(C)C)CCCCCC)CC(C(C)C)CCCCC,10.0582,0.0,380.745
2,0c047bc0-0041-45f7-9222-d2ec5fd2df38,CCCCCCCCC(CCCCCC)CC(C)CCCCCCCC,10.1004,0.0,366.718
3,7bc3fcf4-7f3f-4ff6-9bdd-a62fc39828d3,CCCCCCC(CCCCCCCC)CCCCCCCC,9.0742,0.0,324.637
4,240187a5-a150-4a92-9e53-8d964dedf3fb,CC1=CC2=NNN=C2C=C1.C(CO)N(CCO)CCO,-0.46838,105.5,282.344
5,9c948f48-0165-4291-8db6-bfe73a534785,CCCCCC(C(OC)=O)CCCC,3.5461,26.3,200.322
6,a9c1305e-3832-4ddc-a6c4-24e3a649d2e6,CC1CCCCC1C,2.8326,0.0,112.216
8,435358c0-8f5b-4e51-8f44-726273e02756,CCCCCCCCC(C(CC)CCCCC)CC(C)CCCC,9.1761,0.0,338.664
9,088905d1-991f-479d-a9d1-fab9abb0b47d,CCCCCCC(C(C)CCC)CC(C)CCCCCC,8.0058,0.0,296.583
10,2b2de7e7-be33-4505-ab93-50234b79f528,CCCCCCCCC(CCCCC)CCCCCC,7.9039,0.0,282.556


In [7]:
# Persist the cleaned table; index=False keeps the row index out of the file.
output_csv = 'data/correct-teo-smiles.csv'
final.to_csv(output_csv, index=False)