In [2]:
import ast

import pandas as pd

In [4]:
# Run SPMM's smiles2emb.py on the exported CSV using the pretrained checkpoint.
# The command first changes into SPMM/, so the relative --output_file is written
# to SPMM/lb400_embedded.csv (the path read back in the next cell).
!cd SPMM && python "smiles2emb.py" --input_file "../data/2025-04-13T06-47_export.csv" --output_file "lb400_embedded.csv" --seed 42 --checkpoint "../checkpoint_SPMM.ckpt" --device "cuda"

seed: 42
Loading CSV data
Creating model
LOADING PRETRAINED MODEL..
load checkpoint from ../checkpoint_SPMM.ckpt
_IncompatibleKeys(missing_keys=[], unexpected_keys=['temp'])
Generating SMILES embeddings...
Processed 100/411 SMILES
Processed 200/411 SMILES
Processed 300/411 SMILES
Processed 400/411 SMILES
Processed 411/411 SMILES
CSV with embeddings saved to lb400_embedded.csv


In [5]:
# Read back the table written by the smiles2emb.py run above.
data = pd.read_csv(filepath_or_buffer='SPMM/lb400_embedded.csv')

In [6]:
# Every 'emb' cell holds the text form of a Python sequence (it is expanded into
# one feature column per element later on). Parse it with ast.literal_eval
# instead of eval: literal_eval only accepts Python literals, so a malformed or
# tampered CSV cell raises an error instead of executing arbitrary code.
data['emb'] = data['emb'].apply(ast.literal_eval)

# Keep only the model input (embedding) and the regression target.
data = data[['emb', 'PDSC']]

In [7]:
# Discard rows whose PDSC target is exactly 0 before fitting.
# NOTE(review): presumably 0 marks a missing/failed measurement — confirm.
data = data.loc[data['PDSC'].ne(0)]

In [8]:
import os
import joblib
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [9]:
# Fit one Ridge regressor per CV fold on the embedding features, collect the
# out-of-fold predictions, and persist every fold model for later ensembling.
os.makedirs('models_ridge_400lb', exist_ok=True)

# Expand the per-row embedding sequences into a feature table with columns
# emb_0, emb_1, ... (fresh 0..n-1 row labels; all fold indexing below is
# positional via .iloc).
emb_df = pd.DataFrame(data['emb'].tolist()).rename(columns=lambda i: f'emb_{i}')
X = emb_df

y = data['PDSC']

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# One slot per training row; each slot is filled exactly once, by the fold in
# which that row is held out.
oof_predictions_ridge = np.zeros(len(data))

for fold, (train_index, test_index) in enumerate(kf.split(X)):
    model_ridge = Ridge(alpha=5, random_state=42).fit(
        X.iloc[train_index], y.iloc[train_index]
    )

    oof_predictions_ridge[test_index] = model_ridge.predict(X.iloc[test_index])

    joblib.dump(model_ridge, f'models_ridge_400lb/ridge_model_fold_{fold}.joblib')

In [10]:
# Score the pooled out-of-fold predictions against the true targets.
mse, mae, r2 = (
    metric(y, oof_predictions_ridge)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)
rmse = np.sqrt(mse)

print(f'''
Ridge Metrics
- MSE: {mse:.4f}
- RMSE: {rmse:.4f}
- MAE: {mae:.4f}
- R²: {r2:.4f}
''')


Ridge Metrics
- MSE: 689.5391
- RMSE: 26.2591
- MAE: 16.6126
- R²: 0.6807



In [12]:
from rdkit import Chem

# Directories for the screening stage: input CSVs with embeddings, the output
# directory for scored CSVs, and the directory holding the saved fold models.
folder_path = 'SPMM/valid_molecules_with_emb_csv'
output_path = 'SPMM/valid_molecules_with_emb_csv_LBPROB_PDSC_400'
models_path = 'models_ridge_400lb'

os.makedirs(output_path, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so the ensemble is
# assembled in the same order on every run.
model_files = sorted(f for f in os.listdir(models_path) if f.endswith('.joblib'))

# Fail fast: with no models, averaging an empty prediction list downstream would
# silently produce NaN instead of an error.
if not model_files:
    raise FileNotFoundError(f"no .joblib models found in {models_path!r}")

# NOTE: joblib.load unpickles the files, so only load models this notebook wrote.
models = [joblib.load(os.path.join(models_path, model_file)) for model_file in model_files]

In [13]:
def prepare_for_inference(df):
    """Expand the 'emb' column of ``df`` into a standalone feature table.

    Each entry of ``df['emb']`` is treated as one row of values. The result has
    one column per position, named ``emb_0``, ``emb_1``, ..., and a fresh
    0..n-1 row index.
    """
    features = pd.DataFrame(list(df['emb']))
    features.columns = [f'emb_{pos}' for pos, _ in enumerate(features.columns)]
    return features

In [14]:
# (flag, SMARTS) pairs in priority order: the first matching pattern wins, so a
# molecule carrying both groups is reported as 'aromaamin'.
_FLAG_SMARTS = (
    ('aromaamin', 'c-[NH2,NH1,NH0;!$(NC=O);!$(N=*);!$(N#*)]'),
    ('fenol', 'c-[OH]'),
)
# Compiled patterns, filled on first use so the SMARTS are parsed once rather
# than on every call.
_FLAG_PATTERNS = []


def assign_flag(smiles):
    """Classify a molecule by the functional group used for screening.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    str
        ``'aromaamin'`` if an aromatic carbon is single-bonded to a non-aromatic
        nitrogen that is not bonded to a C=O carbon and has no double or triple
        bond; otherwise ``'fenol'`` if an aromatic carbon is single-bonded to an
        O-H; otherwise ``'none'``. ``'none'`` is also returned for input that is
        not a non-empty string (e.g. NaN from a blank CSV cell) and for SMILES
        that RDKit cannot parse, so one bad row does not abort a whole batch.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return 'none'

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit returns None (rather than raising) for unparsable SMILES.
        return 'none'

    if not _FLAG_PATTERNS:
        _FLAG_PATTERNS.extend(
            (flag, Chem.MolFromSmarts(smarts)) for flag, smarts in _FLAG_SMARTS
        )

    for flag, pattern in _FLAG_PATTERNS:
        if mol.HasSubstructMatch(pattern):
            return flag
    return 'none'

In [15]:
from tqdm import tqdm

# Score every generated-molecule CSV with the Ridge fold ensemble and write one
# "processed_<name>" CSV per input file into output_path.
for file_name in tqdm(os.listdir(folder_path)):
    if not (file_name.endswith('.csv') and file_name.startswith('valid_generated_molecules_')):
        continue

    df = pd.read_csv(os.path.join(folder_path, file_name))
    # literal_eval only accepts Python literals, so unlike eval it cannot
    # execute code embedded in a CSV cell.
    df['emb'] = df['emb'].apply(ast.literal_eval)

    df['flag'] = df['Smiles'].apply(assign_flag)

    # Keep only molecules that carry one of the flagged functional groups.
    df = df[df['flag'] != 'none'].reset_index(drop=True)

    if df.empty:
        # Nothing to score: predicting on zero rows would raise, so emit an
        # empty PDSC column and still write the (header-only) output file.
        df['PDSC'] = np.array([], dtype=float)
    else:
        # Use a dedicated name so the training matrix `X` is not overwritten.
        features = prepare_for_inference(df)
        # Ensemble prediction = mean over the fold models.
        df['PDSC'] = np.mean([model.predict(features) for model in models], axis=0)

    output_file_path = os.path.join(output_path, f"processed_{file_name}")
    # The embedding column is only needed for prediction; leave it out of the output.
    df.drop(columns=['emb']).to_csv(output_file_path, index=False)

100%|██████████| 13/13 [00:04<00:00,  3.11it/s]


In [16]:
# Read the scored CSVs back in. Only *.csv entries are read, so stray directory
# entries (e.g. a Jupyter-created .ipynb_checkpoints folder) do not crash
# read_csv, and names are sorted because os.listdir order is arbitrary.
processed_dir = 'SPMM/valid_molecules_with_emb_csv_LBPROB_PDSC_400/'
dfs = [
    pd.read_csv(processed_dir + file)
    for file in sorted(os.listdir(processed_dir))
    if file.endswith('.csv')
]

In [30]:
# Stack the per-file tables row-wise into one frame (original row labels are kept).
df_final = pd.concat(dfs, axis='index')

In [31]:
# Rank candidates from highest to lowest predicted PDSC.
df_final = df_final.sort_values('PDSC', ascending=False)

In [32]:
# SMILES strings to exclude from the final candidate list: the next cell filters
# df_final with an exact-string `isin` match against this list. Some entries are
# repeated, which has no effect on that membership test.
# NOTE(review): presumably these are molecules already selected in earlier
# rounds — confirm.
used = ['CCCCC(CC)COC(=O)CNc1cccc(OC)c1OC',
 'CCCCCCCCCCCCN(C)Cc1cccc(NCC(O)c2ccc(OC)cc2)c1',
 'CCNc1ccc(CN(C)CCOCCOC)cc1C',
 'CCCN(CCC)CC(O)CNCc1ccc(-c2ccc(N(CC)CC)cc2)nc1',
 'CCCCN(CCC)c1cc(CN(CCO)CCOC)cc(Nc2ccc(C)cc2)c1',
 'CCOc1cc(C)c(CNCc2ccc(N(CC)CC)cc2)c(C)c1OCCNCC',
 'CCc1ccc(N(C)CCC(=O)NCCCOC(C)C)cc1',
 'CCc1ccc(N(CCO)CCOCc2cccc(CNC)c2)cc1',
 'CCCCCCN(CCN(CC(C)C)CCC)c1ccc(Nc2n[nH]c(C)c2C)cc1',
 'CCc1cccc(CC)c1NCC(O)CN(C)Cc1ccc(OC)cc1',
 'CCCCCCC(O)CN(CC(O)c1ccccc1)Cc1ccc(N(CC)CC)cc1',
 'CCCCCCCCCCCCN(CCO)c1ccc(NCc2ccc(N(C)C)cc2)cc1',
 'CCCN(CCC)c1nc2ccccc2nc1NCCCN(CCC)CCCC',
 'CCNC(=NCC(CC)(CC)CC)NCc1ccc(N(C)Cc2ccco2)cc1',
 'Cc1c(CNCCCCCCNC(C)C)cccc1N(Cc1cccn1C)C(C)C',
 'CCOCCCN(CCC)Cc1ccc(CNCCc2ccc(O)cc2)cc1',
 'CCCOc1ccc(NC(C)CCCN(C)C)nc1',
 'CCCCNC(CO)c1ccc(OCc2ccc(N(C)C)cc2)cc1',
 'CCCCCCCCCCCCCCC(CO)Nc1ccc(-c2ccc(OC)cc2)cn1',
 'CCCCN(C)c1ccc(NCC(C)(C)OC)nc1',
 'CCCCCCN(CCO)c1ccc(NCCc2ccc(OC)cc2)cc1',
 'CCN(CC)CCCNC(C)c1ccc(NCc2ccco2)cc1',
 'CCCCN(CCCC)c1ccc(N(CCCC)CCCO)cc1NCCCO',
 'CCCCCCCCC(CNC(=S)Nc1ccccc1)N(C)Cc1cccc(OC)c1',
 'CCN(CC)CCCNCc1ccc(N(CC)CCCc2ccc(C)cc2)cc1',
 'CCCCN(CCCC)c1cccc(NCC(O)c2cccn2C)c1',
 'CCCCN(C)c1ccc(NC(=O)COC(C)C)cc1',
 'CCCCN(CCCC)CC(O)CNC(C)c1ccc(N(CC)CC)c2ccccc12',
 'CCCCCCCCN(CC(=O)OC)c1ccc(O)cc1',
 'CCCCCCCCCCCCCCNc1cc(-c2ccc(N(C)C)cc2)c(CO)cn1',
 'CCCCC(CC)CN(CCCCCC)c1nc(C)c(C)c(Nc2ccc(C)cc2)n1',
 'CCCCCCN(CCO)c1ccc(-c2ccc(C(C)NC(C)CCC)cc2)nc1',
 'CCCN(CCC)CCOc1cccc(CNCc2ccccc2O)c1',
 'CCCCC(CC)CNC(=NC)NCC(c1ccc(OC)cc1)N(CC)c1ccccc1',
 'CCCN(CCC)C(CO)c1ccc(NCCc2ccc(OC)cc2)cc1',
 'CCCCN(CCC)c1ccc2cc(O)ccc2c1CN(CCC)CCCCNC',
 'CCCCCCCCCCCCOc1ccc(-c2ccc(NCCO)c(OCC)c2)cc1',
 'CCCCCCNCc1cncc(N(CCCCCC)Cc2cc(C)cc(C)c2)c1',
 'CCCCCCCCCCc1cc2ccc(NCCCCCC)cc2n1CCN(C)C',
 'CCCN(CCC)CCCNc1cc(N(CCC)CCC)c2ccccc2n1',
 'CCCNCCCCN(CCCC)CCc1c[nH]c2cc(N(CC)CC)ccc12',
 'CCc1cc(Cc2ccc(OCCCCCCN(CC)CC)cc2)c(O)c(C(C)C)c1O',
 'CCCCCCCCCCCCCCCCNc1ncc(-c2ccc(N(C)C)cc2)[nH]1',
 'CCCCCCN(CCCCCCCC)c1ccc(Nc2cc(CO)ccc2C)nc1',
 'CCCCN(CCCC)CCCNCc1ccc(O)c2ncccc12',
 'CCCCN(CCCC)CCNCCc1c(C)cc(C)cc1-c1ccc(N(C)C)cc1',
 'CCc1ccc(NC(c2cccc(OC)c2O)N(C)CCC(C)C)cc1',
 'CCNC(=NCc1ccc(N(CC)CC)cc1)NCCc1cccnc1',
 'CCCCC(C)(C)CN(CCO)c1ccc(-c2ccc(OCCCCCC)cc2)cc1O',
 'CCCCCCCCCCNCc1ccc(-c2ccc(N(CC)CC)cc2)nc1',
 'CCCCCCCCc1ccc(N(CCCCC)c2ccc(N(CCO)CCO)cc2C)cc1',
 'CCCCCCN(CCCCC)c1cc(CNCCC)cc(-c2ccc(C)cc2)n1',
 'CCCN(CCC)c1ccc(NCC(O)c2ccc(OC)cc2)c(C)c1',
 'CCCCCCN(CCCC)c1cc(CNC(C)CC)c(O)c(-c2ccccn2)c1',
 'CCc1cc(CN(CCO)CCO)ccc1-c1ccc(N(CC)CC)cc1',
 'CCCCCCCCCCCC(C)NC(=S)Nc1cc(C(C)(C)C)nn1Cc1ccccc1',
 'Cc1cc(N(CCO)CCO)c(OCCC(C)C)c(-c2ccc(CCCCC)cc2)c1',
 'CCCCN(CCCC)c1ccc(-c2ccc(CNCCCCCC)cc2)nc1',
 'CCCCCCN(CCC)c1cc(CC)nc(NCc2ccc(CCCC)cc2)c1',
 'CCCCCCCCCCNc1nc(CCCCCCC)cc(-c2ccc(C)cc2)n1',
 'CCCCN(CCCC)c1nc(CC)c(CC)c(CC)c1-c1cc(C)c(C)cc1NC',
 'CCCN(CCC)c1cc(CNC(C)C)c(-c2cc(C)c(C)cc2C)c(C)c1',
 'CCCCN(CCCCCC)c1nc(C)cc(C)c1-c1c(C)cc(C)cc1CC',
 'CCc1cc2c(NCCCN(CC)CC)ccc(N(CCC)CCC)c2cc1CC',
 'CCCCCCc1c(CCCC)n(CCCCCC)c2ccc(NCCC)cc12',
 'CCCCN(CCCC)c1nc(CC)c(CC)c(NCc2ccc(C)cc2)c1C',
 'CCCCCCn1cc(CCCC)c(-c2ccc(CCCCCC)cc2)c1NCCC',
 'CCCCCCN(CCCC)c1ccc(-c2[nH]c(C)c(C(C)C)c2CC)cc1',
 'CCCCCCc1ccc(-c2ccc(N(CCCC)CCC)c(CNCCC)c2)cc1',
 'CCCCCCN(CCCCCC)c1ccc(-c2ccc(NCCC)c(CC)c2)cc1',
 'CCCCCCN(CCCC)c1nc(C)cc(Nc2c(C)cc(CC)cc2C)c1C',
 'CCNc1nc(C)cc(-c2ccc(CN(CCC)CCCC)c(CC)c2)c1C',
 'CCCCCCCCNc1c(CC)cc(-c2ccc(N(CCC)CCCC)cc2)cc1C',
 'CCCCCCN(CCCCC)c1ccc(-c2[nH]c(C)c(C)c2CCCC)cc1',
 'CCCCCN(CCCCC)c1ccc(-c2[nH]c(C)c(CCC)c2C)cc1',
 'CCCCCCCCCNc1c(CC)c(CC)n(-c2ccc(C)cc2C)c1C',
 'CCCCN(CCCC)c1cc(NC(CC)CC)cc(-c2ccc(CC)cc2)n1',
 'CCCCCC(CC)N(Cc1ccccc1)Cc1c[nH]c2c(C)ccc(N(CC)CC)c12',
 'CCCCCCN(CCCCCC)c1ccc(-c2[nH]c(CCCCC)nc2CC)cc1',
 'CCCCCN(CCCCC)c1cc(CNCCC)cc(-c2ccc(C)cc2)c1',
 'CCCN(CCC)c1cc(-c2ccc(NCCN(CC)CC)cc2)cc(C)c1C',
 'CCCCN(CCCC)c1ccc(Nc2ccc(N(CC)CC)cc2C)c(C)c1',
 'CCc1ccccc1NC(c1ccc(CC)cc1)c1c(C)cc(C)cc1C',
 'CCCCCCCCN(CCCCC)c1nc(-c2cc(C)c(C)c(C)c2)c(C)[nH]1',
 'CCCCCC(c1ccc(O)c(OCCCCCCC)c1)c1ccc(O)c(CCCC)c1',
 'CCCCN(CCCC)c1nc(CC)c(CC)cc1CNCc1ccccc1',
 'CCCCCCc1ccc(-c2ccc(NCCCCCC)c(OCCCCC)c2)cc1C',
 'CCCCCCCc1c(C)cc(O)c(OCCCCC)c1CCCCCNCCCC',
 'CCCCCCCCCCCCNc1nc(C)c(-c2ccc(C)cc2C)c(C)c1C',
 'CCCCCCN(CCCCC)CCCNc1nc(-c2ccc(C)cc2)cc(C)c1C',
 'CCCCCCCNCc1c(C)cc(-c2ccc(N(CC)CC)cc2)nc1C(C)C',
 'CCCCCCCCCCCNc1nc(-c2cc(C)cc(C)c2)cc(C)c1C(C)C',
 'CCCCCCN(CCCCCC)c1cc(CNCCc2ccc(C)cc2)cc(C)c1',
 'CCCCCCN(CCCCC)Cc1cc(NCc2ccc(C)cc2C)cc(C)c1',
 'CCC(C)NC(c1ccc(C)cc1)c1ccc2cc(N(CC)CC)ccc2c1',
 'CCCCCCCCCCNc1cc(-c2ccc(N(CC)CC)cc2)cc(C)c1C',
 'CCCCCCCCCCN(CC)c1cc(Nc2ccc(C)cc2C)cc(C)c1C',
 'CCCN(CCC)Cc1ccc(-c2cnc(NCCCCCCCCC)c(CC)c2)cc1',
 'CCCCCCCCCCc1nc(Nc2cc(C)cc(C)c2)cc(C)c1CCCCC',
 'CCCCCCCCNCc1c(C)cc(C)cc1N(CCC)Cc1ccc(C)cc1',
 'CCCCNCc1c(C)cc(C)cc1N(c1ccccc1)c1ccccc1',
 'CCCCCN(CCO)c1ccccc1C',
 'CCCCN(c1ccccc1)C(C)CCO',
 'CCCCCCCCCCCCCCCCOc1c(O)c(OC)cc2ccccc12',
 'CCCCCCCCN(CCCCCCCCCC)c1ccc(Nc2ccc(O)c(C)c2)cc1',
 'CCCCCCCCN(CC(O)CO)c1ccc(-c2ccc(CCCC)cc2)cc1C',
 'CCCCCCCCCCCCCN(CCCCO)c1ccc(-c2ccccc2O)cc1',
 'CCCCCCCCCCn1c(C)c(C)c2cc(NCCN(C)C)ccc21',
 'CCCCCCCCCc1c(C)[nH]c(-c2ccc(N(CC)CC)cc2)c1C',
 'CCC(O)CCCCN(CC)c1ccccc1',
 'CCCCCCCCN(CCCCCCCC)c1ccc(NCc2ccc(C)cc2C)cc1',
 'Cc1c(CNCC(O)c2ccccc2)cccc1N(CC(C)C)CC(C)C',
 'CCC(CC)(c1ccccc1)c1ccc(NCc2ccc(C)cc2)cn1',
 'CCCCCCCCCCCCCOc1cc(CNC(C)C)c(O)c(C(C)(C)C)c1',
 'CCCNC(Cc1ccc(N(CC)CC)cc1)c1cccc2ccccc12',
 'Cc1ccccc1Nc1ccccc1',
 'CCCC(C)NC(C)Cc1ccccc1O',
 'CCCCCCCCCCCCCCCCNc1cc(-c2ccccc2)cc(C)n1',
 'CCCCc1ccc(-c2cc(NC(CC)c3ccccc3)ccn2)cc1',
 'Cc1ccc(C)c(O)c1C',
 'CCCCCCCCCCCC(NCCC)c1ccc2cc(O)ccc2c1OCCC',
 'CCCCCCCCCc1ccc(-c2ccc(N(CCO)CCO)cc2)cc1',
 'CCCCCC(CCO)c1ccc(N(C)Cc2ccc(CNCCC)cc2)cc1CC',
 'CCCCCCCCN(CCCCCCC)c1ccc(NCc2ccc(O)cc2)cc1',
 'CCN(CC)CCCNC(c1ccccc1)c1ccc(O)c(OC)c1',
 'CCCCCCCCCC(CN(CCO)Cc1ccccc1)c1ccc(O)cc1',
 'CCCCCCCCCCCCCOc1ccccc1CNC(=S)Nc1ccc(CC)cc1',
 'CCCCCCCCCCCCNC(=NCc1ccc(O)cc1)NCc1ccccc1',
 'CCCCCCCCCCCCNCc1cc(-c2ccc(O)cc2)c(OC)c(CC)c1',
 'CCCCCCCCOc1ccc(CNc2cc(C(C)(C)C)c(O)c(CC)c2)cc1',
 'CCCCCCCCCCCCc1c(O)ccc2c(O)cc(OCCCCCCCC)cc12',
 'CCCCCCCCCCc1nc(O)c(Cc2cc(C)c(O)cc2C)cc1CCCCC',
 'CCCCCCCCNc1cc(-c2ccc(-c3ccc(CCCCCC)cc3)cc2)nc(C)n1',
 'CCCCN(CCCC)c1ccc(-c2ccc(CNCCO)cc2)cc1',
 'CCCCCCCCCCN(C)c1cc(CNCc2ccc(C)c(C)c2)cc(C)c1C',
 'CCCC(C)CCc1ccc(-c2ccc(N(CCO)CCO)cc2)cc1',
 'CNc1ccccc1-c1ccccc1',
 'CNc1ccccc1-c1ccccc1',
 'CNc1ccccc1-c1ccccc1',
 'CCCCCCCCNc1nc(C)cc(C)c1-c1ccc(N(CCC)CCC)cc1',
 'CCCCN(CCCC)c1ccc(-c2ccc(NCCO)cc2)cc1',
 'CCNC(Cc1ccc(N(C)C)cc1)c1cc2cc(C)ccc2n1CC',
 'COc1cc(C)c(CN(Cc2ccccc2O)CC(C)C)c(C)c1O',
 'CCCN(CCC)c1ccc(-c2ccc(CNCCO)cc2)cc1',
 'CCCCCCCCCCCCNC(=S)Nc1c(C)cc(C)cc1-c1ccccc1OC',
 'CCCCCCCCCCCCN(c1ccc(O)cc1)c1ccc(O)c(C(C)(C)C)c1',
 'CCCc1c(CC)c(CNC(C)Cc2ccc(O)cc2)c(C)n1C',
 'CNc1ccccc1C',
 'CCN(CC)c1ccc(-c2ccc(NC(C)c3ccc(C)cc3)cc2)cc1',
 'CCCCN(CCC)CCOc1ccc(-c2cc(C(C)(C)C)c(O)c(C)c2)cc1',
 'CCC(CC)CN(CC)CCN(CC)c1ccc(Nc2ccc(C)cc2C)cc1',
 'CCc1cc(C)c(O)cc1C',
 'CCCCC(C)c1ccc(O)c(OCC)c1',
 'Cc1cc(C)c(O)c(C)c1',
 'CNC(CCCCc1ccc(N(C)C)cc1)c1cccc2ccn(C)c12',
 'CCCCN(CCCC)c1cc(CNCc2ccc(O)cc2)ccc1C',
 'CCCc1c(O)cccc1C',
 'CCc1cccc(O)c1C',
 'CCc1cccc(O)c1C',
 'CCc1cccc(O)c1C',
 'CCCCCCCCCCCCCCCCNC(=S)Nc1ccccc1-n1cccc1',
 'CCCCCCCCCCCCCNC(c1ccc(O)cc1)c1ccc(OC)cc1',
 'Cc1cccc(O)c1C',
 'CNc1cc(C)ccc1C',
 'CCc1cc(C)cc(C)c1O',
 'CCc1cc(C)cc(C)c1O',
 'CCc1cc(O)c(C)cc1C',
 'CCCCCCCCCCCc1cc(O)cc(-c2ccc(CO)c(CN(CC)CC)c2)c1',
 'COCCc1ccccc1O',
 'CCCCCCCCCCCCNCc1ccc(-c2ccc(N(C)C)cc2)cc1',
 'CCc1c(O)cccc1C',
 'CCc1c(O)cccc1C',
 'CCc1c(O)cccc1C',
 'CCCCCCN(CCCCC)c1cc2ccccc2cc1CNCCC',
 'CCCCCCCCCCCCNc1c(C)cc(C)cc1-n1c(C)ccc1C',
 'CCc1ccc(NC(c2ccc(CC)cc2)c2ccc(C)cc2)nc1',
 'CCCCNCc1ccc(N(C)C)cc1',
 'CCCCCCCCCc1ccc(O)cn1',
 'CCc1ccc(CNCc2ccc(N(CC)CC)cc2)cc1',
 'CCCCCCCCN(CCC)c1ccc(Nc2ccc(C(C)(C)C)cc2)cn1',
 'CCN(CC)c1ccc(N(Cc2ccc(O)c(O)c2)C(C)C)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCc1cc(C)c(O)cc1',
 'CCCc1ccc(OCCCNC(c2ccc(O)cc2)C(C)C)cc1',
 'CCc1ccc(C)c(O)c1',
 'CCc1ccc(C)c(O)c1',
 'CCCCCN(CCCC)c1ccc(CNC(=S)NCCc2ccc(C)cc2)cc1',
 'CCCCN(CCCO)Cc1ccc(NCc2ccc(C)cc2)cc1',
 'CCc1cc(O)ccc1N(CC)CC',
 'CCCCCCCCCCCNc1cc(-c2ccc(C(C)(C)C)cc2)cc(CO)c1',
 'Cc1cccc(C)c1O',
 'Cc1cccc(C)c1O',
 'CCc1cc(C)ccc1O']

In [33]:
# Drop every candidate whose SMILES string appears verbatim in `used`.
already_used = df_final['Smiles'].isin(used)
df_final = df_final[~already_used]

In [34]:
# De-duplicate on the SMILES string itself. A bare drop_duplicates() compares
# all columns, so the same SMILES could survive more than once whenever any
# other column differs, yet only the Smiles column is exported below.
# keep='first' retains the first occurrence in the frame's current order (the
# frame was sorted by PDSC, descending, in an earlier cell).
df_final = df_final.drop_duplicates(subset='Smiles', keep='first')

In [36]:
# Write only the SMILES column of the remaining candidates, without the row index.
df_final[['Smiles']].to_csv('last_nodubles.csv', index=False)