In [1]:
import os

import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the four raw tab-separated splits (sweet/bitter x train/test) into DataFrames.
sw_train, sw_test, bt_train, bt_test = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in (
        "./data/original/0.sweet-train.tsv",
        "./data/original/0.sweet-test.tsv",
        "./data/original/0.bitter-train.tsv",
        "./data/original/0.bitter-test.tsv",
    )
)

In [3]:
# Preview the raw sweet test split to check its columns before filtering.
sw_test

Unnamed: 0,Name,Taste,Reference,SMILES,Canonical SMILES,In Bitter Domain,Sweet
0,Alitame,Sweet,Rojas et al. (2017),CC(COCNC1C(C)(C)SC1(C)C)NCOCC(CC(=O)O)N,CC(COCNC1C(C)(C)SC1(C)C)NCOCC(CC(=O)O)N,,True
1,Maltitol,Sweet,Rojas et al. (2017),OCC(C(C(C(CO)O)O)OC1OC(CO)C(C(C1O)O)O)O,OCC(C(C(C(CO)O)O)OC1OC(CO)C(C(C1O)O)O)O,,True
2,Isomalt (Palatinit) / Isomaltitol,Sweet,Rojas et al. (2017),OCC(C(C(C(COC1OC(CO)C(C(C1O)O)O)O)O)O)O,OCC(C(C(C(COC1OC(CO)C(C(C1O)O)O)O)O)O)O,,True
3,Erythritol,Sweet,Rojas et al. (2017),OCC(C(CO)O)O,OCC(C(CO)O)O,,True
4,Rebaudioside B,Sweet,Rojas et al. (2017),OCC1OC(OC23CCC4C(C2)(CC3=C)CCC2C4(C)CCCC2(C)C(...,OCC1OC(OC23CCC4C(C2)(CC3=C)CCC2C4(C)CCCC2(C)C(...,,True
...,...,...,...,...,...,...,...
156,Polypodoside A,Sweet,Rojas et al. (2017),O1[C@H](OC2CC[C@]3([C@H](C2)C(=O)C=C2[C@H]4CC[...,OCC1O[C@H](OC2CC[C@]3([C@H](C2)C(=O)C=C2[C@H]3...,,True
157,Potassium glycyrrhizinate,Sweet,Rojas et al. (2017),[KH].O1[C@H](OC2CC[C@@]3([C@H]4[C@@](CC[C@H]3C...,OCC1O[C@H](OC2CC[C@]3([C@H](C2(C)C)CC[C@]2([C@...,,True
158,Potassium saccharin,Sweet,Rojas et al. (2017),[KH].S1(=O)(=O)NC(=O)c2c1cccc2,O=C1NS(=O)(=O)c2c1cccc2.[KH],,True
159,Pterocaryoside B,Sweet,Rojas et al. (2017),O1CC(O)C(O)C(O)[C@@H]1OC1[C@@H]2[C@@](CC[C@@H]...,OC(=O)CC[C@]1(C)C(CC[C@@]2([C@H]1CC(O[C@@H]1OC...,,True


In [5]:
def filter_by_taste(df, taste, max_len=70, min_len=0):
    """Keep the rows of ``df`` that are confirmed examples of ``taste``.

    A row is kept when all of the following hold:

    * its ``"Taste"`` column equals ``taste``;
    * the column named after the taste (e.g. ``"Sweet"``) is truthy (``== 1``);
    * the length of its ``"SMILES"`` string lies in ``[min_len, max_len]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``"Taste"``, ``"SMILES"`` and one
        column named exactly like ``taste``.
    taste : str
        Taste label to select; also the name of the flag column.
    max_len : int, default 70
        Longest SMILES string (in characters) to keep.
    min_len : int, default 0
        Shortest SMILES string (in characters) to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index preserved),
        so callers can add columns without touching ``df``.
    """
    smiles_len = df["SMILES"].str.len()
    # Missing SMILES have a NaN length; both comparisons are False for NaN,
    # so such rows are dropped.
    keep = (
        (df["Taste"] == taste)
        & (df[taste] == 1)
        & (smiles_len >= min_len)
        & (smiles_len <= max_len)
    )
    return df[keep].copy()

In [6]:
# Reduce every split to confirmed examples of its own taste.
df_sw_tr, df_sw_te = (filter_by_taste(split, "Sweet") for split in (sw_train, sw_test))
df_bt_tr, df_bt_te = (filter_by_taste(split, "Bitter") for split in (bt_train, bt_test))

In [7]:
# Inspect the sweet training rows that survived the taste/length filter.
df_sw_tr

Unnamed: 0,Name,Taste,Reference,SMILES,Canonical SMILES,Sweet
0,Sucrose,Sweet,Rojas et al. (2017),OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,OCC1OC(C(C1O)O)(CO)OC1OC(CO)C(C(C1O)O)O,True
1,"Sucralose / 4,1',6'-Trichloro-galactosucrose",Sweet,Rojas et al. (2017),ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,ClCC1OC(C(C1O)O)(CCl)OC1OC(CO)C(C(C1O)O)Cl,True
2,Aspartame/Aspartyl-phenylalanine methylester,Sweet,Rojas et al. (2017),COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,COC(=O)C(NC(=O)C(CC(=O)O)N)Cc1ccccc1,True
3,Tagatose,Sweet,Rojas et al. (2017),OCC1(O)OCC(C(C1O)O)O,OCC1(O)OCC(C(C1O)O)O,True
4,Isomaltulose/Palatinose,Sweet,Rojas et al. (2017),OCC1OC(OCC2OC(C(C2O)O)(O)CO)C(C(C1O)O)O,OCC1OC(OCC2OC(C(C2O)O)(O)CO)C(C(C1O)O)O,True
...,...,...,...,...,...,...
2154,D-mannitol,Sweet,SuperSweet,C([C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O)O,OC[C@H]([C@H]([C@@H]([C@@H](CO)O)O)O)O,True
2158,Hernandulcin,Sweet,SuperSweet,CC1=CC(=O)[C@@H](CC1)[C@](C)(CCC=C(C)C)O,CC(=CCC[C@@]([C@@H]1CCC(=CC1=O)C)(O)C)C,True
2159,Phyllodulcin,Sweet,SuperSweet,COC1=C(C=C(C=C1)[C@H]2CC3=C(C(=CC=C3)O)C(=O)O2)O,COc1ccc(cc1O)[C@@H]1OC(=O)c2c(C1)cccc2O,True
2160,alpha-D-glucose,Sweet,The Good Scents Company Database,C([C@@H]1[C@H]([C@@H]([C@H]([C@H](O1)O)O)O)O)O,OC[C@H]1O[C@H](O)[C@@H]([C@H]([C@@H]1O)O)O,True


In [18]:
def _add_mol_column(df):
    """Return the SMILES/label columns of ``df`` plus a parsed ``mol`` column.

    Only ``"Canonical SMILES"`` and ``"Taste"`` are kept. Each canonical
    SMILES string is parsed once with ``Chem.MolFromSmiles``, and rows whose
    conversion failed (null ``mol``) are filtered out.

    The column selection is copied before the new column is assigned, so the
    assignment never targets a slice of another frame (which is what triggers
    pandas' SettingWithCopyWarning).
    """
    out = df[["Canonical SMILES", "Taste"]].copy()
    out["mol"] = out["Canonical SMILES"].apply(Chem.MolFromSmiles)
    return out[out["mol"].notnull()]


# Convert each filtered split exactly once.
df_sw_tr = _add_mol_column(df_sw_tr)
df_sw_te = _add_mol_column(df_sw_te)
df_bt_tr = _add_mol_column(df_bt_tr)
df_bt_te = _add_mol_column(df_bt_te)




In [9]:
def get_random_n(df, n, seed=0):
    """Draw ``n`` random rows from ``df``.

    ``seed`` is forwarded as the ``random_state`` of ``DataFrame.sample``, so
    calling again with the same arguments selects the same rows.
    """
    sampled_rows = df.sample(n=n, random_state=seed)
    return sampled_rows

In [10]:
# Fixed-size samples per taste: 100 molecules for training, 20 for testing.
df_sw_tr_100, df_bt_tr_100 = (get_random_n(pool, 100) for pool in (df_sw_tr, df_bt_tr))
df_sw_te_20, df_bt_te_20 = (get_random_n(pool, 20) for pool in (df_sw_te, df_bt_te))

In [11]:
# Preview the 100-row sweet training sample (original row index is preserved).
df_sw_tr_100

Unnamed: 0,Canonical SMILES,Taste,mol
2020,COc1ccccc1OC(=O)Cc1ccccc1,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00ddf90>
936,OC[C@@H]([C@@H]([C@H]([C@H](CO)O)O)O)O,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00d5eb0>
520,OC/C=C(/CCC=C(C)C)\C,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00cc660>
578,OCC=C(C)C,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00ce040>
873,COC(=O)[C@H](NC(=O)[C@H](CC(=O)O)N)Cc1ccccc1,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00d4f90>
...,...,...,...
709,OC(=O)C1CCCCC1,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00d1a50>
2037,CC(COC(=O)c1ccccc1N)C,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00de740>
219,OCC(C(=O)O)N,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00c7200>
716,CCCC(CCOC(=O)C)SC,Sweet,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00d1d60>


In [12]:
# Sanity check: count null `mol` entries in every sampled frame, one count per line.
for sampled in (df_sw_tr_100, df_sw_te_20, df_bt_tr_100, df_bt_te_20):
    print(sampled["mol"].isnull().sum())

0
0
0
0


In [13]:
# Replace the textual taste with a numeric label: sweet -> 1, bitter -> 0.
for frame, label in (
    (df_sw_tr_100, 1),
    (df_sw_te_20, 1),
    (df_bt_tr_100, 0),
    (df_bt_te_20, 0),
):
    frame["Taste"] = label

In [14]:
# Check that the Taste column of the sweet training sample now holds the numeric label.
df_sw_tr_100.head()

Unnamed: 0,Canonical SMILES,Taste,mol
2020,COc1ccccc1OC(=O)Cc1ccccc1,1,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00ddf90>
936,OC[C@@H]([C@@H]([C@H]([C@H](CO)O)O)O)O,1,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00d5eb0>
520,OC/C=C(/CCC=C(C)C)\C,1,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00cc660>
578,OCC=C(C)C,1,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00ce040>
873,COC(=O)[C@H](NC(=O)[C@H](CC(=O)O)N)Cc1ccccc1,1,<rdkit.Chem.rdchem.Mol object at 0x7f4dc00d4f90>


In [15]:
# Save the sampled, label-encoded frames as CSV files.
# NOTE(review): the raw inputs are read from "./data/original" while outputs go
# to "../data/sampled_mix" — confirm that the differing base directory is intended.
save_data_dir = "../data/sampled_mix"
os.makedirs(save_data_dir, exist_ok=True)

# The `mol` column holds in-memory RDKit objects and is left out of the CSVs.
# It is dropped by name, not by position, so the export keeps working if the
# column order ever changes.
for frame, filename in (
    (df_sw_tr_100, "1.sweet-train-100.csv"),
    (df_sw_te_20, "1.sweet-test-20.csv"),
    (df_bt_tr_100, "1.bitter-train-100.csv"),
    (df_bt_te_20, "1.bitter-test-20.csv"),
):
    frame.drop(columns="mol").to_csv(os.path.join(save_data_dir, filename))

In [16]:
# Build prompt-style text blocks ("smiles: ... / sweet_or_bitter: ...") from the samples.

# Shuffle with a fixed seed so sweet and bitter examples are interleaved in the
# same order on every run; get_random_n seeds the sampling step the same way.
shuffle_seed = 0
train_data = pd.concat([df_sw_tr_100, df_bt_tr_100]).sample(frac=1, random_state=shuffle_seed)
test_data = pd.concat([df_sw_te_20, df_bt_te_20]).sample(frac=1, random_state=shuffle_seed)

str_format = "smiles: {}\nsweet_or_bitter: {}\n\n"


def _render_prompt(df, with_label=True):
    """Render every row of ``df`` with ``str_format`` and concatenate the results.

    When ``with_label`` is False the label slot is left empty, which yields the
    "unlabeled" variant of the same molecules in the same order.
    """
    labels = df["Taste"] if with_label else [""] * len(df)
    # A single join avoids repeatedly re-copying a growing string.
    return "".join(
        str_format.format(smiles, label)
        for smiles, label in zip(df["Canonical SMILES"], labels)
    )


train_text = _render_prompt(train_data)
test_text = _render_prompt(test_data)
# Same molecules and order as test_text, with the answer left blank.
unlabeld_text = _render_prompt(test_data, with_label=False)

In [17]:
# Write each prompt text to its own file inside save_data_dir.
for filename, text in (
    ("1.train.txt", train_text),
    ("1.test.txt", test_text),
    ("1.unlabeled_test.txt", unlabeld_text),
):
    with open(os.path.join(save_data_dir, filename), "w") as f:
        f.write(text)