## Data
- Lets have a look at our dataset, preprocess it and save the preprocessed version

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
import re
import pandas as pd

from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

In [3]:
# dataset comes from here: https://github.com/theochem/B3DB/blob/main/README.md

df = pd.read_csv("https://staicentreprod001.blob.core.windows.net/share/mlprague23/B3DB_classification.tsv", sep="\t")
df.head()

Unnamed: 0,NO.,compound_name,IUPAC_name,SMILES,CID,logBB,BBB+/BBB-,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,BBB-,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,BBB-,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,BBB-,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11 (cimetidine analog) (y-g13),2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,BBB-,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,BBB-,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,


In [4]:
df.shape

(7807, 12)

In [5]:
# for readability
df = df.rename(columns={"BBB+/BBB-": "label", "compound_name": "name"})
df.loc[df.label == "BBB+", "label"] = 1
df.loc[df.label == "BBB-", "label"] = 0

df.head(20)

Unnamed: 0,NO.,name,IUPAC_name,SMILES,CID,logBB,label,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,0,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,0,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,0,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11 (cimetidine analog) (y-g13),2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,0,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,
5,6,"uk-240,455","n-(6,7-dichloro-2,3-dioxo-1,4-dihydroquinoxali...",CS(=O)(=O)N(CCO)c1c(Cl)c(Cl)cc2[nH]c(=O)c(=O)[...,9842188.0,-2.15,0,"InChI=1S/C11H11Cl2N3O5S/c1-22(20,21)16(2-3-17)...",,R18|R26|R27|,A,
6,7,morphine-6-glucuronide,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-9-hydro...",CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OC2O[C@H]...,9847115.0,-2.09,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-5-13(31-2...,,R25|,A,
7,8,nitrofurantoin,1-[(e)-(5-nitrofuran-2-yl)methylideneamino]imi...,O=C1CN(/N=C/c2ccc([N+](=O)[O-])o2)C(=O)N1,5353830.0,-2.09,0,InChI=1S/C8H6N4O5/c13-6-4-11(8(14)10-6)9-3-5-1...,,R2|R2|R25|R46|,A,
8,9,"l-701,324",7-chloro-4-hydroxy-3-(3-phenoxyphenyl)-1h-quin...,O=c1[nH]c2cc(Cl)ccc2c(O)c1-c1cccc(Oc2ccccc2)c1,54682505.0,-2.03,0,InChI=1S/C21H14ClNO3/c22-14-9-10-17-18(12-14)2...,,R18|R26|R27|,A,
9,10,33419-42-0,"5-[(7,8-dihydroxy-2-methyl-4,4a,6,7,8,8a-hexah...",COc1cc(C2c3cc4c(cc3C(OC3OC5COC(C)OC5C(O)C3O)C3...,3310.0,-2.0,0,InChI=1S/C29H32O13/c1-11-36-9-20-27(40-11)24(3...,,R2|R2|R8|R12|R21|R25|R35|R40|R47|,A,


### Do the molecule names need to be cleaned?

In [6]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [7]:
mol_ids = tokenizer.encode('bbcpd11 (cimetidine analog) (y-g13)')
print('mol_ids:', mol_ids)

print('mol_tokens', tokenizer.convert_ids_to_tokens(mol_ids))

mol_ids: [2, 11212, 10020, 7240, 1009, 12, 20090, 4988, 27099, 5729, 13, 12, 67, 17, 8533, 1010, 13, 3]
mol_tokens ['[CLS]', 'bb', '##cp', '##d1', '##1', '(', 'cim', '##eti', '##dine', 'analog', ')', '(', 'y', '-', 'g1', '##3', ')', '[SEP]']


- Notice how the subword unit (suffix) starts with "##" to indicate that it is part of the previous string
- Also [CLS] and [SEP] tokens are automatically added

In [8]:
mol_ids = tokenizer.encode('morphine-6-glucuronide')
print('mol_ids:', mol_ids)
print('mol_tokens', tokenizer.convert_ids_to_tokens(mol_ids))

mol_ids: [2, 11553, 17, 26, 17, 29306, 3]
mol_tokens ['[CLS]', 'morphine', '-', '6', '-', 'glucuronide', '[SEP]']


- The model has *morphine* and *glucuronide* in its vocabulary (has matching input id for these words)
- But doesn't have *bbcpd11* or *cimetidine*

In [9]:
mol_ids = tokenizer.encode('33419-42-0')
print('mol_ids:', mol_ids)
print('mol_tokens', tokenizer.convert_ids_to_tokens(mol_ids))

mol_ids: [2, 26403, 4136, 17, 3858, 17, 20, 3]
mol_tokens ['[CLS]', '334', '##19', '-', '42', '-', '0', '[SEP]']


#### Regex to remove non alpha-numeric characters and convert to lowercase
- see it in action: https://regex101.com/

In [13]:
df["name"] = df["name"].apply(lambda x: re.sub("[^A-Za-z0-9]+", "", str(x)).lower())
df.head(20)

Unnamed: 0,NO.,name,IUPAC_name,SMILES,CID,logBB,label,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,0,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,0,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,0,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11cimetidineanalogyg13,2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,0,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,
5,6,uk240455,"n-(6,7-dichloro-2,3-dioxo-1,4-dihydroquinoxali...",CS(=O)(=O)N(CCO)c1c(Cl)c(Cl)cc2[nH]c(=O)c(=O)[...,9842188.0,-2.15,0,"InChI=1S/C11H11Cl2N3O5S/c1-22(20,21)16(2-3-17)...",,R18|R26|R27|,A,
6,7,morphine6glucuronide,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-9-hydro...",CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OC2O[C@H]...,9847115.0,-2.09,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-5-13(31-2...,,R25|,A,
7,8,nitrofurantoin,1-[(e)-(5-nitrofuran-2-yl)methylideneamino]imi...,O=C1CN(/N=C/c2ccc([N+](=O)[O-])o2)C(=O)N1,5353830.0,-2.09,0,InChI=1S/C8H6N4O5/c13-6-4-11(8(14)10-6)9-3-5-1...,,R2|R2|R25|R46|,A,
8,9,l701324,7-chloro-4-hydroxy-3-(3-phenoxyphenyl)-1h-quin...,O=c1[nH]c2cc(Cl)ccc2c(O)c1-c1cccc(Oc2ccccc2)c1,54682505.0,-2.03,0,InChI=1S/C21H14ClNO3/c22-14-9-10-17-18(12-14)2...,,R18|R26|R27|,A,
9,10,33419420,"5-[(7,8-dihydroxy-2-methyl-4,4a,6,7,8,8a-hexah...",COc1cc(C2c3cc4c(cc3C(OC3OC5COC(C)OC5C(O)C3O)C3...,3310.0,-2.0,0,InChI=1S/C29H32O13/c1-11-36-9-20-27(40-11)24(3...,,R2|R2|R8|R12|R21|R25|R35|R40|R47|,A,


In [14]:
# replace molecules whose names are just numbers with nan
df["name"] = df["name"].apply(lambda x: re.sub("^[0-9]+", "nan", str(x)))
df.head(20)

Unnamed: 0,NO.,name,IUPAC_name,SMILES,CID,logBB,label,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,0,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,0,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,0,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11cimetidineanalogyg13,2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,0,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,
5,6,uk240455,"n-(6,7-dichloro-2,3-dioxo-1,4-dihydroquinoxali...",CS(=O)(=O)N(CCO)c1c(Cl)c(Cl)cc2[nH]c(=O)c(=O)[...,9842188.0,-2.15,0,"InChI=1S/C11H11Cl2N3O5S/c1-22(20,21)16(2-3-17)...",,R18|R26|R27|,A,
6,7,morphine6glucuronide,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-9-hydro...",CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OC2O[C@H]...,9847115.0,-2.09,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-5-13(31-2...,,R25|,A,
7,8,nitrofurantoin,1-[(e)-(5-nitrofuran-2-yl)methylideneamino]imi...,O=C1CN(/N=C/c2ccc([N+](=O)[O-])o2)C(=O)N1,5353830.0,-2.09,0,InChI=1S/C8H6N4O5/c13-6-4-11(8(14)10-6)9-3-5-1...,,R2|R2|R25|R46|,A,
8,9,l701324,7-chloro-4-hydroxy-3-(3-phenoxyphenyl)-1h-quin...,O=c1[nH]c2cc(Cl)ccc2c(O)c1-c1cccc(Oc2ccccc2)c1,54682505.0,-2.03,0,InChI=1S/C21H14ClNO3/c22-14-9-10-17-18(12-14)2...,,R18|R26|R27|,A,
9,10,,"5-[(7,8-dihydroxy-2-methyl-4,4a,6,7,8,8a-hexah...",COc1cc(C2c3cc4c(cc3C(OC3OC5COC(C)OC5C(O)C3O)C3...,3310.0,-2.0,0,InChI=1S/C29H32O13/c1-11-36-9-20-27(40-11)24(3...,,R2|R2|R8|R12|R21|R25|R35|R40|R47|,A,


In [15]:
df[df["name"] == "nan"]

Unnamed: 0,NO.,name,IUPAC_name,SMILES,CID,logBB,label,Inchi,threshold,reference,group,comments
9,10,,"5-[(7,8-dihydroxy-2-methyl-4,4a,6,7,8,8a-hexah...",COc1cc(C2c3cc4c(cc3C(OC3OC5COC(C)OC5C(O)C3O)C3...,3310.0,-2.00,0,InChI=1S/C29H32O13/c1-11-36-9-20-27(40-11)24(3...,,R2|R2|R8|R12|R21|R25|R35|R40|R47|,A,
15,16,,"n-phenyl-4,5-dihydro-1h-imidazol-2-amine",c1ccc(NC2=NCCN2)cc1,137235.0,-1.89,0,InChI=1S/C9H11N3/c1-2-4-8(5-3-1)12-9-10-6-7-11...,,R2|R2|R8|R25|R46|R47|R4|R40|,A,
20,21,,,CN1Cc2c(-c3noc(C(C)(O)O)n3)ncn2-c2cccc(Cl)c2C1=O,,-1.82,0,"InChI=1S/C16H14ClN5O4/c1-16(24,25)15-19-13(20-...",,R35|,A,
21,22,,,CN1Cc2c(-c3noc([C@](C)(O)CO)n3)ncn2-c2cccc(Cl)...,,-1.82,0,"InChI=1S/C17H16ClN5O4/c1-17(26,7-24)16-20-14(2...",,R48|,A,
55,56,,"8-(3-oxocyclopentyl)-1,3-dipropyl-7h-purine-2,...",CCCn1c(=O)c2[nH]c(C3CCC(=O)C3)nc2n(CCC)c1=O,131584.0,-1.40,0,InChI=1S/C16H22N4O3/c1-3-7-19-14-12(15(22)20(8...,,R2|R2|R4|R27|R40|R43|R47|,A,
...,...,...,...,...,...,...,...,...,...,...,...,...
7696,7697,,,O=c1[nH]c2ccccc2n1CCCN1CCC(Nn2c(=O)[nH]c3cc(Cl...,,,1,InChI=1S/C22H25ClN6O2/c23-15-6-7-20-18(14-15)2...,,R27|,C,
7700,7701,,,OC[C@@H]1O[C@@H](OC[C@@H]2O[C@@H](O[C@]3(CO)O[...,,,0,InChI=1S/C18H32O16/c19-1-5-8(22)11(25)13(27)16...,,R27|,C,
7701,7702,,,OC[C@@H]1O[C@@H](O[C@@H]2[C@H](CO)O[C@@](O)(CO...,,,0,InChI=1S/C12H22O11/c13-1-4-6(16)7(17)8(18)11(2...,,R27|,C,
7705,7706,,,ON=Cc1cc[n+](COC[n+]2ccc(/C=N/O)cc2)cc1,,,0,InChI=1S/C14H14N4O3/c19-15-9-13-1-5-17(6-2-13)...,,R27|,C,


In [16]:
num_nan=sum(df["name"] == "nan")
print(f"number of molecules with nan name: {num_nan}")

number of molecules with nan name: 1279


In [17]:
print(f"df shape before nan molecule removal: {df.shape}")
df = df[df["name"] != "nan"]
print(f"df shape after nan molecule removal: {df.shape}")

df shape before nan molecule removal: (7807, 12)
df shape after nan molecule removal: (6528, 12)


In [18]:
df.head(20)

Unnamed: 0,NO.,name,IUPAC_name,SMILES,CID,logBB,label,Inchi,threshold,reference,group,comments
0,1,sulphasalazine,2-hydroxy-5-[[4-(pyridin-2-ylsulfamoyl)phenyl]...,O=C(O)c1cc(N=Nc2ccc(S(=O)(=O)Nc3ccccn3)cc2)ccc1O,5339.0,-2.69,0,InChI=1S/C18H14N4O5S/c23-16-9-6-13(11-15(16)18...,,R2|R2|R25|R46|,A,
1,2,moxalactam,7-[[2-carboxy-2-(4-hydroxyphenyl)acetyl]amino]...,COC1(NC(=O)C(C(=O)O)c2ccc(O)cc2)C(=O)N2C(C(=O)...,3889.0,-2.52,0,InChI=1S/C20H20N6O9S/c1-25-19(22-23-24-25)36-8...,,R25|,A,
2,3,clioquinol,5-chloro-7-iodoquinolin-8-ol,Oc1c(I)cc(Cl)c2cccnc12,2788.0,-2.4,0,InChI=1S/C9H5ClINO/c10-6-4-7(11)9(13)8-5(6)2-1...,,R18|R26|R27|,A,
3,4,bbcpd11cimetidineanalogyg13,2-[2-[(3-bromopyridin-2-yl)methylsulfanyl]ethy...,CCNC(=NCCSCc1ncccc1Br)NC#N,14022517.0,-2.15,0,InChI=1S/C12H16BrN5S/c1-2-15-12(18-9-14)17-6-7...,,R2|R2|R8|R40|R2|R2|R2|R2|R18|R21|R25|R25|R26|R...,A,
4,5,schembl614298,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-7-hydro...",CN1CC[C@]23c4c5ccc(OC6O[C@H](C(=O)O)[C@@H](O)[...,18595497.0,-2.15,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-4-12(25)2...,,R25|,A,
5,6,uk240455,"n-(6,7-dichloro-2,3-dioxo-1,4-dihydroquinoxali...",CS(=O)(=O)N(CCO)c1c(Cl)c(Cl)cc2[nH]c(=O)c(=O)[...,9842188.0,-2.15,0,"InChI=1S/C11H11Cl2N3O5S/c1-22(20,21)16(2-3-17)...",,R18|R26|R27|,A,
6,7,morphine6glucuronide,"(2s,3s,4s,5r)-6-[[(4r,4ar,7s,7ar,12bs)-9-hydro...",CN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OC2O[C@H]...,9847115.0,-2.09,0,InChI=1S/C23H27NO9/c1-24-7-6-23-10-3-5-13(31-2...,,R25|,A,
7,8,nitrofurantoin,1-[(e)-(5-nitrofuran-2-yl)methylideneamino]imi...,O=C1CN(/N=C/c2ccc([N+](=O)[O-])o2)C(=O)N1,5353830.0,-2.09,0,InChI=1S/C8H6N4O5/c13-6-4-11(8(14)10-6)9-3-5-1...,,R2|R2|R25|R46|,A,
8,9,l701324,7-chloro-4-hydroxy-3-(3-phenoxyphenyl)-1h-quin...,O=c1[nH]c2cc(Cl)ccc2c(O)c1-c1cccc(Oc2ccccc2)c1,54682505.0,-2.03,0,InChI=1S/C21H14ClNO3/c22-14-9-10-17-18(12-14)2...,,R18|R26|R27|,A,
10,11,icotidine,2-[4-(3-methoxypyridin-2-yl)butylamino]-5-[(6-...,COc1cccnc1CCCCNc1ncc(Cc2ccc(C)nc2)c(=O)[nH]1,72108.0,-2.0,0,InChI=1S/C21H25N5O2/c1-15-8-9-16(13-24-15)12-1...,,R2|R2|R2|R2|R5|R8|R11|R12|R18|R21|R25|R26|R27|...,A,


In [19]:
df = df.drop_duplicates(subset="name")
df.shape

(4832, 12)

In [20]:
X = df.name.values
y = df.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [21]:
df.name.values

array(['sulphasalazine', 'moxalactam', 'clioquinol', ..., 'ketotifen',
       'milrinone', 'amrinone'], dtype=object)

In [22]:
df_train = pd.DataFrame(data={"name": X_train, "label": y_train})

df_test = pd.DataFrame(data={"name": X_test, "label": y_test})

Mount your drive to colab so you can write the processed data there

In [23]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [24]:
# save preprocessed data to your drive (you may want/need to edit the path) 
df_train.to_csv("/content/drive/MyDrive/Colab Notebooks/B3DB_cleaned_name_TRAIN.csv", index=False)
df_test.to_csv("/content/drive/MyDrive/Colab Notebooks/B3DB_cleaned_name_TEST.csv", index=False)

Split and save dataset with SMILES separately

In [25]:
X = df["SMILES"].values
y = df.label.values
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [26]:
df_train = pd.DataFrame(data={"SMILES": X_train, "label": y_train})

df_test = pd.DataFrame(data={"SMILES": X_test, "label": y_test})

df_train.to_csv("/content/drive/MyDrive/Colab Notebooks/B3DB_cleaned_SMILES_TRAIN.csv", index=False)
df_test.to_csv("/content/drive/MyDrive/Colab Notebooks/B3DB_cleaned_SMILES_TEST.csv", index=False)