In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to the project's src/ modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load data and model

In [5]:
import pandas as pd
from pathlib import Path
from Levenshtein import distance

# Folder that holds the competition CSV files.
path = Path('data')
# Read the tokenized training labels; the bare last expression renders the frame.
train_labels = pd.read_csv(path.joinpath('train_labels_tokenized.csv'))
train_labels

Unnamed: 0,image_id,InChI,InChI_1,InChI_text
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,C13H20OS,C 13 H 20 O S /c 1 - 9 ( 2 ) 8 - 15 - 13 - 6 -...
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,C21H30O4,C 21 H 30 O 4 /c 1 - 12 ( 22 ) 25 - 14 - 6 - 8...
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,C24H23N5O4,C 24 H 23 N 5 O 4 /c 1 - 14 - 13 - 15 ( 7 - 8 ...
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,C17H24N2O4S,C 17 H 24 N 2 O 4 S /c 1 - 12 ( 20 ) 18 - 13 (...
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,C10H19N3O2S,C 10 H 19 N 3 O 2 S /c 1 - 15 - 10 ( 14 ) 12 -...
...,...,...,...,...
2424181,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,C10H12F2N2O3,C 10 H 12 F 2 N 2 O 3 /c 1 - 17 - 7 - 2 - 5 ( ...
2424182,ffffea1ebdfa,InChI=1S/C19H20F2N4O3S/c1-10(8-20)28-14-7-12(2...,C19H20F2N4O3S,C 19 H 20 F 2 N 4 O 3 S /c 1 - 10 ( 8 - 20 ) 2...
2424183,ffffee2b4534,InChI=1S/C22H26Cl2N2O4S/c1-16-6-2-3-10-21(16)3...,C22H26Cl2N2O4S,C 22 H 26 Cl 2 N 2 O 4 S /c 1 - 16 - 6 - 2 - 3...
2424184,fffff0b6eea6,InChI=1S/C17H26N2O6S/c1-23-16-12-14(4-5-15(16)...,C17H26N2O6S,C 17 H 26 N 2 O 6 S /c 1 - 23 - 16 - 12 - 14 (...


In [6]:
from src.utils import parse_config_file
from src.dm import DataModule

# Experiment configuration file, parsed by the project helper into `config`.
config_file = 'base_16.yml'
config = parse_config_file(config_file)
# Optional manual override of the configured batch size (disabled).
#config['batch_size'] = 100

# Every config entry is forwarded to DataModule as a keyword argument.
# `dm` is reused by later cells for the validation dataloader and for decoding.
dm = DataModule(**config)
dm.setup()

Training samples:  2181767
Validation samples:  242419


In [7]:
import glob 

# Checkpoints in the working directory. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the listing reproducible.
models = sorted(glob.glob('*.ckpt'))
models

['transformer-val_loss=0.2392.ckpt', 'transformer-val_loss=0.3548.ckpt']

In [8]:
from src.models import Transformer
from src.vocab import VOCAB

# Checkpoint to restore (hard-coded file name in the working directory).
model_path = 'transformer-val_loss=0.2392.ckpt'
# The vocabulary size is passed explicitly alongside the checkpoint path.
model = Transformer.load_from_checkpoint(model_path, len_vocab=len(VOCAB))
# Display the hyperparameters attached to the restored model.
model.hparams

"batch_size":         512
"dropout":            0.1
"embed_dim":          256
"gpus":               1
"gradient_clip_val":  1.0
"img_size":           128
"len_vocab":          193
"load_from":          False
"log":                True
"lr":                 0.001
"max_epochs":         10
"max_len":            277
"nhead":              4
"num_decoder_layers": 4
"num_encoder_layers": 4
"num_workers":        24
"optimizer":          Adam
"patch_size":         16
"pin_memory":         True
"precision":          16
"scheduler":          {'CosineAnnealingLR': {'T_max': 10, 'eta_min': 0.0001, 'verbose': True}}
"shuffle_train":      True
"subset":             0.33
"train_batches":      1.0
"train_trans":        {'Resize': {'width': 128, 'height': 128}}
"val_batches":        1.0
"val_trans":          {'Resize': {'width': 128, 'height': 128}}
"val_with_train":     False

# Evaluate model

In [36]:
from itertools import islice

import torch
from tqdm import tqdm

# Number of validation batches to score. 1 reproduces the quick single-batch
# check; set to None to evaluate the whole validation set.
max_eval_batches = 1

val_dl = dm.val_dataloader()
n_batches = len(val_dl) if max_eval_batches is None else min(max_eval_batches, len(val_dl))

preds, labels = [], []
# Inference mode: disable dropout and skip gradient tracking. Both are
# harmless no-ops if model.predict already takes care of them internally.
model.eval()
model.cuda()
with torch.no_grad():
    for imgs, labs in tqdm(islice(val_dl, n_batches), total=n_batches):
        outputs = model.predict(imgs)
        preds += outputs
        # Accumulate targets batch by batch so they stay aligned with preds.
        labels += labs.tolist()

len(preds)

  0%|          | 0/157 [00:16<?, ?it/s]


512

In [37]:
# Decode each predicted sequence with the data module's decoder.
preds_decoded = list(map(dm.decode, preds))
# Prepend the InChI header to every decoded prediction.
preds_inchis = ['InChI=1S/' + decoded for decoded in preds_decoded]

In [38]:
# Decode the accumulated targets in `labels`, not `labs`: `labs` is the loop
# variable left over from the evaluation cell and only holds the LAST batch,
# which would misalign targets with `preds` once more than one batch is scored.
# NOTE(review): assumes dm.decode accepts the plain lists of token ids stored
# in `labels` (they come from `.tolist()`) — confirm against DataModule.decode.
labs_decoded = [dm.decode(lab) for lab in labels]
# Prepend the InChI header so targets are comparable with preds_inchis.
inchis = ['InChI=1S/' + lab for lab in labs_decoded]

In [39]:
import numpy as np 

# zip() silently drops unmatched items, so make sure every prediction has a
# target before averaging; a mismatch would otherwise give a misleading score.
assert len(preds_inchis) == len(inchis), (
    f"{len(preds_inchis)} predictions vs {len(inchis)} targets"
)

# Levenshtein (edit) distance between each predicted and reference InChI.
metric = [distance(pred, inchi) for pred, inchi in zip(preds_inchis, inchis)]

# Mean edit distance over the evaluated samples (lower is better).
np.mean(metric)

87.62109375

# Generate predictions

In [9]:
# Load the submission template from the data folder and display it.
sample_submission = pd.read_csv(path.joinpath('sample_submission.csv'))
sample_submission

Unnamed: 0,image_id,InChI
0,00000d2a601c,InChI=1S/H2O/h1H2
1,00001f7fc849,InChI=1S/H2O/h1H2
2,000037687605,InChI=1S/H2O/h1H2
3,00004b6d55b6,InChI=1S/H2O/h1H2
4,00004df0fe53,InChI=1S/H2O/h1H2
...,...,...
1616102,ffffcdb2e39e,InChI=1S/H2O/h1H2
1616103,ffffcfddd770,InChI=1S/H2O/h1H2
1616104,ffffe4ab06b2,InChI=1S/H2O/h1H2
1616105,ffffec4033ec,InChI=1S/H2O/h1H2


In [10]:
# Constant fallback string written to every row of the submission template.
most_freq_inchi = 'InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18(19)20/h2-8H,1H3'
# Broadcast the scalar over the whole InChI column.
sample_submission['InChI'] = most_freq_inchi
sample_submission

Unnamed: 0,image_id,InChI
0,00000d2a601c,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
1,00001f7fc849,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
2,000037687605,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
3,00004b6d55b6,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
4,00004df0fe53,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
...,...,...
1616102,ffffcdb2e39e,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
1616103,ffffcfddd770,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
1616104,ffffe4ab06b2,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...
1616105,ffffec4033ec,InChI=1S/C15H22N2O2/c1-2-3-4-5-6-7-8-9-10-11-1...


In [25]:
from src.dm import Dataset
from src.utils import get_image_path
import albumentations as A 
import torch

# Number of test images to predict. None selects the whole test set; set an
# integer (e.g. 10000) for a quick partial run. Slicing with [:None] keeps
# every row, so the same expression covers both cases and `limit` stays
# consistent with the cell that writes predictions back with [:limit].
limit = None
test_images = sample_submission.image_id[:limit]
# Turn image ids into file paths of the test split.
test_images = test_images.apply(lambda i: get_image_path(i, mode="test"))

# 128x128 resize, matching the train_trans / val_trans shown in model.hparams.
ds = Dataset(test_images, train=False, trans=A.Compose([A.Resize(128, 128)]))
# shuffle=False keeps predictions in the same order as the submission rows.
dl = torch.utils.data.DataLoader(ds, batch_size=100, num_workers=24, pin_memory=True, shuffle=False)

# The dataset must cover exactly the selected rows.
assert len(ds) == len(test_images)

In [26]:
from tqdm import tqdm

# Predicted sequences for the test images, in dataloader order.
preds = []
# Inference mode: disable dropout and skip gradient tracking. Both are
# harmless no-ops if model.predict already takes care of them internally.
model.eval()
model.cuda()
with torch.no_grad():
    for batch in tqdm(dl):
        outputs = model.predict(batch)
        preds += outputs

  0%|          | 29/16162 [01:08<10:34:07,  2.36s/it]


KeyboardInterrupt: 

In [73]:
# Decode the predicted sequences with the data module's decoder.
preds_decoded = [dm.decode(pred) for pred in preds]

# Write the predictions into the first len(preds) rows. Rows without a
# prediction (e.g. after an interrupted run) keep their current value.
# A single positional .iloc assignment replaces the chained
# `sample_submission.InChI[:limit] = ...`, which triggers SettingWithCopyWarning
# and does not update the frame under pandas copy-on-write.
n_preds = len(preds_decoded)
assert n_preds <= len(sample_submission), "more predictions than submission rows"
inchi_col = sample_submission.columns.get_loc('InChI')
sample_submission.iloc[:n_preds, inchi_col] = ['InChI=1S/' + pred for pred in preds_decoded]
sample_submission

['C36H29N2O4S/c1-36(2,40)34-31-21-40-25-14-15-24(16-25)27(20-29-23-28(26-27)18-37)33(39)36-36-21-31-18-33(27)37(25)22-7-10-26(11-8-22)40-19-9-6-4-3-5-9-13-33/h3-20,24-25H,21H2,1-2H3',
 'C33H46N4/c1-20(2)23-13-25-26(10-7-9-15-25)29-22(33(3,4)5)30-37(28(20)19-23)27(6)11-17-32(34-30)21(3)30-18-31(27)8-14-33(29,32)24/h7-16,21-20-22,24-27,35H,17-21H2,1-6H3',
 'C19H33N5O/c1-15-11-20-19(22-15)13-21(15)23-9-5-7-18(23)24-8-3-4-17-10-16(2)6-12-24/h2,18H,3-14,20H2,1H3',
 'C13H26O2/c1-6-7-10-11(4,5)14-12(6)9-13(2,3)8-12/h9,14H,5-8,15H2,1-4H3',
 'C12H20N2/c1-12(2)7-8-12(3,9-13)15-11-4-5-10(14-6-11)9-11/h10H,4-9H2,1-3H3']

In [79]:
# Write the submission file to the working directory; index=False leaves the
# row index out of the CSV.
sample_submission.to_csv('submission.csv', index=False)

In [80]:
#!kaggle competitions list
#!kaggle competitions submit bms-molecular-translation -f submission.csv -m "baseline test"

100%|██████████████████████████████████████| 48.8M/48.8M [00:05<00:00, 9.24MB/s]
Successfully submitted to Bristol-Myers Squibb – Molecular Translation