In [5]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
import pandas as pd
from pathlib import Path
from Levenshtein import distance

path = Path('data')
train_labels = pd.read_csv(path / 'train_labels.csv')
train_labels

Unnamed: 0,image_id,InChI
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...
...,...,...
2424181,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...
2424182,ffffea1ebdfa,InChI=1S/C19H20F2N4O3S/c1-10(8-20)28-14-7-12(2...
2424183,ffffee2b4534,InChI=1S/C22H26Cl2N2O4S/c1-16-6-2-3-10-21(16)3...
2424184,fffff0b6eea6,InChI=1S/C17H26N2O6S/c1-23-16-12-14(4-5-15(16)...


In [7]:
from src.dm import Dataset
from src.utils import get_image_path
import albumentations as A 
import torch


sample = train_labels.sample(100)
images = sample.image_id.map(get_image_path)

ds = Dataset(images.values, train=False, trans=A.Compose([A.Resize(128,128)]))
dl = torch.utils.data.DataLoader(ds, batch_size=100, num_workers=0, pin_memory=True, shuffle=False)

len(ds)

100

In [8]:
import glob 

models = glob.glob('*.ckpt')
models

['transformer-val_ld=5.3365.ckpt',
 'transformer-val_loss=0.6712.ckpt',
 'transformer-val_loss=2.4111.ckpt',
 'transformer-val_loss=0.6376.ckpt',
 'transformer-val_loss=0.6519.ckpt',
 'temp_model.ckpt',
 'transformer-val_ld=5.3029.ckpt']

In [9]:
from src.models import Transformer
from src.vocab import VOCAB

model_path = 'transformer-val_loss=2.4111.ckpt'
model = Transformer.load_from_checkpoint(model_path, len_vocab=len(VOCAB))
model.hparams

"batch_size":         2048
"dropout":            0.1
"embed_dim":          256
"gpus":               1
"img_size":           128
"len_vocab":          39
"load_from":          False
"log":                True
"lr":                 0.001
"max_epochs":         10
"max_len":            21
"nhead":              4
"num_decoder_layers": 6
"num_encoder_layers": 6
"num_workers":        24
"optimizer":          Adam
"patch_size":         16
"pin_memory":         True
"precision":          16
"shuffle_train":      False
"train_batches":      400
"train_trans":        {'Resize': {'width': 128, 'height': 128}}
"val_batches":        50
"val_trans":          {'Resize': {'width': 128, 'height': 128}}
"val_with_train":     False

In [10]:
from tqdm import tqdm

preds = []
model.cuda()
for batch in tqdm(dl):
    outputs = model.predict(batch)
    preds += outputs
    
len(preds)

100%|██████████| 1/1 [00:00<00:00,  1.18it/s]


100

In [11]:
preds[:5]

[tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0'),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        device='cuda:0')]

In [None]:
preds_inchis = ['InChI=1S/' + inchi for inchi in preds]
preds_inchis[:3]

In [None]:
import numpy as np 

metric = []
for pred, inchi in zip(preds_inchis, sample.InChI.values):
    metric.append(distance(pred, inchi))
    
np.mean(metric)

In [None]:
import glob 
from pathlib import Path 

test_images = glob.glob('data/test/*/*/*/*.png')
#test_images = glob.glob('data/bms100/test/*.png')

len(test_images)

In [None]:
from src.dm import Dataset
import albumentations as A 

#limit = 100000
#test_images = test_images[:limit]

ds = Dataset(test_images, train=False, trans=A.Compose([A.Resize(128,128)]))
#ds = Dataset(test_images, train=False, trans=None)
len(ds)

In [None]:
import torch

dl = torch.utils.data.DataLoader(ds, batch_size=100, num_workers=0, pin_memory=True, shuffle=False)

imgs = next(iter(dl))
imgs.shape

In [None]:
from tqdm import tqdm

preds = []
model.cuda()
for batch in tqdm(dl):
    outputs = model.predict(batch.cuda())
    preds += outputs
len(preds)

In [None]:
import pandas as pd

submission = pd.DataFrame({
    'image_id': [image_id.split('/')[-1][:-4] for image_id in test_images],
    #'image_id': [image_id.split('/')[-1][:-4] for image_id in test_images[:32]],
    'InChI': ['InChI=1S/'+pred for pred in preds]
})

submission

In [None]:
submission.sample(10)

In [None]:
submission.to_csv('submission.csv', index=False)

In [None]:
#!kaggle competitions list
#!kaggle competitions submit bms-molecular-translation -f submission.csv -m "My submission message"