In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import pandas as pd
from pathlib import Path
from Levenshtein import distance

# Root directory that holds the competition files.
path = Path('data')
# Labels table: one row per training image, with `image_id` and `InChI` columns.
train_labels = pd.read_csv(path.joinpath('train_labels.csv'))
train_labels

Unnamed: 0,image_id,InChI
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...
...,...,...
2424181,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...
2424182,ffffea1ebdfa,InChI=1S/C19H20F2N4O3S/c1-10(8-20)28-14-7-12(2...
2424183,ffffee2b4534,InChI=1S/C22H26Cl2N2O4S/c1-16-6-2-3-10-21(16)3...
2424184,fffff0b6eea6,InChI=1S/C17H26N2O6S/c1-23-16-12-14(4-5-15(16)...


In [55]:
import torch
import albumentations as A

from src.dm import Dataset
from src.utils import get_image_path

# Score the checkpoint on a fixed random subset of the labelled data.
# random_state pins the subset so the reported metric is reproducible
# across kernel restarts.
sample = train_labels.sample(5000, random_state=42)
images = sample.image_id.map(get_image_path)

# 128x128 resize matches the `train_trans` / `val_trans` hparams stored in the
# checkpoint that is evaluated in this notebook.
ds = Dataset(images.values, train=False, trans=A.Compose([A.Resize(128, 128)]))
# shuffle=False keeps batch order identical to the row order of `sample`,
# so predictions can later be zipped against `sample.InChI`.
dl = torch.utils.data.DataLoader(ds, batch_size=100, num_workers=0, pin_memory=True, shuffle=False)

len(ds)

5000

In [56]:
# Imported here because this cell runs before the later cell that imports glob;
# without it a fresh kernel fails with NameError.
import glob

# Checkpoint files in the working directory. glob returns entries in arbitrary
# order, so sort them to get a stable listing.
models = sorted(glob.glob('*.ckpt'))
models

['lr_find_temp_model.ckpt', 'resnet34-val_ld=4.6759.ckpt']

In [57]:
from src.models import Baseline

# Checkpoint to evaluate; it is one of the files found by the `*.ckpt` listing.
model_path = 'resnet34-val_ld=4.6759.ckpt'
# Rebuild the Baseline model from the checkpoint file.
# NOTE(review): presumably a PyTorch Lightning module restored together with
# its trained weights — confirm against src.models.
model = Baseline.load_from_checkpoint(model_path)
# Display the hyper-parameters attached to the loaded model.
model.hparams

"backbone":       resnet34
"batch_size":     2048
"gpus":           1
"log":            True
"lr":             0.001
"max_epochs":     10
"max_len":        21
"model":          Baseline
"num_workers":    0
"optimizer":      Adam
"pin_memory":     True
"precision":      16
"pretrained":     True
"shuffle_train":  False
"train_batches":  50
"train_trans":    {'Resize': {'width': 128, 'height': 128}}
"val_batches":    5
"val_trans":      {'Resize': {'width': 128, 'height': 128}}
"val_with_train": False

In [64]:
import torch
from tqdm import tqdm

# Collect the model's predictions for every batch of the validation sample.
preds = []
model.cuda()
# Inference only: switch layers such as batch-norm/dropout to eval behaviour and
# disable autograd so no graph is built (less memory, faster). Both are harmless
# if `predict` already does this internally.
model.eval()
with torch.no_grad():
    for batch in tqdm(dl):
        outputs = model.predict(batch.cuda())
        preds.extend(outputs)

len(preds)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:08<00:00,  6.13it/s]


5000

In [65]:
# Prepend the 'InChI=1S/' header (present on every ground-truth string) to each
# prediction so both sides are compared in the same format.
preds_inchis = []
for raw_pred in preds:
    preds_inchis.append('InChI=1S/' + raw_pred)
# Peek at the first three results.
preds_inchis[:3]

['InChI=1S/C14H15ClNOOS', 'InChI=1S/C12H14O4', 'InChI=1S/C23H32N2O6S']

In [67]:
import numpy as np 

# Levenshtein distance between each prediction and its ground-truth InChI;
# the two sequences are paired positionally.
metric = [
    distance(predicted, expected)
    for predicted, expected in zip(preds_inchis, sample.InChI.values)
]

# Mean edit distance over the evaluated sample.
np.mean(metric)

109.0152

In [68]:
import glob 
from pathlib import Path 

# Test PNGs sit three directory levels below data/test.
# (Alternative location kept for reference: 'data/bms100/test/*.png')
test_pattern = 'data/test/*/*/*/*.png'
test_images = glob.glob(test_pattern)

# Number of test images found.
len(test_images)

1616107

In [69]:
from src.dm import Dataset
import albumentations as A 

# For a quick run, slice the list before building the dataset,
# e.g. test_images = test_images[:100000]

# Same 128x128 resize that is applied to the validation sample.
resize_128 = A.Compose([A.Resize(128, 128)])
# Alternative tried earlier: trans=None instead of the resize pipeline.
ds = Dataset(test_images, train=False, trans=resize_128)
len(ds)

1616107

In [70]:
import torch

# Loader settings: fixed order (shuffle=False) so predictions line up with
# `test_images`, loading in the main process (num_workers=0).
loader_kwargs = dict(batch_size=100, num_workers=0, pin_memory=True, shuffle=False)
dl = torch.utils.data.DataLoader(ds, **loader_kwargs)

# Pull one batch as a sanity check on the tensor shape.
batch_iter = iter(dl)
imgs = next(batch_iter)
imgs.shape

torch.Size([100, 1, 128, 128])

In [71]:
from tqdm import tqdm

# Collect the model's predictions for every batch of the test set.
preds = []
model.cuda()
# Inference only: switch layers such as batch-norm/dropout to eval behaviour and
# disable autograd so no graph is built over the whole test-set loop. Both are
# harmless if `predict` already does this internally.
model.eval()
with torch.no_grad():
    for batch in tqdm(dl):
        outputs = model.predict(batch.cuda())
        preds.extend(outputs)
len(preds)

 16%|███████████▌                                                             | 2563/16162 [1:12:45<6:26:05,  1.70s/it]


KeyboardInterrupt: 

In [None]:
import pandas as pd

# An interrupted inference run leaves `preds` shorter than `test_images`;
# fail early with an explicit message instead of pandas' generic length error.
if len(preds) != len(test_images):
    raise ValueError(
        f'Expected one prediction per test image, got {len(preds)} predictions '
        f'for {len(test_images)} images'
    )

submission = pd.DataFrame({
    # Path(...).stem takes the file name without its extension regardless of
    # the OS path separator (glob returns backslash-separated paths on Windows,
    # which split('/') does not handle).
    'image_id': [Path(image_path).stem for image_path in test_images],
    'InChI': ['InChI=1S/' + pred for pred in preds],
})

submission

In [22]:
# Spot-check ten random rows of the submission table.
submission.sample(n=10)

Unnamed: 0,image_id,InChI
1500992,952441fa7516,InChI=1S/C13H10N2/
718808,f03967b3beb4,InChI=1S/C33H38N3O4O/
587118,d2923f9fb819,InChI=1S/C15H28N4O/
1283897,26dd6342d670,InChI=1S/C23H28N3O4S/
1001944,e9b64b1aeadc,InChI=1S/C23H28N3O4/
1416517,9adaba0782e2,InChI=1S/C23H38NlO4O/
532554,dc8ea639b200,InChI=1S/C21H28N4O2/
1389095,5642db8095e7,InChI=1S/C13H28N4O2/
351899,8f3da2a9fa8b,InChI=1S/C21H28N4O2/
1338916,584864cebda7,InChI=1S/C13H18N3O2S/


In [11]:
# Write the submission file; index=False keeps only the image_id and InChI columns.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [12]:
#!kaggle competitions list
#!kaggle competitions submit bms-molecular-translation -f submission.csv -m "My submission message"

100%|██████████████████████████████████████| 52.7M/52.7M [00:06<00:00, 8.62MB/s]
Successfully submitted to Bristol-Myers Squibb – Molecular Translation