# Preprocess datasets (v2)

In [1]:
import re
import shutil
import multiprocessing as mp

import cv2
import pandas as pd
import tokenizers
from dataclasses import dataclass
from pathlib import Path
from rdkit import Chem
from tqdm.auto import tqdm

tqdm.pandas()

  from pandas import Panel


In [8]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule import constants

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Config

In [9]:
# Identifier of this notebook; it doubles as the name of the output directory.
NOTEBOOK_ID = "bms-preprocess-v2"
# CSV read in the "Load KFold" section (provides the `fold` / `image_id` columns).
KFOLD_PATH = constants.INPUTDIR.joinpath("kfujikawa", "bms-kfold", "10fold.csv")
# Directory that receives every pickle written by this notebook.
OUTDIR = constants.INPUTDIR.joinpath("kfujikawa", NOTEBOOK_ID)

## Preprocess

In [4]:
# Make sure the output directory exists, then report whether the two main
# artifacts are already present on disk.
OUTDIR.mkdir(parents=True, exist_ok=True)
for artifact_name in ("train.pkl", "test.pkl"):
    artifact_path = OUTDIR / artifact_name
    print(f"{artifact_path}: {artifact_path.exists()}")

/work/input/kfujikawa/bms-preprocess-v2/train.pkl: True
/work/input/kfujikawa/bms-preprocess-v2/test.pkl: True


In [5]:
def get_image_shape(image_path):
    """Read an image as grayscale and summarise its dimensions.

    Args:
        image_path: Location of the image file. It is converted with ``str()``
            before being handed to ``cv2.imread``, so path-like objects work too.

    Returns:
        dict with the following keys:
            "w": the larger of the image's two dimensions (long side),
            "h": the smaller of the two dimensions (short side),
            "w/h": long side divided by short side,
            "w*h": product of the two dimensions,
            "is_flipped": True when the image has more rows than columns.

    Raises:
        OSError: if ``cv2.imread`` returns ``None`` for ``image_path``.
    """
    image = cv2.imread(str(image_path), cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None instead of raising; fail
    # here with the offending path rather than with an AttributeError on
    # `.shape`, which is hard to trace back when run inside a worker pool.
    if image is None:
        raise OSError(f"cv2.imread could not read image: {image_path}")
    n_rows, n_cols = image.shape[:2]
    # NOTE: "w" / "h" are orientation-independent (long / short side), not the
    # literal width / height; "is_flipped" records the original orientation.
    w, h = max(n_rows, n_cols), min(n_rows, n_cols)
    return {
        "w": w,
        "h": h,
        "w/h": w / h,
        "w*h": w * h,
        "is_flipped": n_rows > n_cols,
    }

### Preprocess train dataset

In [6]:
# Load the training labels and derive each image's path on disk; the path
# nests the file under directories named after the first three characters
# of its image_id.
train_df = pd.read_csv(constants.COMPETITION_DATADIR / "train_labels.csv")
train_df["image_path"] = train_df.image_id.progress_apply(
    lambda x: str(constants.COMPETITION_DATADIR / f"train/{x[0]}/{x[1]}/{x[2]}/{x}.png")
)
train_df["InChI_length"] = train_df.InChI.apply(len)
# Longest InChI first. The index is reset so that the shape columns built
# below (which carry a fresh 0..n-1 index) align row-for-row in `assign`.
train_df = train_df.sort_values("InChI_length", ascending=False).reset_index(drop=True)
with mp.Pool() as pool:
    # An explicit chunksize hands images to the workers in batches; the default
    # of 1 costs one inter-process round-trip per image. `imap` still yields
    # the results in input order.
    iterator = pool.imap(get_image_shape, train_df.image_path.values, chunksize=1024)
    train_df = train_df.assign(**pd.DataFrame(tqdm(iterator, total=len(train_df))))
train_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))




Unnamed: 0,image_id,InChI,image_path,InChI_length,w,h,w/h,w*h,is_flipped
0,4435736fd10b,InChI=1S/C65H110O6/c1-4-7-10-13-16-19-22-25-28...,/work/input/bms-molecular-translation/train/4/...,403,1472,788,1.86802,1159936,False
1,8c0e35ce3f1f,InChI=1S/C60H98O6/c1-4-7-10-13-16-19-22-25-27-...,/work/input/bms-molecular-translation/train/8/...,398,939,309,3.038835,290151,False
2,934593ad3cae,InChI=1S/C63H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/9/...,397,826,661,1.249622,545986,False
3,c963808e309d,InChI=1S/C59H92O6/c1-4-7-10-13-16-19-22-25-28-...,/work/input/bms-molecular-translation/train/c/...,393,1268,883,1.436014,1119644,False
4,0aa425d5f5ac,InChI=1S/C62H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/0/...,393,918,543,1.690608,498474,False


### Load predicted InChI_length

In [7]:
# Read the predicted test-set InChI strings and record the length of each one.
# NOTE(review): the path is an absolute, machine-specific location.
prediction_df = pd.read_csv("/work/output/1003_effnet_b5/test_beam=1.csv").assign(
    InChI_length=lambda frame: frame["InChI"].map(len)
)
prediction_df.head()

Unnamed: 0,image_id,InChI,InChI_length
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...,86
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...,105
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...,96
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1...",95
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-4)11-9(10)8(5)...,85


### Preprocess test dataset

In [None]:
# Build the test frame from the sample submission's image ids and derive each
# image's path on disk (nested under directories named after the first three
# characters of its image_id).
test_df = pd.read_csv(constants.COMPETITION_DATADIR / "sample_submission.csv", usecols=["image_id"])
test_df["image_path"] = test_df.image_id.progress_apply(
    lambda x: str(constants.COMPETITION_DATADIR / f"test/{x[0]}/{x[1]}/{x[2]}/{x}.png")
)
# Attach the predicted InChI length to every test image. The row-count checks
# guard against duplicated ids; the NaN check guards against ids that have no
# prediction at all, which a left merge would otherwise let through silently.
assert len(test_df) == len(prediction_df)
test_df = test_df.merge(prediction_df, on="image_id", how="left")
assert len(test_df) == len(prediction_df)
assert test_df.InChI_length.notna().all(), "some test image_ids have no predicted InChI"
# Longest predicted InChI first. The index is reset so that the shape columns
# built below (fresh 0..n-1 index) align row-for-row in `assign`.
test_df = test_df.sort_values("InChI_length", ascending=False).reset_index(drop=True)
test_df = test_df[["image_id", "image_path", "InChI_length"]]
with mp.Pool() as pool:
    # An explicit chunksize hands images to the workers in batches; the default
    # of 1 costs one inter-process round-trip per image. `imap` still yields
    # the results in input order.
    iterator = pool.imap(get_image_shape, test_df.image_path.values, chunksize=1024)
    test_df = test_df.assign(**pd.DataFrame(tqdm(iterator, total=len(test_df))))
test_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))




### Load KFold

In [None]:
# Image ids assigned to folds 0, 1 and 2; they select the "bench" training
# subset when the pickles are written.
kfold_df = pd.read_csv(KFOLD_PATH)
in_benchmark_fold = kfold_df["fold"].isin([0, 1, 2])
benchmark_ids = kfold_df.loc[in_benchmark_fold, "image_id"]
benchmark_ids

In [None]:
# Write every artifact (debug sample, benchmark subset, full frame) for both
# splits, then list what ended up in the output directory.
OUTDIR.mkdir(parents=True, exist_ok=True)
is_benchmark_row = train_df.image_id.isin(benchmark_ids)
# NOTE(review): test.bench.pkl is written from the same first 1000 rows as
# test.debug.pkl -- confirm that this is intended.
frames_by_filename = {
    "train.debug.pkl": train_df.head(1000),
    "train.bench.pkl": train_df[is_benchmark_row].reset_index(drop=True),
    "train.pkl": train_df,
    "test.debug.pkl": test_df.head(1000),
    "test.bench.pkl": test_df.head(1000),
    "test.pkl": test_df,
}
for filename, frame in frames_by_filename.items():
    frame.to_pickle(OUTDIR / filename)
list(OUTDIR.iterdir())