# Preprocess with pseudo labels (LB: 0.74)

In [1]:
import re
import shutil
import multiprocessing as mp

import cv2
import pandas as pd
import tokenizers
from dataclasses import dataclass
from pathlib import Path
from rdkit import Chem
from tqdm.auto import tqdm

# Register tqdm's pandas integration so that `Series.progress_apply` (used in
# the cells below to build image paths) is available and shows a progress bar.
tqdm.pandas()

  from pandas import Panel


In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `nncomp_molecule` package) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
from nncomp_molecule import constants
from nncomp_molecule.preprocessors import normalize_inchi_batch, disable_rdlogger

## Config

In [3]:
# Identifier of this notebook; also used as the name of its output directory.
NOTEBOOK_ID = "bms-preprocess-with-pseudo-lb074"

# Location of the 10-fold split CSV (not referenced by the cells visible below).
KFOLD_PATH = constants.INPUTDIR.joinpath("kfujikawa", "bms-kfold", "10fold.csv")

# Directory that receives the pickled train/test frames produced by this notebook.
OUTDIR = constants.INPUTDIR.joinpath("kfujikawa", NOTEBOOK_ID)

## Preprocess

In [4]:
# Make sure the output directory exists, then report whether the final pickles
# have already been written by a previous run.
OUTDIR.mkdir(parents=True, exist_ok=True)
for pickle_name in ("train.pkl", "test.pkl"):
    pickle_path = OUTDIR / pickle_name
    print(f"{pickle_path}: {pickle_path.exists()}")

/work/input/kfujikawa/bms-preprocess-with-pseudo-lb074/train.pkl: False
/work/input/kfujikawa/bms-preprocess-with-pseudo-lb074/test.pkl: False


### Preprocess train dataset

In [5]:
# Load the ground-truth training labels and attach, per row:
#   - image_path:   PNG location, sharded by the first three characters of image_id
#   - InChI_length: number of characters in the InChI string
#   - is_pseudo:    False, marking these rows as real (non-pseudo) labels
# The frame is then ordered from the longest InChI to the shortest.
train_df = pd.read_csv(constants.COMPETITION_DATADIR / "train_labels.csv")
train_df["image_path"] = train_df["image_id"].progress_apply(
    lambda image_id: str(
        constants.COMPETITION_DATADIR
        / f"train/{image_id[0]}/{image_id[1]}/{image_id[2]}/{image_id}.png"
    )
)
train_df = (
    train_df
    .assign(InChI_length=train_df["InChI"].map(len), is_pseudo=False)
    .sort_values("InChI_length", ascending=False)
    .reset_index(drop=True)
)
train_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))




Unnamed: 0,image_id,InChI,image_path,InChI_length,is_pseudo
0,4435736fd10b,InChI=1S/C65H110O6/c1-4-7-10-13-16-19-22-25-28...,/work/input/bms-molecular-translation/train/4/...,403,False
1,8c0e35ce3f1f,InChI=1S/C60H98O6/c1-4-7-10-13-16-19-22-25-27-...,/work/input/bms-molecular-translation/train/8/...,398,False
2,934593ad3cae,InChI=1S/C63H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/9/...,397,False
3,c963808e309d,InChI=1S/C59H92O6/c1-4-7-10-13-16-19-22-25-28-...,/work/input/bms-molecular-translation/train/c/...,393,False
4,0aa425d5f5ac,InChI=1S/C62H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/0/...,393,False


In [6]:
# Load the pseudo labels (predicted InChI strings for test images) and give them
# the same derived columns as the training frame, flagged with is_pseudo=True.
# Image paths point into the competition's `test/` directory.
pseudo_labels_csv = constants.INPUTDIR / "kfujikawa/bms-pseudo-labels-lb074/pseudo_labels.csv"
pseudo_df = pd.read_csv(pseudo_labels_csv)
pseudo_df = pseudo_df.assign(
    InChI_length=pseudo_df["InChI"].map(len),
    is_pseudo=True,
)
pseudo_df["image_path"] = pseudo_df["image_id"].progress_apply(
    lambda image_id: str(
        constants.COMPETITION_DATADIR
        / f"test/{image_id[0]}/{image_id[1]}/{image_id[2]}/{image_id}.png"
    )
)
pseudo_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1544273.0), HTML(value='')))




Unnamed: 0,image_id,InChI,InChI_length,is_pseudo,image_path
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...,86,True,/work/input/bms-molecular-translation/test/0/0...
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...,105,True,/work/input/bms-molecular-translation/test/0/0...
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...,96,True,/work/input/bms-molecular-translation/test/0/0...
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1...",95,True,/work/input/bms-molecular-translation/test/0/0...
4,000085dab281,InChI=1S/C20H38O/c1-20(2)18-16-14-12-10-8-6-4-...,95,True,/work/input/bms-molecular-translation/test/0/0...


In [7]:
# Append the pseudo-labelled test rows after the real training rows, keeping the
# training frame's column order.
#
# Only the rows with is_pseudo == False are taken from the current `train_df`.
# On the first run that is every row; on a re-run of this cell it drops the
# pseudo rows added previously, so they are replaced rather than appended a
# second time (which would silently duplicate them in train.pkl).
train_df = pd.concat(
    [train_df.loc[~train_df["is_pseudo"].astype(bool)], pseudo_df],
    ignore_index=True,
)[train_df.columns]
train_df

Unnamed: 0,image_id,InChI,image_path,InChI_length,is_pseudo
0,4435736fd10b,InChI=1S/C65H110O6/c1-4-7-10-13-16-19-22-25-28...,/work/input/bms-molecular-translation/train/4/...,403,False
1,8c0e35ce3f1f,InChI=1S/C60H98O6/c1-4-7-10-13-16-19-22-25-27-...,/work/input/bms-molecular-translation/train/8/...,398,False
2,934593ad3cae,InChI=1S/C63H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/9/...,397,False
3,c963808e309d,InChI=1S/C59H92O6/c1-4-7-10-13-16-19-22-25-28-...,/work/input/bms-molecular-translation/train/c/...,393,False
4,0aa425d5f5ac,InChI=1S/C62H106O6/c1-4-7-10-13-16-19-22-25-27...,/work/input/bms-molecular-translation/train/0/...,393,False
...,...,...,...,...,...
3968454,ffffcdb2e39e,InChI=1S/C21H28N2O2/c1-15(2)14-25-19-10-8-17(9...,/work/input/bms-molecular-translation/test/f/f...,156,True
3968455,ffffcfddd770,InChI=1S/C24H28N2O5/c1-5-30-19-9-8-17(13-16(19...,/work/input/bms-molecular-translation/test/f/f...,156,True
3968456,ffffe4ab06b2,InChI=1S/C17H17NO3/c19-12-6-7-14-13(10-12)16-1...,/work/input/bms-molecular-translation/test/f/f...,106,True
3968457,ffffec4033ec,"InChI=1S/C12H14F3NO3S/c1-2-3-4-9-16-20(17,18)1...",/work/input/bms-molecular-translation/test/f/f...,105,True


### Load predicted InChI_length

In [8]:
# Load an earlier model's test-set predictions and measure the length of each
# predicted InChI string (presumably greedy decoding, judging by "beam=1" in the
# file name - confirm). The row count is printed for comparison with the test set.
prediction_df = pd.read_csv("/work/output/1003_effnet_b5/test_beam=1.csv")
prediction_df = prediction_df.assign(InChI_length=prediction_df["InChI"].map(len))
print(prediction_df.shape[0])
prediction_df.head()

1616107


Unnamed: 0,image_id,InChI,InChI_length
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...,86
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...,105
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...,96
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1...",95
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-4)11-9(10)8(5)...,85


### Preprocess test dataset

In [9]:
# Build the test frame: one row per image in the sample submission, with its
# image path and the length of the InChI predicted for it in `prediction_df`.
# Rows are ordered from the longest predicted InChI to the shortest.
test_df = pd.read_csv(constants.COMPETITION_DATADIR / "sample_submission.csv", usecols=["image_id"])
test_df["image_path"] = test_df.image_id.progress_apply(
    lambda x: str(constants.COMPETITION_DATADIR / f"test/{x[0]}/{x[1]}/{x[2]}/{x}.png")
)
assert len(test_df) == len(prediction_df)
# validate="one_to_one" makes pandas raise if image_id is duplicated on either
# side, instead of silently multiplying rows.
test_df = test_df.merge(prediction_df, on="image_id", how="left", validate="one_to_one")
assert len(test_df) == len(prediction_df)
# Equal row counts do not prove the ids match: an image_id missing from the
# predictions would survive the left merge with a NaN length (turning the column
# into floats and sorting last), so check for that explicitly.
assert test_df["InChI_length"].notna().all(), "some test image_ids have no predicted InChI_length"
test_df = test_df.sort_values("InChI_length", ascending=False).reset_index(drop=True)
test_df = test_df[["image_id", "image_path", "InChI_length"]]
test_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))




Unnamed: 0,image_id,image_path,InChI_length
0,c23b605c64d9,/work/input/bms-molecular-translation/test/c/2...,447
1,f5ecb39ddbc8,/work/input/bms-molecular-translation/test/f/5...,443
2,30153e0d3737,/work/input/bms-molecular-translation/test/3/0...,442
3,ce3b82556a4f,/work/input/bms-molecular-translation/test/c/e...,428
4,a81b9e381215,/work/input/bms-molecular-translation/test/a/8...,419


### Save files

In [10]:
# Persist the full frames plus small "debug" subsets for quick iteration.
OUTDIR.mkdir(parents=True, exist_ok=True)

# Train debug subset: the first and last 1000 rows. Because pseudo rows were
# appended after the real ones, this samples both ends of the combined frame.
train_debug_df = pd.concat([train_df.head(1000), train_df.tail(1000)], ignore_index=True)
train_debug_df.to_pickle(OUTDIR / "train.debug.pkl")
train_df.to_pickle(OUTDIR / "train.pkl")

# Test debug subset: the first 1000 rows of the length-sorted test frame.
test_debug_df = test_df.head(1000)
test_debug_df.to_pickle(OUTDIR / "test.debug.pkl")
test_df.to_pickle(OUTDIR / "test.pkl")

# Show what ended up in the output directory.
list(OUTDIR.iterdir())

[PosixPath('/work/input/kfujikawa/bms-preprocess-with-pseudo-lb074/train.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-with-pseudo-lb074/test.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-with-pseudo-lb074/train.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-with-pseudo-lb074/test.pkl')]