# Preprocess datasets (v1)

In [1]:
import re
import shutil
import numpy as np
import pandas as pd
import tokenizers
from dataclasses import dataclass
from pathlib import Path
from pandarallel import pandarallel
from rdkit import Chem
from tqdm.auto import tqdm

# Register tqdm's pandas integration so that `progress_apply` (used by the
# preprocessing cells below) is available on Series/DataFrame objects.
tqdm.pandas()
# Start pandarallel with its progress bar enabled; it logs the worker count on start-up.
# NOTE(review): no `parallel_apply` call is visible in this notebook — confirm this
# initialisation is still needed.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


  from pandas import Panel


In [2]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project-local constants; this notebook reads INPUTDIR and COMPETITION_DATADIR from it.
from nncomp_molecule import constants

## Config

In [3]:
# Identifier for this notebook's artifacts; it names the output directory below.
NOTEBOOK_ID = "bms-preprocess-v1"
# Every pickle written by this notebook goes under <INPUTDIR>/kfujikawa/<NOTEBOOK_ID>.
OUTDIR = constants.INPUTDIR / "kfujikawa" / NOTEBOOK_ID
# Echo the resolved path as the cell's result so that a fresh Restart & Run All
# reproduces the path display recorded for this cell.
OUTDIR

PosixPath('/work/input/kfujikawa/bms-preprocess-v1')

## Preprocess

In [None]:
# Make sure the output directory exists, then report whether each final pickle
# has already been written (one "<path>: <bool>" line per file).
OUTDIR.mkdir(parents=True, exist_ok=True)
for pickle_name in ("train.pkl", "test.pkl"):
    pickle_path = OUTDIR / pickle_name
    print(f"{pickle_path}: {pickle_path.exists()}")

### Preprocess train dataset

In [4]:
# Load the training labels. `image_id` is pinned to `str` so that an ID consisting
# only of digits can never be inferred as a number, which would drop its leading
# zeros and break the character indexing used to build the path below.
train_df = pd.read_csv(
    constants.COMPETITION_DATADIR / "train_labels.csv",
    dtype={"image_id": str},
)
# Images are sharded on disk by the first three characters of the ID:
#   train/<c0>/<c1>/<c2>/<image_id>.png
train_df["image_path"] = train_df.image_id.progress_apply(
    lambda x: str(constants.COMPETITION_DATADIR / f"train/{x[0]}/{x[1]}/{x[2]}/{x}.png")
)
train_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2424186.0), HTML(value='')))




Unnamed: 0,image_id,InChI,image_path
0,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,/work/input/bms-molecular-translation/train/0/...
1,000019cc0cd2,InChI=1S/C21H30O4/c1-12(22)25-14-6-8-20(2)13(1...,/work/input/bms-molecular-translation/train/0/...
2,0000252b6d2b,InChI=1S/C24H23N5O4/c1-14-13-15(7-8-17(14)28-1...,/work/input/bms-molecular-translation/train/0/...
3,000026b49b7e,InChI=1S/C17H24N2O4S/c1-12(20)18-13(14-7-6-10-...,/work/input/bms-molecular-translation/train/0/...
4,000026fc6c36,InChI=1S/C10H19N3O2S/c1-15-10(14)12-8-4-6-13(7...,/work/input/bms-molecular-translation/train/0/...


### Preprocess test dataset

In [5]:
# Load only the image IDs from the sample submission. `image_id` is pinned to `str`
# so that an all-digit ID (e.g. 000037687605) can never be inferred as a number,
# which would drop its leading zeros and break the character indexing below.
submission_df = pd.read_csv(
    constants.COMPETITION_DATADIR / "sample_submission.csv",
    usecols=["image_id"],
    dtype={"image_id": str},
)
# Images are sharded on disk by the first three characters of the ID:
#   test/<c0>/<c1>/<c2>/<image_id>.png
submission_df["image_path"] = submission_df.image_id.progress_apply(
    lambda x: str(constants.COMPETITION_DATADIR / f"test/{x[0]}/{x[1]}/{x[2]}/{x}.png")
)
submission_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))




Unnamed: 0,image_id,image_path
0,00000d2a601c,/work/input/bms-molecular-translation/test/0/0...
1,00001f7fc849,/work/input/bms-molecular-translation/test/0/0...
2,000037687605,/work/input/bms-molecular-translation/test/0/0...
3,00004b6d55b6,/work/input/bms-molecular-translation/test/0/0...
4,00004df0fe53,/work/input/bms-molecular-translation/test/0/0...


## Output

In [6]:
# Persist both frames: for each split, write a 1000-row debug sample first and
# then the full frame, in the order train -> test.
OUTDIR.mkdir(parents=True, exist_ok=True)
for split_name, split_frame in (("train", train_df), ("test", submission_df)):
    split_frame.head(1000).to_pickle(OUTDIR / f"{split_name}.debug.pkl")
    split_frame.to_pickle(OUTDIR / f"{split_name}.pkl")
# Show what now lives in the output directory.
list(OUTDIR.iterdir())

[PosixPath('/work/input/kfujikawa/bms-preprocess-v1/dataset-metadata.json'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-v1/train.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-v1/.ipynb_checkpoints'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-v1/test.debug.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-v1/train.pkl'),
 PosixPath('/work/input/kfujikawa/bms-preprocess-v1/test.pkl')]