In [1]:
import cv2
import Levenshtein
import optuna
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.metrics
import ipywidgets
import matplotlib.pyplot as plt
import multiprocessing as mp
from tqdm.auto import tqdm
from pathlib import Path
# Enable tqdm's pandas integration (adds the `progress_*` variants such as
# `progress_apply` to pandas objects). Called once, right after the imports.
tqdm.pandas()

  from pandas import Panel


In [2]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule import constants

In [3]:
# train_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl").set_index("image_id")
# train_df.head()

In [4]:
# Directory containing the candidate CSVs for the validation and test splits.
CANDIDATES_DIR = constants.INPUTDIR / "kfujikawa/kf-bms-candidates-0523"

# Read both candidate tables in one pass (validation first, then test).
valid_candidates_df, test_candidates_df = (
    pd.read_csv(CANDIDATES_DIR / csv_name)
    for csv_name in ("valid_candidates.csv", "test_candidates.csv")
)

In [6]:
# Columns read from every prediction CSV; all other columns are skipped.
COLUMNS = ["image_id", "normed_InChI", "normed_score"]

# Name of the model output directory under constants.OUTPUTDIR.
MODEL = "1109_vtnt_bert_512-1024-denoise-5"

# Validation-side prediction files for MODEL.
# NOTE(review): the candidate file is tagged "0521" while CANDIDATES_DIR is
# tagged "0523" - confirm this pairing is intended.
VALID_CSVs = [
    constants.OUTPUTDIR / MODEL / csv_name
    for csv_name in (
        "valid_beam=1.csv",
        "valid_beam=4.csv",
        f"candidate_0521_{MODEL}.csv",
    )
]

# Stack all files into one frame, then keep the first row seen for each
# (image_id, normed_InChI) pair; file order in VALID_CSVs decides which wins.
valid_predictions_df = pd.concat(
    [pd.read_csv(csv_path, usecols=COLUMNS) for csv_path in tqdm(VALID_CSVs)],
    ignore_index=True,
).drop_duplicates(subset=["image_id", "normed_InChI"])
valid_predictions_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,image_id,normed_InChI,normed_score
0,3a0f7a797b08,InChI=1S/C19H19N5OS/c1-12-6-9-16(26-2)11-17(12...,1.613341e-06
1,2b4299bf47aa,InChI=1S/C16H24ClNO/c1-12(2)5-3-4-8-18-11-15-1...,0.001064921
2,05494c50dd8e,InChI=1S/C14H18N4/c1-11-3-2-4-13(17-11)8-18-9-...,6.580866e-07
3,5d7b569dce12,InChI=1S/C33H43ClN6O6S/c34-23-17-27-29(28(18-2...,0.0005938433
4,f5e5996ca41e,"InChI=1S/C14H19BrCl2O/c1-11(2)18-8-7-14(9-16,1...",2.535854e-05


In [7]:
# Inner-join candidates with predictions on whatever columns the two frames
# share (no explicit `on`), so candidates without a prediction are dropped.
valid_presets_df = pd.merge(valid_candidates_df, valid_predictions_df, how="inner")

# Persist the joined table next to the model outputs, named after the
# candidates directory.
valid_presets_df.to_csv(
    constants.OUTPUTDIR / MODEL / f"valid_candidates_{CANDIDATES_DIR.name}.csv",
    index=False,
)

In [8]:
# Test-side prediction files for MODEL.
# NOTE(review): unlike VALID_CSVs, no candidate_* file is listed for the
# test split - confirm this is intended.
TEST_CSVs = [
    constants.OUTPUTDIR / MODEL / csv_name
    for csv_name in ("test_beam=1.csv", "test_beam=4.csv")
]

# Stack all files into one frame, then keep the first row seen for each
# (image_id, normed_InChI) pair; file order in TEST_CSVs decides which wins.
test_predictions_df = pd.concat(
    [pd.read_csv(csv_path, usecols=COLUMNS) for csv_path in tqdm(TEST_CSVs)],
    ignore_index=True,
).drop_duplicates(subset=["image_id", "normed_InChI"])
test_predictions_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,image_id,normed_InChI,normed_score
0,c23b605c64d9,InChI=1S/C43H84O15/c1-3-43(44)58-42-16-41-57-4...,0.032149
1,f5ecb39ddbc8,InChI=1S/C52H101N3O24/c53-51-52(54-51)50-67-49...,0.120912
2,30153e0d3737,InChI=1S/C62H114O6/c1-4-7-10-13-16-19-22-25-28...,0.025315
3,ce3b82556a4f,"InChI=1S/C41H80O19/c1-4-41(2,3)40(43)59-39-38-...",0.027121
4,a81b9e381215,InChI=1S/C59H117NO5/c1-3-5-7-9-11-13-15-17-19-...,0.035394


In [9]:
# Inner-join candidates with predictions on whatever columns the two frames
# share (no explicit `on`), so candidates without a prediction are dropped.
test_presets_df = pd.merge(test_candidates_df, test_predictions_df, how="inner")

# Persist the joined table next to the model outputs, named after the
# candidates directory.
test_presets_df.to_csv(
    constants.OUTPUTDIR / MODEL / f"test_candidates_{CANDIDATES_DIR.name}.csv",
    index=False,
)

In [10]:
# Show the merged test table as the cell output (pandas abbreviates the
# middle rows of a frame this long).
test_presets_df

Unnamed: 0,image_id,normed_InChI,is_valid,normed_score
0,9ba7f91f0a40,InChI=1S/C67H127NO4/c1-3-5-7-9-11-13-15-17-19-...,False,0.024314
1,224eae465d23,InChI=1S/C8H16O7/c1-14-3(2-9)7-5(11)4(10)6(12)...,True,0.033350
2,da00f0a52f53,InChI=1S/C46H79O10P/c1-3-5-7-9-11-13-15-17-19-...,False,0.009484
3,e7fe5e288b68,InChI=1S/C68H129NO5/c1-3-5-7-9-11-13-14-15-16-...,False,0.016226
4,1d9df1430439,InChI=1S/C72H139NO5/c1-3-5-7-9-11-13-15-17-18-...,True,0.022186
...,...,...,...,...
3894111,712136a37a11,"InChI=1S/C2H6OS/c1-5(2,3)4/h1-2H3/i1D3,2D2",False,0.117267
3894112,7b9ed6e15824,"InChI=1S/C3HCl4FO/c4-2(5,6)1-3(7,8)9/h1H",True,1.413789
3894113,d78db5f90e4b,"InChI=1S/C2H5IO/c3-2-1-4/h4H,1-2H2/i1D2,2D2",False,0.123084
3894114,5f53ee1eb20f,"InChI=1S/C2H4O2/c1-2(3)4/h3H,1H3/i1D3,2+1",True,0.067391
