In [1]:
import cv2
import Levenshtein
import optuna
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.metrics
import ipywidgets
import matplotlib.pyplot as plt
import multiprocessing as mp
from tqdm.auto import tqdm
from pathlib import Path
# Hook tqdm into pandas so the `progress_*` variants (e.g. `progress_apply`)
# are available on DataFrame/Series objects.
# NOTE(review): nothing in the visible cells calls a `progress_*` method —
# confirm this is still needed.
tqdm.pandas()

  from pandas import Panel


In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# project code (e.g. nncomp_molecule) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Project constants; the cells below read INPUTDIR and OUTPUTDIR from it.
from nncomp_molecule import constants

In [3]:
# NOTE(review): disabled cell — nothing in the visible cells references `train_df`.
# It also hard-codes an absolute path instead of building it from
# `constants.INPUTDIR` the way the candidate loading does; either re-enable it
# with that path style or delete the cell.
# train_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl").set_index("image_id")
# train_df.head()

In [4]:
# Directory holding the candidate tables; its name is reused further down to
# tag the output CSVs.
CANDIDATES_DIR = constants.INPUTDIR / "kfujikawa/kf-bms-candidates-0523"

# Read the validation and test candidate tables (same directory, same reader).
valid_candidates_df, test_candidates_df = (
    pd.read_csv(CANDIDATES_DIR / csv_name)
    for csv_name in ("valid_candidates.csv", "test_candidates.csv")
)

In [5]:
# Columns read from every prediction CSV.
COLUMNS = ["image_id", "normed_InChI", "normed_score"]
MODEL = "1113_swin_large_bert_384"

# Validation prediction files written for MODEL. Order matters: when the same
# (image_id, normed_InChI) pair appears in several files, de-duplication below
# keeps the row from the earliest file in this list.
VALID_CSVs = [
    constants.OUTPUTDIR / MODEL / csv_name
    for csv_name in (
        "valid_beam=1.csv",
        "valid_beam=4.csv",
        "candidate_0521_1113_swin_large_bert_384.csv",
    )
]

# Stack all files into one frame, then keep a single row per
# (image_id, normed_InChI) pair.
valid_predictions_df = pd.concat(
    [pd.read_csv(csv_path, usecols=COLUMNS) for csv_path in tqdm(VALID_CSVs)],
    ignore_index=True,
).drop_duplicates(subset=["image_id", "normed_InChI"])
valid_predictions_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,image_id,normed_InChI,normed_score
0,a3c096ab64e7,InChI=1S/C56H90O6/c1-4-7-10-13-16-19-22-25-26-...,0.03326
1,01d9b7ce6ba1,InChI=1S/C61H104O6/c1-4-7-10-13-16-19-22-24-26...,0.01326
2,75c32fbd3779,InChI=1S/C72H135NO5/c1-3-5-7-9-11-13-15-17-19-...,0.035122
3,b440ab35d6a0,InChI=1S/C57H92O6/c1-4-7-10-13-16-19-22-25-27-...,0.032751
4,807b043f3c86,InChI=1S/C55H88O6/c1-4-7-10-13-16-19-22-25-27-...,0.027721


In [6]:
# Join the validation candidates with MODEL's predictions. No `on=` is given,
# so pandas uses every column the two frames share as the key (default inner
# join) — presumably image_id and normed_InChI; TODO confirm against the CSV.
valid_presets_df = pd.merge(valid_candidates_df, valid_predictions_df)

# Persist next to MODEL's other outputs, tagged with the candidate-set name.
valid_presets_csv = constants.OUTPUTDIR / MODEL / f"valid_candidates_{CANDIDATES_DIR.name}.csv"
valid_presets_df.to_csv(valid_presets_csv, index=False)

In [7]:
# Test prediction files written for MODEL. Order matters: de-duplication below
# keeps the row from the earliest file for a repeated (image_id, normed_InChI).
# NOTE(review): unlike VALID_CSVs, no `candidate_*` file is listed here —
# confirm that is intentional.
TEST_CSVs = [
    constants.OUTPUTDIR / MODEL / csv_name
    for csv_name in ("test_beam=1.csv", "test_beam=4.csv")
]

# Stack all files into one frame, then keep a single row per
# (image_id, normed_InChI) pair.
test_predictions_df = pd.concat(
    [pd.read_csv(csv_path, usecols=COLUMNS) for csv_path in tqdm(TEST_CSVs)],
    ignore_index=True,
).drop_duplicates(subset=["image_id", "normed_InChI"])
test_predictions_df.head()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,image_id,normed_InChI,normed_score
0,c23b605c64d9,InChI=1S/C42H84O15/c1-3-42(43)57-41-15-5-23-49...,0.128493
1,f5ecb39ddbc8,InChI=1S/C62H120O22/c1-63-6-7-65-8-9-66-10-11-...,0.248977
2,30153e0d3737,InChI=1S/C63H116O6/c1-4-7-10-13-16-19-22-25-28...,0.031166
3,ce3b82556a4f,"InChI=1S/C42H84O21/c1-4-42(2,3)41(44)63-40-39-...",0.048358
4,a81b9e381215,InChI=1S/C60H125NO5/c1-3-5-7-9-11-13-15-17-19-...,0.022294


In [8]:
# Join the test candidates with MODEL's predictions. No `on=` is given, so
# pandas uses every column the two frames share as the key (default inner
# join) — presumably image_id and normed_InChI; TODO confirm against the CSV.
test_presets_df = pd.merge(test_candidates_df, test_predictions_df)

# Persist next to MODEL's other outputs, tagged with the candidate-set name.
test_presets_csv = constants.OUTPUTDIR / MODEL / f"test_candidates_{CANDIDATES_DIR.name}.csv"
test_presets_df.to_csv(test_presets_csv, index=False)

In [9]:
# Show the merged test table; the rendered repr is truncated to the first and
# last rows, which is enough to eyeball the columns and the total row count.
test_presets_df

Unnamed: 0,image_id,normed_InChI,is_valid,normed_score
0,c23b605c64d9,InChI=1S/C42H84O15/c1-3-42(43)57-41-15-5-23-49...,False,0.128493
1,f5ecb39ddbc8,InChI=1S/C62H120O22/c1-63-6-7-65-8-9-66-10-11-...,False,0.248977
2,30153e0d3737,InChI=1S/C63H116O6/c1-4-7-10-13-16-19-22-25-28...,False,0.031166
3,ce3b82556a4f,"InChI=1S/C42H84O21/c1-4-42(2,3)41(44)63-40-39-...",False,0.048358
4,a81b9e381215,InChI=1S/C60H125NO5/c1-3-5-7-9-11-13-15-17-19-...,False,0.022294
...,...,...,...,...
6328503,809c403155f7,"InChI=1S/C3HF3O2/c4-2(5)1(8)9-3(2,6)7",False,0.198270
6328504,809c403155f7,"InChI=1S/C3F4O2/c4-2(5)1(9)10-3(2,6)7/b8",False,0.265043
6328505,c2d86f19e139,InChI=1S/C3F3NS/c4-1-2(5)8-3(6)7-1,True,0.124100
6328506,c2d86f19e139,"InChI=1S/C3Cl3NS/c4-1-2(5)8-3(6)7-1/h(H,7,8)",False,0.147435
