In [1]:
from pathlib import Path

import click
import numpy as np
import pandas as pd
import Levenshtein
from tqdm.auto import tqdm
from loguru import logger

In [2]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule.preprocessors import normalize_inchi_batch, disable_rdlogger

In [3]:
# Output directory constant (absolute path on the author's machine).
# NOTE(review): not referenced in the visible cells — presumably where the
# candidate tables are written later; confirm against the rest of the notebook.
OUTDIR = Path("/work/input/kfujikawa/kf-bms-candidates-v2")

## Load training dataset

In [4]:
# Column names, in order, for the candidate tables.
# NOTE(review): not used in the visible cells — presumably selects the columns
# that get written out later; confirm against the rest of the notebook.
OUT_COLUMNS = ["image_id", "InChI", "levenshtein", "is_valid"]

In [5]:
# Project helper imported from nncomp_molecule.preprocessors.
# NOTE(review): presumably silences RDKit logging — confirm in the helper.
disable_rdlogger()
# Starts empty; not populated anywhere in the visible cells.
candidate_ids = set()
# Preprocessed training table, re-indexed by image_id.
# NOTE: unpickling executes arbitrary code embedded in the file, so this is
# only safe for locally produced, trusted artifacts.
origin_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl").set_index("image_id")

## Load validation candidates: kf_0523, kf_0525, yokoo_0527, kf_0527

In [6]:
# Candidate CSVs to combine: the same four candidate files from two model
# output directories (1109_vtnt_bert_... and 1113_swin_large_bert_384).
VALID_CSVs = """
/work/output/1109_vtnt_bert_512-1024-denoise-5/kf-bms-candidates/valid_kf_0523.csv
/work/output/1109_vtnt_bert_512-1024-denoise-5/kf-bms-candidates/valid_kf_0525.csv
/work/output/1109_vtnt_bert_512-1024-denoise-5/kf-bms-candidates/valid_yokoo_0527.csv
/work/output/1109_vtnt_bert_512-1024-denoise-5/kf-bms-candidates/valid_kf_0527.csv
/work/output/1113_swin_large_bert_384/kf-bms-candidates/valid_kf_0523.csv
/work/output/1113_swin_large_bert_384/kf-bms-candidates/valid_kf_0525.csv
/work/output/1113_swin_large_bert_384/kf-bms-candidates/valid_yokoo_0527.csv
/work/output/1113_swin_large_bert_384/kf-bms-candidates/valid_kf_0527.csv
""".strip().split()

# Stack every candidate file into one long table. `filename` keeps only the
# basename, so it identifies the candidate set but NOT which model directory
# the row came from (both directories use the same file names).
valid_df = pd.concat(
    [
        pd.read_csv(path).assign(filename=Path(path).name)
        for path in tqdm(VALID_CSVs)
    ],
    ignore_index=True,
)

# Average the numeric columns of rows sharing the same (image_id, InChI).
# numeric_only=True is required because `filename` is a string column:
# pandas >= 2.0 raises TypeError when asked to average it, and older versions
# only worked by silently dropping it. Boolean columns are still averaged
# (to floats), exactly as before.
valid_ensembled_df = (
    valid_df
    .groupby(["image_id", "InChI"])
    .mean(numeric_only=True)
    .reset_index()
)

# Mean over images of the smallest Levenshtein distance among each image's
# candidates, i.e. the score reached if the best candidate were always picked.
lower_score = valid_df.groupby("image_id").levenshtein.min().mean()
print(lower_score)
valid_df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))


0.14262083417553906


Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,filename
0,a3c096ab64e7,InChI=1S/C56H90O6/c1-4-7-10-13-16-19-22-25-26-...,94.0,False,0.289551,valid_kf_0523.csv
1,01d9b7ce6ba1,InChI=1S/C61H104O6/c1-4-7-10-13-16-19-22-24-26...,104.0,True,0.042877,valid_kf_0523.csv
2,75c32fbd3779,InChI=1S/C72H135NO5/c1-3-5-7-9-11-13-15-17-19-...,119.0,False,0.200317,valid_kf_0523.csv
3,b440ab35d6a0,InChI=1S/C57H92O6/c1-4-7-10-13-16-19-22-25-27-...,74.0,False,0.250000,valid_kf_0523.csv
4,807b043f3c86,InChI=1S/C55H88O6/c1-4-7-10-13-16-19-22-25-27-...,72.0,False,0.197021,valid_kf_0523.csv
...,...,...,...,...,...,...
4883017,cbaa4844a4a4,"InChI=1S/CBrF7NS/c2-1(3)10-11(4,5,6,7)8",5.0,False,0.260986,valid_kf_0527.csv
4883018,ff46160b9517,"InChI=1S/CH5N/c1-2-3/h2-4H,1H3/i1D3,2D",5.0,False,0.343994,valid_kf_0527.csv
4883019,ff46160b9517,"InChI=1S/CH3N/c1-2-3/h2H,1H3/i1D3,2D2",4.0,False,0.383789,valid_kf_0527.csv
4883020,ff46160b9517,"InChI=1S/CH3N/c1-2/h2H2/i1D3,2D",5.0,False,0.198486,valid_kf_0527.csv


In [9]:
%matplotlib inline
n_valid_inchis = valid_ensembled_df.groupby(["image_id"]).is_valid.sum()
valid_ensembled_df["n_valid_inchis"] = valid_ensembled_df.image_id.map(n_valid_inchis)
_df = valid_ensembled_df.groupby(["n_valid_inchis", "image_id"]).levenshtein.min().groupby("n_valid_inchis").agg(["count", "mean"])
_df2 = valid_ensembled_df.query("is_valid").groupby(["n_valid_inchis", "image_id"]).levenshtein.min().groupby("n_valid_inchis").agg(["count", "mean"])
_df["lowest levenshtein (valid only)"] = _df2["mean"]
_df.rename(columns=dict(mean="lowest levenshtein"))

Unnamed: 0_level_0,count,lowest levenshtein,lowest levenshtein (valid only)
n_valid_inchis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,859,18.728754,
1,43347,0.122984,0.279973
2,68834,0.056542,0.108304
3,50101,0.049001,0.076765
4,41735,0.05142,0.079214
5,17116,0.035873,0.052232
6,8685,0.056074,0.07795
7,3169,0.091196,0.104449
8,6296,0.189803,0.19155
9,1115,0.603587,0.66009
