In [1]:
import cv2
import Levenshtein
import optuna
import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.metrics
import ipywidgets
import matplotlib.pyplot as plt
import multiprocessing as mp
from tqdm.auto import tqdm
tqdm.pandas()

  from pandas import Panel


In [2]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule import constants

In [3]:
# train_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl").set_index("image_id")
# train_df.head()

In [4]:
VALID_CSVs = [
    constants.OUTPUTDIR / "1113_swin_large_bert_384" / "valid_candidates_kf-bms-candidates-0523.csv",
    constants.OUTPUTDIR / "1109_vtnt_bert_512-1024-denoise-5" / "valid_candidates_kf-bms-candidates-0523.csv",
]
COLUMNS = [
    "image_id",
    "normed_InChI",
    "is_valid",
    "normed_score",
    "InChI_GT",
    "levenshtein",
]
valid_df = pd.concat([
    pd.read_csv(filename, usecols=COLUMNS).assign(model=filename.parent.name)
    for filename in tqdm(VALID_CSVs)
], ignore_index=True)
display(valid_df.groupby("model").agg(**{
    "count": ("image_id", "count"),
    "levenshtein": ("levenshtein", "mean"),
    "is_valid": ("is_valid", "mean"),
    "normed_score": ("normed_score", "mean")
}))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0_level_0,count,levenshtein,is_valid,normed_score
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1109_vtnt_bert_512-1024-denoise-5,1454637,15.741277,0.381213,0.141157
1113_swin_large_bert_384,1454637,15.741277,0.381213,0.125802


In [5]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
valid_ensembled_df = valid_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby(["image_id", "model"]).first()
valid_ensembled_df.groupby("model").levenshtein.mean()

model
1109_vtnt_bert_512-1024-denoise-5    0.784905
1113_swin_large_bert_384             0.773784
Name: levenshtein, dtype: float64

### モデルの分布差異を無視した、normed_score最小のnormed_InChI選択

In [6]:
base_score = valid_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().levenshtein.mean()
lower_score = valid_df.groupby("image_id").levenshtein.min().mean()

print(f"Levenshtein: {base_score}")
print(f"Levenshtein (lower): {lower_score}")

Levenshtein: 0.7544994410504127
Levenshtein (lower): 0.19324392889996247


### normed_score のモデル間平均

In [7]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
valid_ensembled_df = valid_df.groupby(["image_id", "normed_InChI"]).mean()
base_score = valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().levenshtein.mean()
lower_score = valid_ensembled_df.groupby("image_id").levenshtein.min().mean()

print(f"Levenshtein: {base_score}")
print(f"Levenshtein (lower): {lower_score}")

Levenshtein: 0.7326900944232919
Levenshtein (lower): 0.19324392889996247


### ランク平均

In [20]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    rank=True,
)
valid_df["rank"] = valid_df.groupby(["model"]).normed_score.rank()
valid_ensembled_df = valid_df.groupby(["image_id", "normed_InChI"]).mean()
base_score = valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().levenshtein.mean()

print(f"Levenshtein: {base_score}")

Levenshtein: 0.7378340806619943


### 標準化

In [21]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    scaled_normed_score=True,
)
for model in valid_df.model.unique():
    _valid_df = valid_df.query("model == @model")
    mean, std = _valid_df.normed_score.agg(["mean", "std"])
    valid_df["scaled_normed_score"] = (_valid_df.normed_score - mean) / std
valid_ensembled_df = valid_df.groupby(["image_id", "normed_InChI"]).mean()
base_score = valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().levenshtein.mean()

print(f"Levenshtein: {base_score}")

Levenshtein: 0.7848518474212005


In [16]:
valid_df["scaled_normed_score"] = (_valid_df.normed_score - mean) / std
valid_df["scaled_normed_score"]

0               NaN
1               NaN
2               NaN
3               NaN
4               NaN
             ...   
2909269    1.046160
2909270    0.417296
2909271    0.720687
2909272    1.228578
2909273    0.418256
Name: scaled_normed_score, Length: 2909274, dtype: float64

In [15]:
valid_ensembled_df = valid_df.groupby(["image_id", "normed_InChI"]).mean()
base_score = valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().levenshtein.mean()
lower_score = valid_ensembled_df.groupby("image_id").levenshtein.min().mean()

print(f"Levenshtein: {base_score}")
print(f"Levenshtein (lower): {lower_score}")

Unnamed: 0,image_id,InChI_GT,normed_InChI,is_valid,levenshtein,normed_score,model,rank
1417137,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,True,0,4.885595e-07,1109_vtnt_bert_512-1024-denoise-5,1.0
2093094,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,InChI=1S/C13H20OSi/c1-9(2)8-15-13-6-5-10(3)7-1...,True,1,1.939042e-01,1109_vtnt_bert_512-1024-denoise-5,2.0
2093095,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,InChI=1S/C13H20OS/c1-9(2)8-14-13-6-5-10(3)7-12...,True,3,2.211914e-01,1109_vtnt_bert_512-1024-denoise-5,3.0
2093096,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,InChI=1S/C13H18OS/c1-9(2)8-15-13-6-5-10(3)7-12...,False,2,2.287598e-01,1109_vtnt_bert_512-1024-denoise-5,4.0
226450,000011a64c74,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,InChI=1S/C13H20OS/c1-9(2)8-15-13-6-5-10(3)7-12...,True,0,4.357953e-07,1113_swin_large_bert_384,1.0
...,...,...,...,...,...,...,...,...
193411,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,True,0,4.033509e-07,1113_swin_large_bert_384,1.0
805479,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(37)15)9(...,False,3,1.692607e-01,1113_swin_large_bert_384,2.0
805480,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,False,6,1.934423e-01,1113_swin_large_bert_384,3.0
805481,ffffe824f539,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(15)16)9(...,InChI=1S/C10H12F2N2O3/c1-17-7-2-5(3-8(14)15)9(...,False,4,2.017083e-01,1113_swin_large_bert_384,4.0


In [7]:
voting_df = valid_df.groupby(["image_id", "normed_InChI"]).model.count()
valid_df["votes"] = valid_df[["image_id", "normed_InChI"]].progress_apply(
    lambda x: voting_df.loc[x.image_id, x.normed_InChI],
    axis=1
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4848380.0), HTML(value='')))




In [8]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
#     votes=False,
    normed_score=True,
)
base_score = valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().levenshtein.mean()
lower_score = valid_df.groupby("image_id").levenshtein.min().mean()

print(f"Levenshtein: {base_score}")
print(f"Levenshtein (lower): {lower_score}")

Levenshtein: 0.7396903708042687
Levenshtein (lower): 0.16620809425003816


In [9]:
invalid_df = valid_df[valid_df.normed_InChI.apply(lambda x: "/q" in x)]

In [10]:
sample = invalid_df.sort_values(["votes", "image_id"]).iloc[-1]