In [2]:
import gc
import io
from copy import deepcopy
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import seaborn as sns
import Levenshtein
from tqdm.auto import tqdm

In [6]:
# Paths of the validation candidate CSVs that are concatenated into one frame
# later in the notebook.
VALID_CSVs = [
    "/work/output/1113_swin_large_bert_384/kf-bms-candidates/valid_kf_0523.csv",
    "/work/output/1109_vtnt_bert_512-1024-denoise-5/kf-bms-candidates/valid_kf_0523.csv",
    "/work/input/camaro/exp084/valid_kf_0523.csv",
]
# Paths of the test candidate CSVs, listed in the same directory order as above.
TEST_CSVs = [
    "/work/output/1113_swin_large_bert_384/kf-bms-candidates/test_kf_0523.csv",
    "/work/output/1109_vtnt_bert_512-1024-denoise-5/kf-bms-candidates/test_kf_0523.csv",
    "/work/input/camaro/exp084/test_kf_0523.csv",
]

In [14]:
def load_prediction(path: str):
    """Read one prediction CSV and tag every row with its model and user.

    Both tags are taken from the "/"-separated components of ``path``:

    * If the file's parent directory is ``kf-bms-candidates``, the user is
      ``"kfujikawa"`` and the model is the directory one level above it.
    * Otherwise the user is the component at index 3 of the split path and
      the model is the file's parent directory.
      NOTE(review): index 3 only lands on the user directory for absolute
      paths shaped like ``/a/b/<user>/...`` — confirm before reusing this
      with other layouts.

    Args:
        path: Location of the CSV file, using "/" as the separator.

    Returns:
        The CSV contents as a DataFrame with ``model`` and ``user`` columns
        added.
    """
    parts = path.split("/")
    parent_dir = parts[-2]
    if parent_dir == "kf-bms-candidates":
        user, model = "kfujikawa", parts[-3]
    else:
        user, model = parts[3], parent_dir
    return pd.read_csv(path).assign(model=model, user=user)

# アンサンブル（Valid）

In [15]:
# Stack every validation CSV into one frame and keep each
# (image_id, model, InChI) combination only once.
valid_df = (
    pd.concat(
        [load_prediction(csv_path) for csv_path in tqdm(VALID_CSVs)],
        ignore_index=True,
    )
    .drop_duplicates(["image_id", "model", "InChI"])
    .reset_index(drop=True)
)

# Restrict to the image ids listed in the shared-id CSV.
shared_valid_ids = pd.read_csv(
    "/work/input/kfujikawa/kf-bms-candidates/shared_valid_image_ids_kf_camaro.csv"
)["image_id"]
common_valid_df = valid_df.loc[valid_df["image_id"].isin(shared_valid_ids)]

# Show one sample row, then per-model summary statistics with 4 decimals.
display(valid_df.head(1))
with pd.option_context("display.float_format", '{:.4f}'.format):
    display(valid_df.groupby("model").describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,user,focal_score,ce_score
0,a3c096ab64e7,InChI=1S/C56H90O6/c1-4-7-10-13-16-19-22-25-26-...,94,False,0.03326,1113_swin_large_bert_384,kfujikawa,,


Unnamed: 0,model,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,exp084
levenshtein,count,1454637.0,1454637.0,1454637.0
levenshtein,mean,15.7413,15.7413,15.7413
levenshtein,std,19.4499,19.4499,19.4499
levenshtein,min,0.0,0.0,0.0
levenshtein,25%,1.0,1.0,1.0
levenshtein,50%,5.0,5.0,5.0
levenshtein,75%,31.0,31.0,31.0
levenshtein,max,185.0,185.0,185.0
normed_score,count,1454637.0,1454637.0,0.0
normed_score,mean,0.1412,0.1258,


In [16]:
# Ensemble the kfujikawa models on the shared validation images: average the
# numeric columns per (image_id, InChI) candidate, then keep the top-ranked
# candidate of every image.
kf_valid_df = common_valid_df.query("user == 'kfujikawa'")
# Column -> ascending flag used for ranking: rows are grouped by image_id,
# higher is_valid comes first, and ties are broken by lower normed_score.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
# numeric_only=True is required because the frame still carries the string
# columns added by load_prediction (model, user). pandas >= 2.0 raises a
# TypeError when asked to average them; older versions silently dropped them,
# so the result is unchanged there.
kf_valid_ensembled_df = (
    kf_valid_df.groupby(["image_id", "InChI"]).mean(numeric_only=True).reset_index()
)
# NOTE: GroupBy.first() takes the first non-null value of each column inside a
# group, so a column that is NaN in the top-ranked row is filled from a lower row.
kf_valid_ensembled_df = kf_valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first()
# Mean Levenshtein value of the selected candidates (shown as the cell output).
kf_valid_ensembled_df.levenshtein.mean()

0.7174587209789623

In [17]:
# Ensemble the camaro predictions on the shared validation images: average the
# numeric columns per (image_id, InChI) candidate, then keep the top-ranked
# candidate of every image.
camaro_valid_df = common_valid_df.query("user == 'camaro'")
# Column -> ascending flag used for ranking: rows are grouped by image_id,
# higher is_valid comes first, and ties are broken by lower focal_score.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    focal_score=True,
)
# numeric_only=True is required because the frame still carries the string
# columns added by load_prediction (model, user). pandas >= 2.0 raises a
# TypeError when asked to average them; older versions silently dropped them,
# so the result is unchanged there.
camaro_valid_ensembled_df = (
    camaro_valid_df.groupby(["image_id", "InChI"]).mean(numeric_only=True).reset_index()
)
# NOTE: GroupBy.first() takes the first non-null value of each column inside a
# group, so a column that is NaN in the top-ranked row is filled from a lower row.
camaro_valid_ensembled_df = camaro_valid_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first()
# Mean Levenshtein value of the selected candidates (shown as the cell output).
camaro_valid_ensembled_df.levenshtein.mean()

0.5381778560053642

## KF vs camaro の選択結果同士のLevenshtein

In [18]:
# Join the two per-image selections on image_id; merge suffixes the clashing
# InChI columns as InChI_x (left frame) and InChI_y (right frame).
merged_df = kf_valid_ensembled_df.merge(camaro_valid_ensembled_df, on="image_id")
# Average edit distance between the two selected strings of each image.
np.mean([
    Levenshtein.distance(left_inchi, right_inchi)
    for left_inchi, right_inchi in zip(merged_df["InChI_x"], merged_df["InChI_y"])
])

0.33341714860447574

# アンサンブル（Test）

In [19]:
# Stack every test CSV into one frame (rows tagged with model / user by
# load_prediction).
test_df = pd.concat(
    [load_prediction(csv_path) for csv_path in tqdm(TEST_CSVs)],
    ignore_index=True,
)

# Show one sample row, then per-model summary statistics with 4 decimals.
display(test_df.head(1))
with pd.option_context("display.float_format", '{:.4f}'.format):
    display(test_df.groupby("model").describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,user,focal_score,ce_score
0,9ba7f91f0a40,InChI=1S/C67H127NO4/c1-3-5-7-9-11-13-15-17-19-...,,False,0.024314,1113_swin_large_bert_384,kfujikawa,,


Unnamed: 0,model,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,exp084
levenshtein,count,0.0,0.0,0.0
levenshtein,mean,,,
levenshtein,std,,,
levenshtein,min,,,
levenshtein,25%,,,
levenshtein,50%,,,
levenshtein,75%,,,
levenshtein,max,,,
normed_score,count,9702217.0,9702217.0,0.0
normed_score,mean,0.1407,0.1407,


In [20]:
# Ensemble the kfujikawa models on the test set: average the numeric columns
# per (image_id, InChI) candidate, then keep the top-ranked candidate of
# every image.
kf_test_df = test_df.query("user == 'kfujikawa'")
# Column -> ascending flag used for ranking: rows are grouped by image_id,
# higher is_valid comes first, and ties are broken by lower normed_score.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
# numeric_only=True is required because the frame still carries the string
# columns added by load_prediction (model, user). pandas >= 2.0 raises a
# TypeError when asked to average them; older versions silently dropped them,
# so the result is unchanged there.
kf_test_ensembled_df = (
    kf_test_df.groupby(["image_id", "InChI"]).mean(numeric_only=True).reset_index()
)
# NOTE: GroupBy.first() takes the first non-null value of each column inside a
# group, so a column that is NaN in the top-ranked row is filled from a lower row.
kf_test_ensembled_df = kf_test_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first()

In [21]:
# Ensemble the camaro predictions on the test set: average the numeric columns
# per (image_id, InChI) candidate, then keep the top-ranked candidate of
# every image.
camaro_test_df = test_df.query("user == 'camaro'")
# Column -> ascending flag used for ranking: rows are grouped by image_id,
# higher is_valid comes first, and ties are broken by lower focal_score.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    focal_score=True,
)
# numeric_only=True is required because the frame still carries the string
# columns added by load_prediction (model, user). pandas >= 2.0 raises a
# TypeError when asked to average them; older versions silently dropped them,
# so the result is unchanged there.
camaro_test_ensembled_df = (
    camaro_test_df.groupby(["image_id", "InChI"]).mean(numeric_only=True).reset_index()
)
# NOTE: GroupBy.first() takes the first non-null value of each column inside a
# group, so a column that is NaN in the top-ranked row is filled from a lower row.
camaro_test_ensembled_df = camaro_test_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first()

nan

## KF vs camaro の選択結果同士のLevenshtein

In [22]:
# Join the two per-image test selections on image_id; merge suffixes the
# clashing InChI columns as InChI_x (left frame) and InChI_y (right frame).
merged_df = kf_test_ensembled_df.merge(camaro_test_ensembled_df, on="image_id")
# Average edit distance between the two selected strings of each image.
np.mean([
    Levenshtein.distance(left_inchi, right_inchi)
    for left_inchi, right_inchi in zip(merged_df["InChI_x"], merged_df["InChI_y"])
])

0.4660539184596069

In [None]:
# Plot histograms (log=True) of the normed_score column of test_ensembled_df
# and valid_ensembled_df.
# NOTE(review): neither test_ensembled_df nor valid_ensembled_df is assigned in
# any cell visible here (only the kf_* / camaro_* variants are), so this cell
# would fail with NameError on a fresh kernel unless they are defined
# elsewhere — confirm which frames are intended.
test_ensembled_df.normed_score.hist(log=True)
valid_ensembled_df.normed_score.hist(log=True)

In [None]:
# Write the image_id / InChI columns of test_ensembled_df to submission.csv.
# NOTE(review): test_ensembled_df is not assigned in any cell visible here, and
# it must expose image_id as a column (the kf_* / camaro_* ensembled frames
# have it as the index) — confirm where it is built.
submission_df = test_ensembled_df[["image_id", "InChI"]]
# Row-count guard; 1616107 is presumably the expected number of test images —
# TODO confirm and consider naming the constant.
assert len(submission_df) == 1616107
submission_df.to_csv("submission.csv", index=False)
# IPython shell escapes: print the first lines of the written file and its
# line / word / byte counts.
!head submission.csv
!wc submission.csv