In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import numpy as np
import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [2]:
GCP_PROJECT = "dena-ai-training-28-gcp"


@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    url = urlparse(path)
    bucket_name = url.netloc
    storage_client = gcs.Client(project=GCP_PROJECT)
    bucket = storage_client.get_bucket(bucket_name)
    blob = gcs.Blob(url.path[1:], bucket)
    return blob.download_as_string()


def load_prediction(path: str):
    if path.split("/")[-2] == "kf-bms-candidates-v2":
        model = path.split("/")[-3]
    else:
        model = path.split("/")[-2]
    df = pd.read_csv(io.BytesIO(download_from_gcs(path)))\
        .assign(model=model, filename=path.split("/")[-1])\
        .query("is_valid | image_id.isin(@NO_VALID_IMAGE_IDs)", engine="python")
    return df

In [3]:
n_valid_InChIs = pd.read_csv(io.BytesIO(download_from_gcs("gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv")))
NO_VALID_IMAGE_IDs = n_valid_InChIs.query("n_valid_InChIs == 0").image_id
len(NO_VALID_IMAGE_IDs)

5152

In [4]:
baseline_059_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB059+kf0527.csv"
)))
baseline_056_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB056.csv"
)))
baseline_055_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB055_LB056+camaro*1.5.csv"
)))
baseline_055_full_candidates_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB055_LB056+full_candidates.csv"
)))
baseline_054_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB054_LB055+camaro_predictions.csv"
)))
baseline_054_c25_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB054_LB054_2.5*camaro.csv"
)))

# Load models

In [5]:
TEST_FILENAMES = [
    "test_kf_0523.csv",
    "test_kf_0525.csv",
    "test_kf_0527.csv",
    "test_yokoo_0527.csv",
    "test_camaro_0525.csv",
    "test_kf_0531.csv",
    "test_kf_0531_renormed.csv",
    "test_camaro_0531.csv",
    "test_camaro_old_submissions.csv",
    "test_yokoo_0531.csv",
    "test_yokoo_0601.csv",
]
KF_MODELS = [
    "1109_vtnt_bert_512-1024-denoise-5",
    "1113_swin_large_bert_384",
    "1124_swin_large_bert_384_pil_pseudo",
    "1126_swin_large_bert_384_pil_pseudo_no-denoise",
    "1127_vtnt_bert_512-1024_pseudo_no-denoise",
]
KF_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/{model}/kf-bms-candidates-v2/{filename}"
    for model in KF_MODELS
    for filename in TEST_FILENAMES
]

In [6]:
LYAKAAP_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_yokoo_0601.csv
""".strip().split()

In [7]:
KYAKAAP_TEST_CSVs = [*KF_TEST_CSVs, *LYAKAAP_TEST_CSVs]

In [8]:
with mp.Pool() as pool:
    total = len(KYAKAAP_TEST_CSVs) 
    iterator = pool.imap_unordered(load_prediction, KYAKAAP_TEST_CSVs)
    kyakaap_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
kyakaap_df = kyakaap_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(kyakaap_df.head(1))
display(kyakaap_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=88.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename,ce_score,focal_score
0,00073e401fa1,InChI=1S/C20H27NO9S/c1-12(22)26-11-14-16-18(28...,,True,0.04187,1109_vtnt_bert_512-1024-denoise-5,test_camaro_0531.csv,,


model
1109_vtnt_bert_512-1024-denoise-5                 5811688
1113_swin_large_bert_384                          5811688
1124_swin_large_bert_384_pil_pseudo               5811688
1126_swin_large_bert_384_pil_pseudo_no-denoise    5811688
1127_vtnt_bert_512-1024_pseudo_no-denoise         5811688
v52                                               5811688
v54                                               5811688
v55                                               5811688
Name: image_id, dtype: int64

In [9]:
weights = {
    '1109_vtnt_bert_512-1024-denoise-5': 1/6,
    '1113_swin_large_bert_384': 1/10,
    '1119_swin_large_bert_384_bpe': 1/8,
    '1124_swin_large_bert_384_pil_pseudo': 1/10,
    '1126_swin_large_bert_384_pil_pseudo_no-denoise': 1/4,
    '1127_vtnt_bert_512-1024_pseudo_no-denoise': 1/4,
    'v52': 1/6,
    'v54': 3/7,
    'v55': 2/5,
}
kyakaap_df["weight"] = kyakaap_df.model.map(weights)
kyakaap_df["score"] = np.where(
    kyakaap_df.normed_score.isna(),
    kyakaap_df.ce_score,
    kyakaap_df.normed_score,
)
kyakaap_df["weighted_score"] = kyakaap_df.score * kyakaap_df.weight
kyakaap_ensembled_df = kyakaap_df\
    .groupby(["image_id", "InChI"])[["is_valid", "weighted_score"]]\
    .mean().reset_index()

In [10]:
kyakaap_ensembled_df["ranked_score"] = kyakaap_ensembled_df.weighted_score.rank() / len(kyakaap_ensembled_df)

In [11]:
pd.pivot_table(
    kyakaap_df.groupby(['model', 'filename'])['score'].mean().to_frame(),
    index='model',
    columns='filename',
    values='score',
)

filename,test_camaro_0525.csv,test_camaro_0531.csv,test_camaro_old_submissions.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0527.csv,test_kf_0531.csv,test_kf_0531_renormed.csv,test_yokoo_0527.csv,test_yokoo_0531.csv,test_yokoo_0601.csv
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1109_vtnt_bert_512-1024-denoise-5,0.197481,0.172908,0.239684,0.073668,0.175081,0.16786,0.129182,0.147568,0.158916,0.234165,0.097581
1113_swin_large_bert_384,0.223169,0.175085,0.248445,0.080153,0.200236,0.18966,0.115637,0.150475,0.136249,0.209635,0.101402
1124_swin_large_bert_384_pil_pseudo,0.224591,0.179138,0.255915,0.08346,0.210326,0.198847,0.106689,0.159312,0.136406,0.210932,0.103632
1126_swin_large_bert_384_pil_pseudo_no-denoise,0.231299,0.181395,0.256025,0.084719,0.210903,0.199759,0.106808,0.160482,0.134062,0.211595,0.103591
1127_vtnt_bert_512-1024_pseudo_no-denoise,0.210365,0.181628,0.25077,0.076832,0.183951,0.177085,0.12727,0.153513,0.16291,0.243553,0.099789
v52,0.215945,0.170441,0.232724,0.08604,0.226494,0.211242,0.168141,0.172429,0.03546,0.125098,0.08809
v54,0.236781,0.171078,0.240433,0.110063,0.252607,0.247524,0.202302,0.217668,0.154959,0.169041,0.1105
v55,0.211917,0.170247,0.237524,0.07942,0.20906,0.197351,0.172651,0.168056,0.049787,0.122929,0.083128


# Camaro

In [12]:
CAMARO_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp072/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_camaro_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_camaro_old_submissions.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0531_renormed.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_yokoo_0601.csv
""".strip().split()

In [13]:
with mp.Pool() as pool:
    total = len(CAMARO_TEST_CSVs) 
    iterator = pool.imap_unordered(load_prediction, CAMARO_TEST_CSVs)
    camaro_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
camaro_df = camaro_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(camaro_df.head(1))
display(camaro_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=66.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename
0,00073e401fa1,InChI=1S/C20H27NO9S/c1-12(22)26-11-14-16-18(28...,,True,0.006505,0.033341,exp072,test_camaro_0531.csv


model
exp072        5811688
exp084        5811688
exp0845_v2    5811688
exp090        5811688
exp1031_v2    5811688
exp103_v2     5811688
Name: image_id, dtype: int64

In [14]:
pd.pivot_table(
    camaro_df.groupby(['model', 'filename'])['focal_score'].mean().to_frame(),
    index='model',
    columns='filename',
    values='focal_score',
)

filename,test_camaro_0525.csv,test_camaro_0531.csv,test_camaro_old_submissions.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0527.csv,test_kf_0531.csv,test_kf_0531_renormed.csv,test_yokoo_0527.csv,test_yokoo_0531.csv,test_yokoo_0601.csv
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
exp072,0.085077,0.034238,0.0422,0.043092,0.098968,0.09464,0.063229,0.09035,0.051519,0.076383,0.034831
exp084,0.077179,0.036152,0.039644,0.048423,0.107337,0.103948,0.072209,0.104058,0.062715,0.088523,0.043798
exp0845_v2,0.068741,0.035434,0.040226,0.045832,0.098376,0.097883,0.056318,0.099572,0.044047,0.063683,0.044449
exp090,0.135609,0.054126,0.063926,0.07322,0.16323,0.160085,0.093917,0.15865,0.076082,0.114352,0.053802
exp1031_v2,0.063571,0.006897,0.057757,0.037167,0.083149,0.078719,0.067474,0.084153,0.060346,0.079419,0.048423
exp103_v2,0.084561,0.017677,0.059961,0.042365,0.092994,0.089489,0.068645,0.093591,0.0645,0.079838,0.057034


In [15]:
camaro_models_df = pd.pivot_table(
    camaro_df,
    index='InChI',
    columns='model',
    values='focal_score',
)
camaro_models_df.corr()

model,exp072,exp084,exp0845_v2,exp090,exp1031_v2,exp103_v2
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
exp072,1.0,0.907523,0.913617,0.897345,0.834881,0.85192
exp084,0.907523,1.0,0.929761,0.86122,0.829491,0.845633
exp0845_v2,0.913617,0.929761,1.0,0.882984,0.811109,0.819227
exp090,0.897345,0.86122,0.882984,1.0,0.753844,0.771339
exp1031_v2,0.834881,0.829491,0.811109,0.753844,1.0,0.943083
exp103_v2,0.85192,0.845633,0.819227,0.771339,0.943083,1.0


In [19]:
camaro_df

Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename,weight,weighted_score
0,00073e401fa1,InChI=1S/C20H27NO9S/c1-12(22)26-11-14-16-18(28...,,True,0.006505,0.033341,exp072,test_camaro_0531.csv,0.111111,0.000723
1,000e9411226f,InChI=1S/C32H37NO6S2/c1-6-9-11-20(4)30(35)38-2...,,True,0.005854,0.035000,exp072,test_camaro_0531.csv,0.111111,0.000650
2,001111d13754,InChI=1S/C24H40O4S/c1-2-3-4-5-6-7-10-13-16-29-...,,True,0.007149,0.036204,exp072,test_camaro_0531.csv,0.111111,0.000794
3,001ae62e2309,InChI=1S/C11H11BrO4/c12-8-5-3(10(15)16-7(5)8)4...,,False,0.067335,0.333439,exp072,test_camaro_0531.csv,0.111111,0.007482
4,0020f8bfc5ff,InChI=1S/C27H38N6O4S/c1-16(24(35)30-8-9-33-10-...,,True,0.005362,0.027054,exp072,test_camaro_0531.csv,0.111111,0.000596
...,...,...,...,...,...,...,...,...,...,...
34877407,c7e9c2a0635e,"InChI=1S/CH6BCl2NSi3/c2-3(4)5(6)7/h1,6-7H2",,False,0.525929,1.246924,exp090,test_kf_0523.csv,0.125000,0.065741
34877408,c7e9c2a0635e,InChI=1S/CH4BCl2NSi3/c2-3(4)5(6)7/h6-7H2,,False,0.496213,1.139359,exp090,test_kf_0523.csv,0.125000,0.062027
34877409,d78db5f90e4b,"InChI=1S/C2H5IO/c3-1-2-4/h4H,1-2H2/i1D,2D2",,True,0.087313,0.268302,exp090,test_kf_0523.csv,0.125000,0.010914
34877410,e5dca4c9bebd,"InChI=1S/CH4OS/c1-2-3/h2H,1H3/i1D3,2D",,True,0.233525,0.678822,exp090,test_kf_0523.csv,0.125000,0.029191


In [20]:
weights = {
    'exp0845_v2': 1/4,
    'exp1031_v2': 1/5,
    'exp103_v2' : 1/6,
    'exp084':  1/7,
    'exp090':  1/8,
    'exp072':  1/9,
}
camaro_df["weight"] = camaro_df.model.map(weights)
camaro_df["weighted_score"] = camaro_df.focal_score * camaro_df.weight
camaro_ensembled_df = camaro_df\
    .groupby(["image_id", "InChI"])[["is_valid", "weighted_score"]]\
    .mean().reset_index()
camaro_ensembled_df["ranked_score"] = camaro_ensembled_df.weighted_score.rank() / len(camaro_ensembled_df)

# ランク平均

In [21]:
merged_df = kyakaap_ensembled_df.merge(
    camaro_ensembled_df,
    on=["image_id", "InChI", "is_valid"],
    suffixes=["_kyakaap", "_camaro"],
    how="outer",
)

In [22]:
camaro_weight = 2
merged_df["ranked_score_camaro_fillna"] = merged_df.ranked_score_camaro.fillna(merged_df.ranked_score_kyakaap)
merged_df["ranked_score_kyakaap_fillna"] = merged_df.ranked_score_kyakaap.fillna(merged_df.ranked_score_camaro)
merged_df["weighted_ranked_score"] = (merged_df.ranked_score_kyakaap_fillna + camaro_weight * merged_df.ranked_score_camaro_fillna)

In [23]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    weighted_ranked_score=True,
)
merged_ensembled_df = merged_df.groupby(["image_id", "InChI"])[["weighted_ranked_score", "is_valid"]].mean().reset_index()
merged_ensembled_df = merged_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()

# ベースラインとの比較

In [24]:
submission_merged_df = merged_ensembled_df.merge(baseline_055_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(20732, 6)
0.16327013000995602


In [25]:
submission_merged_df = merged_ensembled_df.merge(baseline_059_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(47153, 6)
0.22470975003511526


In [26]:
submission_merged_df = merged_ensembled_df.merge(baseline_056_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(24413, 6)
0.18264137213686965


In [27]:
submission_merged_df = merged_ensembled_df.merge(baseline_055_full_candidates_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(11320, 6)
0.06301315445078823


In [28]:
submission_merged_df = merged_ensembled_df.merge(baseline_054_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(4347, 6)
0.03293531925794517


In [30]:
submission_merged_df = merged_ensembled_df.merge(baseline_054_c25_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(6587, 6)
0.03779576476062538


# Submission

In [31]:
filename = "submission_0717_LB054+weight_tuned.csv"
submission_df = merged_ensembled_df[["image_id", "InChI"]]
assert len(submission_df) == 1616107
submission_df.to_csv(filename, index=False)
!head $filename

image_id,InChI
00000d2a601c,"InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-7(12-2)8-5-13-17-15-8/h5,7,12H,4H2,1-3H3"
00001f7fc849,"InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18-13)8-11-3-5-12(15)6-4-11/h3-6,10,16H,2,7-9H2,1H3,(H,17,18)"
000037687605,"InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)16(8-12)19-10-14-4-2-3-5-15(14)17/h2-8,19H,10H2,1H3"
00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-19(12)8-9-5-6-10(20-4)7-11(9)15/h5-7H,8,16H2,1-4H3"
00004df0fe53,"InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)6/h4-8H,2-3H2,1H3/t4-,5-,6-,7-,8-/m1/s1"
000085dab281,"InChI=1S/C20H38O/c1-20(2)18-16-14-12-10-8-6-4-3-5-7-9-11-13-15-17-19-21/h17,20H,3-16,18H2,1-2H3"
00008decfc8d,"InChI=1S/C15H26N2/c1-5-10-16-15(11-12(3)6-2)14-9-7-8-13(4)17-14/h7-9,12,15-16H,5-6,10-11H2,1-4H3"
00008e8fe68c,"InChI=1S/C22H25Cl2N3O6/c1-6-32-20-13(23)8-9-14(21(20)33-7-2)26-27-18(12(3)28)22(29)25-19-16(31-5)11-10-15(30-4)17(19)24/h8-11,18H,6-7H2,1-5H3,(H,25,29)"
000095714f0f,"InChI=1S/C25H30ClN3O2/c1-17-4-9-23