In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import numpy as np
import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [2]:
GCP_PROJECT = "dena-ai-training-28-gcp"


@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    url = urlparse(path)
    bucket_name = url.netloc
    storage_client = gcs.Client(project=GCP_PROJECT)
    bucket = storage_client.get_bucket(bucket_name)
    blob = gcs.Blob(url.path[1:], bucket)
    return blob.download_as_string()


def load_prediction(path: str):
    if path.split("/")[-2] == "kf-bms-candidates-v2":
        model = path.split("/")[-3]
    else:
        model = path.split("/")[-2]
    df = pd.read_csv(io.BytesIO(download_from_gcs(path)))\
        .assign(model=model, filename=path.split("/")[-1])\
        .query("is_valid | image_id.isin(@NO_VALID_IMAGE_IDs)", engine="python")
    return df

In [3]:
n_valid_InChIs = pd.read_csv(io.BytesIO(download_from_gcs("gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv")))
NO_VALID_IMAGE_IDs = n_valid_InChIs.query("n_valid_InChIs == 0").image_id
len(NO_VALID_IMAGE_IDs)

5152

In [4]:
baseline_059_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB059+kf0527.csv"
)))
baseline_056_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB056.csv"
)))

# Load models

In [5]:
TEST_FILENAMES = [
    "test_kf_0523.csv",
    "test_kf_0525.csv",
    "test_kf_0527.csv",
    "test_yokoo_0527.csv",
    "test_camaro_0525.csv",
#     "test_kf_0531.csv",
#     "test_yokoo_0602.csv",
#     "test_camaro_0531.csv",
]
KF_MODELS = [
    "1109_vtnt_bert_512-1024-denoise-5",
    "1113_swin_large_bert_384",
    "1124_swin_large_bert_384_pil_pseudo",
    "1126_swin_large_bert_384_pil_pseudo_no-denoise",
    "1127_vtnt_bert_512-1024_pseudo_no-denoise",
]
KF_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/{model}/kf-bms-candidates-v2/{filename}"
    for model in KF_MODELS
    for filename in TEST_FILENAMES
]

In [6]:
LYAKAAP_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v54/test_camaro_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v55/test_camaro_0525.csv
""".strip().split()

In [7]:
KYAKAAP_TEST_CSVs = [*KF_TEST_CSVs, *LYAKAAP_TEST_CSVs]

In [8]:
with mp.Pool() as pool:
    total = len(KYAKAAP_TEST_CSVs) 
    iterator = pool.imap_unordered(load_prediction, KYAKAAP_TEST_CSVs)
    kyakaap_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
kyakaap_df = kyakaap_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(kyakaap_df.head(1))
display(kyakaap_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename,ce_score,focal_score
0,007706c43e8d,InChI=1S/C16H32O6/c1-5-17-11-13-21-9-3-7-19-15...,,False,0.192261,1113_swin_large_bert_384,test_yokoo_0527.csv,,


model
1109_vtnt_bert_512-1024-denoise-5                 5004678
1113_swin_large_bert_384                          5004678
1124_swin_large_bert_384_pil_pseudo               5004678
1126_swin_large_bert_384_pil_pseudo_no-denoise    5004678
1127_vtnt_bert_512-1024_pseudo_no-denoise         5004678
v52                                               5004678
v54                                               5004678
v55                                               5004678
Name: image_id, dtype: int64

In [9]:
weights = {
    '1109_vtnt_bert_512-1024-denoise-5': 1/5,
    '1113_swin_large_bert_384': 1/5,
    '1124_swin_large_bert_384_pil_pseudo': 1/5,
    '1126_swin_large_bert_384_pil_pseudo_no-denoise': 1/5,
    '1127_vtnt_bert_512-1024_pseudo_no-denoise': 1/5,
    'v52': 1/3,
    'v54': 1/3,
    'v55': 1/3,
}
kyakaap_df["weight"] = kyakaap_df.model.map(weights)
kyakaap_df["score"] = np.where(
    kyakaap_df.normed_score.isna(),
    kyakaap_df.ce_score,
    kyakaap_df.normed_score,
)
kyakaap_df["ranked_score"] = kyakaap_df.groupby("model").score.rank() / len(kyakaap_df)
kyakaap_df["weighted_score"] = kyakaap_df.ranked_score * kyakaap_df.weight

In [10]:
pd.pivot_table(
    kyakaap_df.groupby(['model', 'filename'])['score'].mean().to_frame(),
    index='model',
    columns='filename',
    values='score',
)

filename,test_camaro_0525.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0527.csv,test_yokoo_0527.csv
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1109_vtnt_bert_512-1024-denoise-5,0.197481,0.073668,0.175081,0.16786,0.158916
1113_swin_large_bert_384,0.221747,0.080153,0.200235,0.18966,0.136249
1124_swin_large_bert_384_pil_pseudo,0.224591,0.08346,0.210325,0.198847,0.136406
1126_swin_large_bert_384_pil_pseudo_no-denoise,0.231299,0.084719,0.210902,0.199759,0.134062
1127_vtnt_bert_512-1024_pseudo_no-denoise,0.210365,0.076833,0.183951,0.177085,0.16291
v52,0.215945,0.08604,0.226494,0.211242,0.03546
v54,0.236781,0.110063,0.252607,0.247524,0.154959
v55,0.211917,0.07942,0.20906,0.197351,0.049787


# Camaro

In [11]:
CAMARO_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845_v2/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103_v2/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031_v2/test_yokoo_0527.csv
""".strip().split()

In [12]:
with mp.Pool() as pool:
    total = len(CAMARO_TEST_CSVs) 
    iterator = pool.imap_unordered(load_prediction, CAMARO_TEST_CSVs)
    camaro_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
camaro_df = camaro_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(camaro_df.head(1))
display(camaro_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename
0,001ae62e2309,InChI=1S/C11H11BrO4/c12-7-3-4-5(9(13)14)6-8(7)...,,False,0.155131,0.587556,exp103_v2,test_yokoo_0527.csv


model
exp084        4973760
exp0845_v2    4973760
exp090        4973760
exp1031_v2    4973760
exp103_v2     4973760
Name: image_id, dtype: int64

In [13]:
pd.pivot_table(
    camaro_df.groupby(['model', 'filename'])['focal_score'].mean().to_frame(),
    index='model',
    columns='filename',
    values='focal_score',
)

filename,test_kf_0523.csv,test_kf_0525.csv,test_kf_0527.csv,test_yokoo_0527.csv
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
exp084,0.048423,0.107337,0.103948,0.062715
exp0845_v2,0.045832,0.098376,0.097883,0.044047
exp090,0.07322,0.163229,0.160085,0.076082
exp1031_v2,0.037167,0.083149,0.078719,0.060346
exp103_v2,0.042365,0.092994,0.089489,0.0645


In [26]:
camaro_len = camaro_df.groupby("model").image_id.transform("count")
camaro_len.nunique()

1

In [40]:
weights = {
    'exp084': 1/5,
    'exp0845_v2': 1/5,
    'exp090': 1/5,
    'exp1031_v2': 1/5,
    'exp103_v2': 1/5,
}
camaro_df["weight"] = camaro_df.model.map(weights)
camaro_df["ranked_score"] = camaro_df.groupby("model").focal_score.rank() / camaro_len
camaro_df["weighted_score"] = camaro_df.ranked_score * camaro_df.weight

In [36]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    weighted_score=True,
)
kyakaap_ensembled_df = kyakaap_df.groupby(["image_id", "InChI"])[["weighted_score", "is_valid"]].mean().reset_index()
kyakaap_ensembled_df = kyakaap_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()

In [37]:
kyakaap_merged_ensembled_df = kyakaap_ensembled_df.merge(baseline_056_df, on="image_id")
kyakaap_merged_ensembled_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(kyakaap_merged_ensembled_df[["InChI_x", "InChI_y"]].values)
]
print(kyakaap_merged_ensembled_df.query("InChI_x != InChI_y").shape)
print(kyakaap_merged_ensembled_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(120763, 6)
0.21392642937627274


In [38]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    weighted_score=True,
)
camaro_ensembled_df = camaro_df.groupby(["image_id", "InChI"])[["weighted_score", "is_valid"]].mean().reset_index()
camaro_ensembled_df = camaro_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()

In [39]:
camaro_merged_ensembled_df = camaro_ensembled_df.merge(baseline_056_df, on="image_id")
camaro_merged_ensembled_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(camaro_merged_ensembled_df[["InChI_x", "InChI_y"]].values)
]
print(camaro_merged_ensembled_df.query("InChI_x != InChI_y").shape)
print(camaro_merged_ensembled_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1615331.0), HTML(value='')))


(28180, 6)
0.14104601471772657


# アンサンブル

In [41]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    weighted_score=True,
)
merged_df = pd.concat([
    camaro_df,
    kyakaap_df,
], ignore_index=True)
merged_ensembled_df = merged_df.groupby(["image_id", "InChI"])[["weighted_score", "is_valid"]].mean().reset_index()
merged_ensembled_df = merged_ensembled_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()

In [42]:
filename = "submission_0710_LB056+full_ranked.csv"
submission_df = merged_ensembled_df[["image_id", "InChI"]]
assert len(submission_df) == 1616107
submission_df.to_csv(filename, index=False)
!head $filename

image_id,InChI
00000d2a601c,"InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-7(12-2)8-5-13-17-15-8/h5,7,12H,4H2,1-3H3"
00001f7fc849,"InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18-13)8-11-3-5-12(15)6-4-11/h3-6,10,16H,2,7-9H2,1H3,(H,17,18)"
000037687605,"InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)16(8-12)19-10-14-4-2-3-5-15(14)17/h2-8,19H,10H2,1H3"
00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-19(12)8-9-5-6-10(20-4)7-11(9)15/h5-7H,8,16H2,1-4H3"
00004df0fe53,"InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)6/h4-8H,2-3H2,1H3/t4-,5-,6-,7-,8+/m1/s1"
000085dab281,"InChI=1S/C20H38O/c1-20(2)18-16-14-12-10-8-6-4-3-5-7-9-11-13-15-17-19-21/h17,20H,3-16,18H2,1-2H3"
00008decfc8d,"InChI=1S/C15H26N2/c1-5-10-16-15(11-12(3)6-2)14-9-7-8-13(4)17-14/h7-9,12,15-16H,5-6,10-11H2,1-4H3"
00008e8fe68c,"InChI=1S/C22H25Cl2N3O6/c1-6-32-20-13(23)8-9-14(21(20)33-7-2)26-27-18(12(3)28)22(29)25-19-16(31-5)11-10-15(30-4)17(19)24/h8-11,18H,6-7H2,1-5H3,(H,25,29)"
000095714f0f,"InChI=1S/C25H30ClN3O2/c1-17-4-9-23

In [43]:
submission_merged_df = submission_df.merge(baseline_059_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(56358, 4)
0.3337817359865405


In [44]:
submission_merged_df = submission_df.merge(baseline_056_df, on="image_id")
submission_merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(submission_merged_df[["InChI_x", "InChI_y"]].values)
]
print(submission_merged_df.query("InChI_x != InChI_y").shape)
print(submission_merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(40045, 4)
0.3170990534661381
