In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [2]:
# Project handed to every gcs.Client created by download_from_gcs.
GCP_PROJECT = "dena-ai-training-28-gcp"


@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    """Return the contents of the object addressed by a ``gs://bucket/key`` URL.

    The function is decorated with ``retry(stop_max_attempt_number=3)``.

    Args:
        path: Object URL. Its netloc is used as the bucket name and its URL path,
            minus the leading character, as the blob name.

    Returns:
        The value returned by ``Blob.download_as_string()`` for that object.
    """
    parsed = urlparse(path)
    # A new client is built on every call, so nothing is shared between callers.
    client = gcs.Client(project=GCP_PROJECT)
    target_bucket = client.get_bucket(parsed.netloc)
    # NOTE(review): newer google-cloud-storage releases appear to prefer
    # download_as_bytes() over download_as_string() - confirm against the
    # installed version before switching.
    return gcs.Blob(parsed.path[1:], target_bucket).download_as_string()


def load_prediction(path: str):
    """Download one candidate CSV and keep only the rows used for blending.

    The model name is the file's parent directory, unless that directory is
    ``kf-bms-candidates-v2``, in which case the grandparent directory is used.

    Args:
        path: URL of the CSV; forwarded unchanged to ``download_from_gcs``.

    Returns:
        DataFrame with ``model`` and ``filename`` columns added, restricted to
        rows where ``is_valid`` is true or whose ``image_id`` appears in the
        module-level ``NO_VALID_IMAGE_IDs``.
    """
    parts = path.split("/")
    filename, parent_dir = parts[-1], parts[-2]
    model = parts[-3] if parent_dir == "kf-bms-candidates-v2" else parent_dir

    raw_csv = io.BytesIO(download_from_gcs(path))
    predictions = pd.read_csv(raw_csv).assign(model=model, filename=filename)
    # NO_VALID_IMAGE_IDs is resolved by name when the query runs, so it must be
    # defined at module level before this function is called.
    return predictions.query("is_valid | image_id.isin(@NO_VALID_IMAGE_IDs)", engine="python")

In [15]:
# Reference submission; the blended result is compared against it further down.
baseline_df = pd.read_csv("submission_LB059.csv")

# Table with one n_valid_InChIs value per image_id (presumably the number of
# valid candidates per test image - TODO confirm against the producer).
n_valid_InChIs = pd.read_csv(
    io.BytesIO(
        download_from_gcs(
            "gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv"
        )
    )
)

# Images whose n_valid_InChIs is zero; load_prediction keeps rows for these
# images even when is_valid is false.
NO_VALID_IMAGE_IDs = n_valid_InChIs.loc[n_valid_InChIs["n_valid_InChIs"] == 0, "image_id"]
len(NO_VALID_IMAGE_IDs)

5152

# Load predictions

In [4]:
# Candidate files that exist once per KF model directory.
TEST_FILENAMES = """
test_kf_0523.csv
test_kf_0525.csv
test_kf_0527.csv
test_yokoo_0527.csv
test_camaro_0525.csv
test_yokoo_0531.csv
test_kf_0531_renormed.csv
test_camaro_old_submissions.csv
test_kf_0531.csv
test_camaro_0531.csv
test_yokoo_0601.csv
""".split()

# KF model directories to read candidates from.
KF_MODELS = """
1109_vtnt_bert_512-1024-denoise-5
1113_swin_large_bert_384
1124_swin_large_bert_384_pil_pseudo
1126_swin_large_bert_384_pil_pseudo_no-denoise
1127_vtnt_bert_512-1024_pseudo_no-denoise
""".split()

# Every (model, file) combination, model-major: all files of the first model,
# then all files of the second, and so on.
KF_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/{kf_model}/kf-bms-candidates-v2/{csv_name}"
    for kf_model in KF_MODELS
    for csv_name in TEST_FILENAMES
]

In [7]:
# Download every KF (model, file) CSV in worker processes. imap_unordered
# yields frames in completion order, not in KF_TEST_CSVs order.
with mp.Pool() as pool:
    iterator = pool.imap_unordered(load_prediction, KF_TEST_CSVs)
    total = len(KF_TEST_CSVs)
    kf_df = pd.concat(
        list(tqdm(iterator, total=total)),
        ignore_index=True,
    )

# Keep a single row per (model, image_id, InChI) combination.
kf_df = kf_df.drop_duplicates(subset=["model", "image_id", "InChI"])

display(kf_df.head(1))
display(kf_df.groupby("model")["image_id"].count())
# Optional per-file summary, left disabled:
# with pd.option_context("display.float_format", '{:.4f}'.format, "display.max_columns", None):
#     display(kf_df.groupby(["filename", "model"]).describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=55.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename
0,00073e401fa1,InChI=1S/C20H27NO9S/c1-12(22)26-11-14-16-18(28...,,True,0.04187,1109_vtnt_bert_512-1024-denoise-5,test_camaro_0531.csv


model
1109_vtnt_bert_512-1024-denoise-5                 5811688
1113_swin_large_bert_384                          5811688
1124_swin_large_bert_384_pil_pseudo               5811688
1126_swin_large_bert_384_pil_pseudo_no-denoise    5811688
1127_vtnt_bert_512-1024_pseudo_no-denoise         5811688
Name: image_id, dtype: int64

In [8]:
# CSVs read for the Lyakaap predictions. They all sit under yokoo/v52/, so
# load_prediction tags them with model "v52".
LYAKAAP_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/{csv_name}"
    for csv_name in (
        "test_yokoo_0527.csv",
        "test_kf_0523.csv",
        "test_kf_0525.csv",
        "test_kf_0527.csv",
        "test_camaro_0525.csv",
        "test_camaro_old_submissions.csv",
        "test_kf_0531_renormed.csv",
        "test_yokoo_0531.csv",
        "test_kf_0531.csv",
        "test_camaro_0531.csv",
        "test_yokoo_0601.csv",
    )
]

In [9]:
# Download the Lyakaap CSVs in worker processes; frames come back in
# completion order because imap_unordered is used.
with mp.Pool() as pool:
    total = len(LYAKAAP_TEST_CSVs)
    iterator = pool.imap_unordered(load_prediction, LYAKAAP_TEST_CSVs)
    progress = tqdm(iterator, total=total)
    lyakaap_df = pd.concat(list(progress), ignore_index=True)

# Keep a single row per (model, image_id, InChI) combination.
lyakaap_df = lyakaap_df.drop_duplicates(subset=["model", "image_id", "InChI"])

display(lyakaap_df.head(1))
display(lyakaap_df.groupby("model")["image_id"].count())
# Optional per-file summary, left disabled:
# with pd.option_context("display.float_format", '{:.4f}'.format, "display.max_columns", None):
#     display(lyakaap_df.groupby(["filename", "model"]).describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename
0,007706c43e8d,InChI=1S/C16H32O6/c1-5-17-11-13-21-9-3-7-19-15...,,False,0.044922,v52,test_yokoo_0527.csv


model
v52    5811688
Name: image_id, dtype: int64

Unnamed: 0_level_0,filename,test_camaro_0525.csv,test_camaro_0531.csv,test_camaro_old_submissions.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0527.csv,test_kf_0531.csv,test_kf_0531_renormed.csv,test_yokoo_0527.csv,test_yokoo_0531.csv,test_yokoo_0601.csv
Unnamed: 0_level_1,model,v52,v52,v52,v52,v52,v52,v52,v52,v52,v52,v52
levenshtein,count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
levenshtein,mean,,,,,,,,,,,
levenshtein,std,,,,,,,,,,,
levenshtein,min,,,,,,,,,,,
levenshtein,25%,,,,,,,,,,,
levenshtein,50%,,,,,,,,,,,
levenshtein,75%,,,,,,,,,,,
levenshtein,max,,,,,,,,,,,
normed_score,count,30694.0,11367.0,42624.0,3753511.0,304700.0,906933.0,24209.0,64239.0,8605.0,412767.0,252039.0
normed_score,mean,0.2172,0.1696,0.2346,0.086,0.2265,0.2112,0.1658,0.1721,0.0355,0.1251,0.0881


# Camaro

In [10]:
# CSVs read for the Camaro predictions. They all sit under camaro/exp084/, so
# load_prediction tags them with model "exp084".
CAMARO_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/{csv_name}"
    for csv_name in (
        "test_kf_0523.csv",
        "test_kf_0525.csv",
        "test_kf_0527.csv",
        "test_yokoo_0527.csv",
        "test_yokoo_0531.csv",
        "test_yokoo_0601.csv",
    )
]

In [11]:
# Download the Camaro CSVs in worker processes; frames come back in
# completion order because imap_unordered is used.
with mp.Pool() as pool:
    total = len(CAMARO_TEST_CSVs)
    iterator = pool.imap_unordered(load_prediction, CAMARO_TEST_CSVs)
    progress = tqdm(iterator, total=total)
    camaro_df = pd.concat(list(progress), ignore_index=True)

# Keep a single row per (model, image_id, InChI) combination.
camaro_df = camaro_df.drop_duplicates(subset=["model", "image_id", "InChI"])

display(camaro_df.head(1))
display(camaro_df.groupby("model")["image_id"].count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename
0,007706c43e8d,InChI=1S/C16H32O6/c1-5-17-11-13-21-9-3-7-19-15...,,False,0.18025,0.521818,exp084,test_yokoo_0527.csv


model
exp084    5639758
Name: image_id, dtype: int64

In [36]:
# Rank of each focal_score scaled by the total row count, giving values in (0, 1].
# The Blend section below recomputes this same column.
camaro_df["score_rank"] = camaro_df["focal_score"].rank().div(len(camaro_df))

# Blend

In [12]:
# Collapse the KF models into one row per (image_id, InChI) candidate by
# averaging normed_score and is_valid over the models that produced it.
kf_ensembled_df = (
    kf_df
    .groupby(["image_id", "InChI"])[["normed_score", "is_valid"]]
    .mean()
    .reset_index()
)

In [37]:
# Put the three sources on a common scale: each score is replaced by its rank
# within its own frame, divided by that frame's row count.
for _frame, _score_col in (
    (kf_ensembled_df, "normed_score"),
    (lyakaap_df, "normed_score"),
    (camaro_df, "focal_score"),
):
    _frame["score_rank"] = _frame[_score_col].rank() / len(_frame)

In [38]:
# Column -> ascending flag used to order candidates: rows are grouped by
# image_id, then the highest mean is_valid comes first, then the lowest mean
# score_rank.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    score_rank=True,
)

# Stack the candidates of the three sources into one long frame.
merged_df = pd.concat([
    kf_ensembled_df,
    lyakaap_df,
    camaro_df,
], ignore_index=True)

# Average score_rank and is_valid of every (image_id, InChI) candidate over the
# sources that proposed it.
merged_ensembled_df = merged_df.groupby(["image_id", "InChI"])[["score_rank", "is_valid"]].mean().reset_index()

# Keep the top-sorted row of each image. drop_duplicates retains that row as a
# whole, whereas GroupBy.first() takes the first non-null value of each column
# independently and could combine values from different candidates when a
# score_rank or is_valid is NaN.
merged_ensembled_df = (
    merged_ensembled_df
    .sort_values(
        by=list(sort_keys.keys()),
        ascending=list(sort_keys.values()),
    )
    .drop_duplicates(subset="image_id", keep="first")
    .reset_index(drop=True)
)

In [39]:
# Attach the baseline prediction of each image. Both sides have an "InChI"
# column, so the merge names them "InChI_x" (blend) and "InChI_y" (baseline).
merged_ensembled_df = merged_ensembled_df.merge(baseline_df, on="image_id")

# Edit distance between the blended and the baseline InChI, one value per image.
merged_ensembled_df["levenshtein"] = [
    Levenshtein.distance(blended_inchi, baseline_inchi)
    for blended_inchi, baseline_inchi in tqdm(
        zip(merged_ensembled_df["InChI_x"], merged_ensembled_df["InChI_y"]),
        total=len(merged_ensembled_df),
    )
]

# Shape of the rows where the blend differs from the baseline, then the mean distance.
print(merged_ensembled_df.query("InChI_x != InChI_y").shape)
print(merged_ensembled_df["levenshtein"].mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


(29393, 6)
0.2280115116140206


In [None]:
submission_df = merged_ensembled_df[["image_id", "InChI"]]
assert len(submission_df) == 1616107
submission_df.to_csv("submission_0705_1109+1113+084_0601.csv", index=False)
!head submission_0702_kyakaap+084_0601.csv
!wc submission_0702_kyakaap+084_0601.csv