In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [9]:
GCP_PROJECT = "dena-ai-training-28-gcp"


@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    url = urlparse(path)
    bucket_name = url.netloc
    storage_client = gcs.Client(project=GCP_PROJECT)
    bucket = storage_client.get_bucket(bucket_name)
    blob = gcs.Blob(url.path[1:], bucket)
    return blob.download_as_string()


def load_prediction(path: str):
    if path.split("/")[-2] == "kf-bms-candidates-v2":
        model = path.split("/")[-3]
    else:
        model = path.split("/")[-2]
    df = pd.read_csv(io.BytesIO(download_from_gcs(path)))\
        .assign(model=model, filename=path.split("/")[-1])\
        .query("is_valid | image_id.isin(@NO_VALID_IMAGE_IDs)", engine="python")
    return df

In [6]:
n_valid_InChIs = pd.read_csv(io.BytesIO(download_from_gcs("gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv")))
NO_VALID_IMAGE_IDs = n_valid_InChIs.query("n_valid_InChIs == 0").image_id
len(NO_VALID_IMAGE_IDs)

5152

In [15]:
baseline_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB059+kf0527.csv"
)))

# KF side

In [7]:
TEST_FILENAMES = [
    "test_kf_0523.csv",
    "test_kf_0525.csv",
    "test_kf_0527.csv",
    "test_yokoo_0527.csv",
    "test_camaro_0525.csv",
]
KF_MODELS = [
    "1109_vtnt_bert_512-1024-denoise-5",
    "1113_swin_large_bert_384",
]
KF_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/{model}/kf-bms-candidates-v2/{filename}"
    for model in KF_MODELS
    for filename in TEST_FILENAMES
]

In [10]:
with mp.Pool() as pool:
    total = len(KF_TEST_CSVs)
    iterator = pool.imap_unordered(load_prediction, KF_TEST_CSVs)
    kf_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
kf_df = kf_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(kf_df.head(1))
display(kf_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename
0,007706c43e8d,InChI=1S/C16H32O6/c1-5-17-11-13-21-9-3-7-19-15...,,False,0.192261,1113_swin_large_bert_384,test_yokoo_0527.csv


model
1109_vtnt_bert_512-1024-denoise-5    5004678
1113_swin_large_bert_384             5004678
Name: image_id, dtype: int64

In [17]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
kf_ensembled_df = kf_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()
kf_ensembled_df = kf_ensembled_df.merge(baseline_df, on="image_id")
kf_ensembled_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(kf_ensembled_df[["InChI_x", "InChI_y"]].values)
]
print(len(kf_ensembled_df))
print(kf_ensembled_df.query("InChI_x != InChI_y").shape)
print(kf_ensembled_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


1616107
(135514, 8)
0.3325175870162062


# Lyakaap side

In [11]:
LYAKAAP_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/yokoo/v52/test_camaro_0525.csv
""".strip().split()

In [12]:
with mp.Pool() as pool:
    total = len(LYAKAAP_TEST_CSVs)
    iterator = pool.imap_unordered(load_prediction, LYAKAAP_TEST_CSVs)
    lyakaap_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
lyakaap_df = lyakaap_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(lyakaap_df.head(1))
display(lyakaap_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename
0,007706c43e8d,InChI=1S/C16H32O6/c1-5-17-11-13-21-9-3-7-19-15...,,False,0.044922,v52,test_yokoo_0527.csv


model
v52    5004678
Name: image_id, dtype: int64

In [18]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
lyakaap_ensembled_df = lyakaap_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()
lyakaap_ensembled_df = lyakaap_ensembled_df.merge(baseline_df, on="image_id")
lyakaap_ensembled_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(lyakaap_ensembled_df[["InChI_x", "InChI_y"]].values)
]
print(len(lyakaap_ensembled_df))
print(lyakaap_ensembled_df.query("InChI_x != InChI_y").shape)
print(lyakaap_ensembled_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1616107.0), HTML(value='')))


1616107
(129105, 8)
0.3429878095942905


# Camaro side

In [13]:
CAMARO_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0527.csv
""".strip().split()

In [14]:
with mp.Pool() as pool:
    total = len(CAMARO_TEST_CSVs)
    iterator = pool.imap_unordered(load_prediction, CAMARO_TEST_CSVs)
    camaro_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
camaro_df = camaro_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(camaro_df.head(1))
display(camaro_df.groupby("model").image_id.count())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename
0,007706c43e8d,InChI=1S/C16H32O6/c1-5-17-11-13-21-9-3-7-19-15...,,False,0.18025,0.521818,exp084,test_yokoo_0527.csv


model
exp084    4973760
Name: image_id, dtype: int64

In [19]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    focal_score=True,
)
camaro_ensembled_df = camaro_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()
camaro_ensembled_df = camaro_ensembled_df.merge(baseline_df, on="image_id")
camaro_ensembled_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(camaro_ensembled_df[["InChI_x", "InChI_y"]].values)
]
print(len(camaro_ensembled_df))
print(camaro_ensembled_df.query("InChI_x != InChI_y").shape)
print(camaro_ensembled_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1615331.0), HTML(value='')))


1615331
(37696, 9)
0.18372395502841213


# Single model

In [7]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    focal_score=True,
)
camaro_ensembled_df = camaro_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby("image_id").first().reset_index()

In [8]:
baseline_df = pd.read_csv(io.BytesIO(download_from_gcs(
    "gs://kfujikawa-kaggle-bms-molecular-generation/submissions/LB059+kf0527.csv"
)))

In [11]:
merged_df = camaro_ensembled_df.merge(baseline_df, on="image_id")
merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(merged_df[["InChI_x", "InChI_y"]].values)
]
print(merged_df.query("InChI_x != InChI_y").shape)
print(merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1615577.0), HTML(value='')))


(33625, 9)
0.21922693873458213


In [15]:
camaro_df.image_id.nunique()

1615577