In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [2]:
GCP_PROJECT = "dena-ai-training-28-gcp"


@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    url = urlparse(path)
    bucket_name = url.netloc
    storage_client = gcs.Client(project=GCP_PROJECT)
    bucket = storage_client.get_bucket(bucket_name)
    blob = gcs.Blob(url.path[1:], bucket)
    return blob.download_as_string()


def load_prediction(path: str):
    if path.split("/")[-2] == "kf-bms-candidates-v2":
        model = path.split("/")[-3]
    else:
        model = path.split("/")[-2]
    df = pd.read_csv(io.BytesIO(download_from_gcs(path)))\
        .assign(model=model, filename=path.split("/")[-1])
    return df

In [3]:
n_valid_InChIs = pd.read_csv(io.BytesIO(download_from_gcs("gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv")))
NO_VALID_IMAGE_IDs = n_valid_InChIs.query("n_valid_InChIs == 0").image_id
len(NO_VALID_IMAGE_IDs)

5152

# Camaro

In [4]:
CAMARO_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845/test_candidates_0531_all_is_valid.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103/test_candidates_0531_all_is_valid.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031/test_candidates_0531_all_is_valid.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031/test_yokoo_0601.csv
""".strip().split()

In [5]:
with mp.Pool() as pool:
    total = len(CAMARO_TEST_CSVs)
    iterator = pool.imap_unordered(load_prediction, CAMARO_TEST_CSVs)
    camaro_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)
camaro_df = camaro_df.drop_duplicates(subset=["model", "image_id", "InChI"])
display(camaro_df.head(1))
display(camaro_df.groupby("model").image_id.count())
with pd.option_context("display.float_format", '{:.4f}'.format, "display.max_columns", None):
    display(camaro_df.groupby(["filename", "model"]).describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename
0,001ae62e2309,InChI=1S/C11H11BrO4/c12-7-3-4-5(9(13)14)6-8(7)...,,False,0.167042,0.585231,exp090,test_yokoo_0527.csv


model
exp084     12878495
exp0845     5728274
exp090     12455847
exp103      5728274
exp1031     5728274
Name: image_id, dtype: int64

Unnamed: 0_level_0,filename,test_candidates_0531_all_is_valid.csv,test_candidates_0531_all_is_valid.csv,test_candidates_0531_all_is_valid.csv,test_kf_0523.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0525.csv,test_kf_0527.csv,test_kf_0527.csv,test_yokoo_0527.csv,test_yokoo_0527.csv,test_yokoo_0531.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv
Unnamed: 0_level_1,model,exp0845,exp103,exp1031,exp084,exp090,exp084,exp090,exp084,exp090,exp084,exp090,exp084,exp084,exp0845,exp090,exp103,exp1031
levenshtein,count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
levenshtein,mean,,,,,,,,,,,,,,,,,
levenshtein,std,,,,,,,,,,,,,,,,,
levenshtein,min,,,,,,,,,,,,,,,,,
levenshtein,25%,,,,,,,,,,,,,,,,,
levenshtein,50%,,,,,,,,,,,,,,,,,
levenshtein,75%,,,,,,,,,,,,,,,,,
levenshtein,max,,,,,,,,,,,,,,,,,
focal_score,count,5411088.0,5411088.0,5411088.0,9655806.0,9655806.0,1578538.0,1578538.0,885102.0,885102.0,17255.0,8566.0,413649.0,239405.0,239405.0,239405.0,239405.0,239405.0
focal_score,mean,0.0593,0.0567,0.0508,0.0805,0.1356,0.1093,0.1739,0.1039,0.1601,0.0666,0.0761,0.0885,0.0438,0.0444,0.0538,0.0573,0.0487


In [6]:
sort_keys = dict(
    image_id=True,
    is_valid=False,
    focal_score=True,
)
camaro_ensembled_df = camaro_df.sort_values(
    by=list(sort_keys.keys()),
    ascending=list(sort_keys.values()),
).groupby(["image_id", "model"]).first().reset_index()
len(camaro_ensembled_df)

8078945

In [7]:
baseline_df = pd.read_csv("submission_059.csv")
merged_df = camaro_ensembled_df.merge(baseline_df, on="image_id")
merged_df["levenshtein"] = [
    Levenshtein.distance(x, y)
    for x, y in tqdm(merged_df[["InChI_x", "InChI_y"]].values)
]
print(merged_df.query("InChI_x != InChI_y").shape)
print(merged_df.levenshtein.mean())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8078945.0), HTML(value='')))


(2629935, 9)
5.439711125648213


In [9]:
merged_df.groupby("model").levenshtein.agg(["mean", "count"])

Unnamed: 0_level_0,mean,count
model,Unnamed: 1_level_1,Unnamed: 2_level_1
exp084,0.302974,1616107
exp0845,8.857516,1615577
exp090,0.393134,1616107
exp103,8.832229,1615577
exp1031,8.816043,1615577


In [None]:
merged_df