In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [2]:
# Project passed to the storage client for every download.
GCP_PROJECT = "dena-ai-training-28-gcp"


@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    """Fetch one Cloud Storage object and return its content.

    The call is wrapped by ``retry`` with ``stop_max_attempt_number=3``.

    Args:
        path: Object URL; the network-location part is used as the bucket
            name and the path part (without its leading character) as the
            object name, e.g. ``gs://<bucket>/<object>``.

    Returns:
        Whatever ``Blob.download_as_string()`` returns for the object.
    """
    parsed = urlparse(path)
    # A fresh client is built on every call, exactly as before.
    client = gcs.Client(project=GCP_PROJECT)
    # path[1:] drops the "/" that separates bucket and object name in the URL.
    target = gcs.Blob(parsed.path[1:], client.get_bucket(parsed.netloc))
    return target.download_as_string()


def load_prediction(path: str):
    """Load one prediction CSV and tag every row with its origin.

    Args:
        path: Location of the CSV, handed unchanged to ``download_from_gcs``.

    Returns:
        A DataFrame with the CSV's columns plus ``model`` and ``filename``.
        ``filename`` is the last path component. ``model`` is the parent
        directory name, or the grandparent directory name when the parent is
        ``kf-bms-candidates-v2``.
    """
    parts = path.split("/")
    # Files under "kf-bms-candidates-v2" sit one level deeper than the others.
    model = parts[-3] if parts[-2] == "kf-bms-candidates-v2" else parts[-2]
    frame = pd.read_csv(io.BytesIO(download_from_gcs(path)))
    return frame.assign(model=model, filename=parts[-1])

In [3]:
# Table stored in test_n_valid_InChIs.csv, downloaded from GCS.
n_valid_InChIs = pd.read_csv(
    io.BytesIO(
        download_from_gcs(
            "gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv"
        )
    )
)
# image_id of every row whose n_valid_InChIs column equals 0.
NO_VALID_IMAGE_IDs = n_valid_InChIs.loc[n_valid_InChIs["n_valid_InChIs"] == 0, "image_id"]
len(NO_VALID_IMAGE_IDs)

5152

# Camaro

In [4]:
# Prediction CSVs to ensemble, one GCS URL per line (whitespace-separated).
# NOTE(review): exp090/test_yokoo_0527.csv used to be listed twice, which only
# caused the same file to be downloaded and parsed a second time (the repeated
# rows are removed later by drop_duplicates). The repeat has been dropped.
# exp084 lists test_yokoo_0527/0531/0601, so exp090/test_yokoo_0531.csv was
# probably the intended entry -- TODO confirm that file exists and add it.
CAMARO_TEST_CSVs = """
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0531.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp084/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0523.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0525.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_kf_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0527.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp090/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845/test_candidates_0531_all_is_valid.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp0845/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103/test_candidates_0531_all_is_valid.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp103/test_yokoo_0601.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031/test_candidates_0531_all_is_valid.csv
gs://kfujikawa-kaggle-bms-molecular-generation/camaro/exp1031/test_yokoo_0601.csv
""".strip().split()

In [5]:
# Download and parse every listed CSV in worker processes. imap_unordered
# yields frames in completion order, so the row order of the concatenated
# frame is not fixed between runs.
total = len(CAMARO_TEST_CSVs)
with mp.Pool() as pool:
    iterator = pool.imap_unordered(load_prediction, CAMARO_TEST_CSVs)
    camaro_df = pd.concat(list(tqdm(iterator, total=total)), ignore_index=True)

# Keep one row per (model, image_id, InChI), whichever file it came from.
camaro_df = camaro_df.drop_duplicates(subset=["model", "image_id", "InChI"])

# Quick sanity views: one sample row, rows per model, per-file statistics.
display(camaro_df.head(1))
display(camaro_df.groupby("model")["image_id"].count())
with pd.option_context("display.max_columns", None, "display.float_format", '{:.4f}'.format):
    display(camaro_df.groupby(["filename", "model"]).describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=18.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,focal_score,ce_score,model,filename
0,001ae62e2309,InChI=1S/C11H11BrO4/c12-7-3-4-5(9(13)14)6-8(7)...,,False,0.167042,0.585231,exp090,test_yokoo_0527.csv


model
exp084     12878495
exp0845     5728274
exp090     12455847
exp103      5728274
exp1031     5728274
Name: image_id, dtype: int64

Unnamed: 0_level_0,filename,test_candidates_0531_all_is_valid.csv,test_candidates_0531_all_is_valid.csv,test_candidates_0531_all_is_valid.csv,test_kf_0523.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0525.csv,test_kf_0527.csv,test_kf_0527.csv,test_yokoo_0527.csv,test_yokoo_0527.csv,test_yokoo_0531.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv
Unnamed: 0_level_1,model,exp0845,exp103,exp1031,exp084,exp090,exp084,exp090,exp084,exp090,exp084,exp090,exp084,exp084,exp0845,exp090,exp103,exp1031
levenshtein,count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
levenshtein,mean,,,,,,,,,,,,,,,,,
levenshtein,std,,,,,,,,,,,,,,,,,
levenshtein,min,,,,,,,,,,,,,,,,,
levenshtein,25%,,,,,,,,,,,,,,,,,
levenshtein,50%,,,,,,,,,,,,,,,,,
levenshtein,75%,,,,,,,,,,,,,,,,,
levenshtein,max,,,,,,,,,,,,,,,,,
focal_score,count,5411088.0,5411088.0,5411088.0,9655806.0,9655806.0,1578538.0,1578538.0,885102.0,885102.0,17255.0,8566.0,413649.0,239405.0,239405.0,239405.0,239405.0,239405.0
focal_score,mean,0.0593,0.0567,0.0508,0.0805,0.1356,0.1093,0.1739,0.1039,0.1601,0.0666,0.0761,0.0885,0.0438,0.0444,0.0538,0.0573,0.0487


In [6]:
# Ensemble step: average focal_score and is_valid over all rows that share the
# same (image_id, InChI) pair.
camaro_ensembled_df = (
    camaro_df
    .groupby(["image_id", "InChI"])[["focal_score", "is_valid"]]
    .mean()
    .reset_index()
)

In [8]:
# Rank the candidates of each image: highest mean is_valid first, then lowest
# mean focal_score. Values are the `ascending` flag for each sort column.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    focal_score=True,
)
# Keep the complete top-ranked row per image. GroupBy.first() was used here
# before, but it returns the first *non-null value of each column*, so a NaN
# focal_score/is_valid in the top row would be filled in from a different
# candidate. drop_duplicates(keep="first") keeps the row intact; the selected
# image_id/InChI pairs are the same as before.
camaro_ensembled_df = (
    camaro_ensembled_df
    .sort_values(
        by=list(sort_keys.keys()),
        ascending=list(sort_keys.values()),
    )
    .drop_duplicates(subset="image_id", keep="first")
    .reset_index(drop=True)
)
len(camaro_ensembled_df)

1616107

In [9]:
submission_df = camaro_ensembled_df[["image_id", "InChI"]]
assert len(submission_df) == 1616107
submission_df.to_csv("submission_0703_camaro_only.csv", index=False)
!head submission_0703_camaro_only.csv
!wc submission_0703_camaro_only.csv

image_id,InChI
00000d2a601c,"InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-7(12-2)8-5-13-17-15-8/h5,7,12H,4H2,1-3H3"
00001f7fc849,"InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18-13)8-11-3-5-12(15)6-4-11/h3-6,10,16H,2,7-9H2,1H3,(H,17,18)"
000037687605,"InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)16(8-12)19-10-14-4-2-3-5-15(14)17/h2-8,19H,10H2,1H3"
00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-19(12)8-9-5-6-10(20-4)7-11(9)15/h5-7H,8,16H2,1-4H3"
00004df0fe53,"InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)6/h4-8H,2-3H2,1H3/t4-,5+,6-,7+,8+/m1/s1"
000085dab281,"InChI=1S/C20H38O/c1-20(2)18-16-14-12-10-8-6-4-3-5-7-9-11-13-15-17-19-21/h17,20H,3-16,18H2,1-2H3"
00008decfc8d,"InChI=1S/C15H26N2/c1-5-10-16-15(11-12(3)6-2)14-9-7-8-13(4)17-14/h7-9,12,15-16H,5-6,10-11H2,1-4H3"
00008e8fe68c,"InChI=1S/C23H27Cl2N3O6/c1-6-32-17-12-11-16(31-5)18(25)20(17)26-23(30)19(13(4)29)28-27-15-10-9-14(24)21(33-7-2)22(15)34-8-3/h9-12,19H,6-8H2,1-5H3,(H,26,30)"
000095714f0f,"InChI=1S/C25H30ClN3O2/c1-17-4-9

In [12]:
# Load an earlier submission from the working directory to compare against.
baseline_df = pd.read_csv("submission_059.csv")
baseline_df

Unnamed: 0,image_id,InChI
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1..."
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)...
...,...,...
1616102,ffffcdb2e39e,InChI=1S/C21H28N2O2/c1-15(2)14-25-19-10-8-17(9...
1616103,ffffcfddd770,InChI=1S/C24H28N2O5/c1-5-30-19-9-8-17(13-16(19...
1616104,ffffe4ab06b2,InChI=1S/C17H17NO3/c19-12-6-7-14-13(10-12)16-1...
1616105,ffffec4033ec,"InChI=1S/C12H14F3NO3S/c1-2-3-4-9-16-20(17,18)1..."


In [14]:
# Pair each new prediction (InChI_x) with the baseline one (InChI_y) by image;
# the default inner join keeps only image_ids present in both frames.
merged_df = pd.merge(submission_df, baseline_df, on="image_id")
# Shape of the subset where the two submissions disagree.
merged_df[merged_df["InChI_x"] != merged_df["InChI_y"]].shape

(333932, 3)

In [17]:
# Edit distance between the new and the baseline InChI of every image.
merged_df["levenshtein"] = [
    Levenshtein.distance(new_inchi, baseline_inchi)
    for new_inchi, baseline_inchi in zip(merged_df["InChI_x"], merged_df["InChI_y"])
]
# Average distance over all merged rows (identical pairs included).
merged_df["levenshtein"].mean()

3.260612075809337