In [1]:
import gc
import io
import multiprocessing as mp
from copy import deepcopy
from urllib.parse import urlparse

import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [3]:
# GCP project id passed as `project=` when the GCS helper functions below
# construct their `gcs.Client`.
GCP_PROJECT = "dena-ai-training-28-gcp"

In [2]:
@retry(stop_max_attempt_number=3)
def download_from_gcs(path: str):
    """Download the contents of a single GCS object.

    Args:
        path: Object URL; the URL's network location is used as the bucket
            name and the URL path, minus its leading character, as the
            object key (i.e. ``gs://<bucket>/<key>``).

    Returns:
        Whatever ``Blob.download_as_string()`` returns for the object
        (callers in this notebook wrap it in ``io.BytesIO``).

    The call is wrapped in ``retry(stop_max_attempt_number=3)``, so it is
    attempted at most three times before the error propagates.
    """
    parsed = urlparse(path)
    client = gcs.Client(project=GCP_PROJECT)
    target_bucket = client.get_bucket(parsed.netloc)
    # parsed.path begins with "/", which is not part of the object key.
    return gcs.Blob(parsed.path[1:], target_bucket).download_as_string()


@retry(stop_max_attempt_number=3)
def check_gcs_path_exists(path: str):
    """Report whether a GCS object exists.

    Args:
        path: Object URL; the URL's network location is used as the bucket
            name and the URL path, minus its leading character, as the
            object key (i.e. ``gs://<bucket>/<key>``).

    Returns:
        The result of ``Blob.exists()`` for that object.

    The call is wrapped in ``retry(stop_max_attempt_number=3)``, so it is
    attempted at most three times before the error propagates.
    """
    parsed = urlparse(path)
    client = gcs.Client(project=GCP_PROJECT)
    target_bucket = client.get_bucket(parsed.netloc)
    # parsed.path begins with "/", which is not part of the object key.
    return gcs.Blob(parsed.path[1:], target_bucket).exists()


def load_prediction(path: str):
    """Load one prediction CSV from GCS and tag it with its origin.

    Args:
        path: ``gs://`` URL of the CSV file.

    Returns:
        The parsed DataFrame with two extra columns: ``model`` (a directory
        name taken from the path) and ``filename`` (the last path segment).
    """
    segments = path.split("/")
    # When the file sits directly inside a "kf-bms-candidates-v2" directory,
    # the model name is the directory one level above it; otherwise it is
    # the file's immediate parent directory.
    model_pos = -3 if segments[-2] == "kf-bms-candidates-v2" else -2
    model = segments[model_pos]
    raw = download_from_gcs(path)
    return pd.read_csv(io.BytesIO(raw)).assign(model=model, filename=segments[-1])

# KF side (valid)

In [5]:
# Candidate CSV file names; each one is looked up under every model
# directory listed in KF_MODELS.
TEST_FILENAMES = [
    "test_kf_0523.csv",
    "test_kf_0525.csv",
    "test_kf_0527.csv",
    "test_yokoo_0527.csv",
    "test_camaro_0525.csv",
    "test_yokoo_0531.csv",
    "test_kf_0531_renormed.csv",
    "test_camaro_old_submissions.csv",
    "test_kf_0531.csv",
    "test_camaro_0531.csv",
    "test_yokoo_0601.csv",
]
# Model directory names; `load_prediction` records these in the `model` column.
KF_MODELS = [
    "1109_vtnt_bert_512-1024-denoise-5",
    "1113_swin_large_bert_384",
    "1124_swin_large_bert_384_pil_pseudo",
    "1126_swin_large_bert_384_pil_pseudo_no-denoise",
    "1127_vtnt_bert_512-1024_pseudo_no-denoise",
]
# Full GCS URL for every (model, filename) pair, ordered model-major:
# len(KF_MODELS) * len(TEST_FILENAMES) entries in total.
KF_TEST_CSVs = [
    f"gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/{model}/kf-bms-candidates-v2/{filename}"
    for model in KF_MODELS
    for filename in TEST_FILENAMES
]

## ファイル読み込み

In [6]:
# Table with (at least) an `image_id` column and an `n_valid_InChIs` column.
n_valid_InChIs = pd.read_csv(
    io.BytesIO(
        download_from_gcs(
            "gs://kfujikawa-kaggle-bms-molecular-generation/kfujikawa/kf-bms-candidates-v2/test_n_valid_InChIs.csv"
        )
    )
)
# image_ids whose n_valid_InChIs value is exactly zero.
no_valid_image_ids = n_valid_InChIs.loc[n_valid_InChIs["n_valid_InChIs"] == 0, "image_id"]
no_valid_image_ids

612        001ae62e2309
787        00223006fc38
1044       002ccff46b6b
1153       0030c708efb2
2342       0060a2168372
               ...     
1614124    ffafe4506237
1614241    ffb505af0ea2
1614293    ffb73130936d
1615402    ffe3693d97b6
1616067    fffebea99ab9
Name: image_id, Length: 5152, dtype: object

In [7]:
# Load every (model, file) prediction CSV, keeping a row when it is flagged
# `is_valid` or when its image is listed in `no_valid_image_ids`, then drop
# repeated (model, image_id, InChI) rows that show up across files.
kf_df = pd.concat(
    [
        load_prediction(csv_path).query(
            "is_valid | image_id.isin(@no_valid_image_ids)",
            engine="python",
        )
        for csv_path in tqdm(KF_TEST_CSVs)
    ],
    ignore_index=True,
).drop_duplicates(subset=["model", "image_id", "InChI"])

# Sanity checks: one sample row, row counts per model, and per-(file, model)
# summary statistics with all columns shown.
display(kf_df.head(1))
display(kf_df.groupby("model")["image_id"].count())
with pd.option_context(
    "display.float_format", '{:.4f}'.format,
    "display.max_columns", None,
):
    display(kf_df.groupby(["filename", "model"]).describe().T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=55.0), HTML(value='')))




Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename
0,224eae465d23,InChI=1S/C8H16O7/c1-14-3(2-9)7-5(11)4(10)6(12)...,,True,0.03335,1109_vtnt_bert_512-1024-denoise-5,test_kf_0523.csv


model
1109_vtnt_bert_512-1024-denoise-5                 5811688
1113_swin_large_bert_384                          5811688
1124_swin_large_bert_384_pil_pseudo               5811688
1126_swin_large_bert_384_pil_pseudo_no-denoise    5811688
1127_vtnt_bert_512-1024_pseudo_no-denoise         5811688
Name: image_id, dtype: int64

Unnamed: 0_level_0,filename,test_camaro_0525.csv,test_camaro_0525.csv,test_camaro_0525.csv,test_camaro_0525.csv,test_camaro_0525.csv,test_camaro_0531.csv,test_camaro_0531.csv,test_camaro_0531.csv,test_camaro_0531.csv,test_camaro_0531.csv,test_camaro_old_submissions.csv,test_camaro_old_submissions.csv,test_camaro_old_submissions.csv,test_camaro_old_submissions.csv,test_camaro_old_submissions.csv,test_kf_0523.csv,test_kf_0523.csv,test_kf_0523.csv,test_kf_0523.csv,test_kf_0523.csv,test_kf_0525.csv,test_kf_0525.csv,test_kf_0525.csv,test_kf_0525.csv,test_kf_0525.csv,test_kf_0527.csv,test_kf_0527.csv,test_kf_0527.csv,test_kf_0527.csv,test_kf_0527.csv,test_kf_0531.csv,test_kf_0531.csv,test_kf_0531.csv,test_kf_0531.csv,test_kf_0531.csv,test_kf_0531_renormed.csv,test_kf_0531_renormed.csv,test_kf_0531_renormed.csv,test_kf_0531_renormed.csv,test_kf_0531_renormed.csv,test_yokoo_0527.csv,test_yokoo_0527.csv,test_yokoo_0527.csv,test_yokoo_0527.csv,test_yokoo_0527.csv,test_yokoo_0531.csv,test_yokoo_0531.csv,test_yokoo_0531.csv,test_yokoo_0531.csv,test_yokoo_0531.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv,test_yokoo_0601.csv
Unnamed: 0_level_1,model,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-denoise,1127_vtnt_bert_512-1024_pseudo_no-denoise,1109_vtnt_bert_512-1024-denoise-5,1113_swin_large_bert_384,1124_swin_large_bert_384_pil_pseudo,1126_swin_large_bert_384_pil_pseudo_no-d
enoise,1127_vtnt_bert_512-1024_pseudo_no-denoise
levenshtein,count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
levenshtein,mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
levenshtein,std,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
levenshtein,min,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
levenshtein,25%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
levenshtein,50%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
levenshtein,75%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
levenshtein,max,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
normed_score,count,30918.0,30918.0,30918.0,30918.0,30918.0,11287.0,11287.0,11287.0,11287.0,11287.0,41433.0,41433.0,41433.0,41433.0,41433.0,3753518.0,3753518.0,3753518.0,3753518.0,3753518.0,304704.0,304704.0,304704.0,304704.0,304704.0,906933.0,907125.0,906933.0,906933.0,906933.0,23728.0,23728.0,23728.0,23728.0,23728.0,64564.0,64564.0,64564.0,64564.0,64564.0,8605.0,8413.0,8605.0,8605.0,8605.0,413959.0,413959.0,413959.0,413959.0,413959.0,252039.0,252039.0,252039.0,252039.0,252039.0
normed_score,mean,0.1975,0.2217,0.2246,0.2313,0.2104,0.1738,0.176,0.1801,0.1824,0.1826,0.2418,0.2507,0.2582,0.2583,0.253,0.0737,0.0802,0.0835,0.0847,0.0768,0.1751,0.2002,0.2103,0.2109,0.184,0.1679,0.1896,0.1988,0.1998,0.1771,0.1311,0.1174,0.1083,0.1084,0.1291,0.1458,0.1491,0.1574,0.1585,0.1517,0.1589,0.1388,0.1364,0.1341,0.1629,0.2342,0.2096,0.2109,0.2116,0.2436,0.0976,0.1014,0.1036,0.1036,0.0998


In [9]:
# Column -> ascending flag used to rank candidates within each image:
# rows with is_valid=True come first, then the smallest normed_score.
sort_keys = dict(
    image_id=True,
    is_valid=False,
    normed_score=True,
)
# NOTE(review): the original cell first computed
# `kf_df.groupby(["image_id", "InChI"]).mean()` and then immediately
# overwrote the result, so no averaging across models ever reached the
# output. That dead (and expensive) step is removed here; it also raises on
# recent pandas because of the string columns. If per-candidate averaging
# across models was intended, it has to feed the ranking below -- confirm.
#
# Keep the single top-ranked row per image. `drop_duplicates(keep="first")`
# is used rather than `groupby().first()` because the latter returns the
# first *non-null* value of each column independently and could therefore
# stitch together values coming from different rows.
kf_ensembled_df = (
    kf_df.sort_values(
        by=list(sort_keys.keys()),
        ascending=list(sort_keys.values()),
    )
    .drop_duplicates(subset=["image_id"], keep="first")
    .reset_index(drop=True)
)
kf_ensembled_df.head()

Unnamed: 0,image_id,InChI,levenshtein,is_valid,normed_score,model,filename
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...,,True,2.980232e-07,1124_swin_large_bert_384_pil_pseudo,test_kf_0523.csv
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...,,True,5.960464e-08,1126_swin_large_bert_384_pil_pseudo_no-denoise,test_kf_0523.csv
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...,,True,1.788139e-07,1126_swin_large_bert_384_pil_pseudo_no-denoise,test_kf_0523.csv
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1...",,True,1.788139e-07,1124_swin_large_bert_384_pil_pseudo,test_kf_0523.csv
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)...,,True,0.0458374,1127_vtnt_bert_512-1024_pseudo_no-denoise,test_kf_0523.csv


In [10]:
submission_df = test_ensembled_df[["image_id", "InChI"]]
assert len(submission_df) == 1616107
submission_df.to_csv("submission.csv", index=False)
!head submission.csv
!wc submission.csv

image_id,InChI
00000d2a601c,"InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-7(12-2)8-5-13-17-15-8/h5,7,12H,4H2,1-3H3"
00001f7fc849,"InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18-13)8-11-3-5-12(15)6-4-11/h3-6,10,16H,2,7-9H2,1H3,(H,17,18)"
000037687605,"InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)16(8-12)19-10-14-4-2-3-5-15(14)17/h2-8,19H,10H2,1H3"
00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-19(12)8-9-5-6-10(20-4)7-11(9)15/h5-7H,8,16H2,1-4H3"
00004df0fe53,"InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)6/h4-8H,2-3H2,1H3/t4-,5+,6+,7-,8-/m1/s1"
000085dab281,"InChI=1S/C20H38O/c1-20(2)18-16-14-12-10-8-6-4-3-5-7-9-11-13-15-17-19-21/h17,20H,3-16,18H2,1-2H3"
00008decfc8d,"InChI=1S/C15H26N2/c1-5-10-16-15(11-12(3)6-2)14-9-7-8-13(4)17-14/h7-9,12,15-16H,5-6,10-11H2,1-4H3"
00008e8fe68c,"InChI=1S/C22H25Cl2N3O6/c1-6-32-20-13(23)8-9-14(21(20)33-7-2)26-27-18(12(3)28)22(29)25-19-16(31-5)11-10-15(30-4)17(19)24/h8-11,18H,6-7H2,1-5H3,(H,25,29)"
000095714f0f,"InChI=1S/C25H30ClN3O2/c1-17-4-9-23