In [1]:
import gc
import io
import itertools
import multiprocessing as mp
from urllib.parse import urlparse

import numpy as np
import google.cloud.storage as gcs
import pandas as pd
import seaborn as sns
import Levenshtein
from retrying import retry
from tqdm.auto import tqdm

In [2]:
GCP_PROJECT = "dena-ai-training-28-gcp"
client = gcs.Client(project=GCP_PROJECT)
bucket = client.get_bucket("kfujikawa-kaggle-bms-molecular-generation")

# Pick the CSV objects up front, matching on `blob.name` (the object name that
# is also used below to derive `filename`) rather than `blob.path`, which is
# the URL-quoted API path. Filtering first also makes the progress bar total
# equal to the number of files that are actually downloaded.
csv_blobs = [
    blob
    for blob in bucket.list_blobs(prefix="submissions/", delimiter="/")
    if blob.name.endswith(".csv")
]

# One DataFrame per submission file, tagged with its file name and sorted by
# `image_id` so that rows line up positionally across submissions.
submissions = [
    pd.read_csv(io.BytesIO(blob.download_as_string())).assign(
        filename=blob.name.split("/")[-1]
    ).sort_values("image_id").reset_index(drop=True)
    for blob in tqdm(csv_blobs)
]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [3]:
def calc_levenshtein(ij):
    """Return the mean Levenshtein distance between two submissions.

    Parameters
    ----------
    ij : tuple of (int, int)
        Positions of the two frames in the module-level ``submissions`` list.

    Returns
    -------
    float
        Mean of ``Levenshtein.distance`` over the row-wise paired ``InChI``
        values of the two submissions.

    Raises
    ------
    ValueError
        If the two submissions do not contain the same ``image_id`` values in
        the same order, in which case a positional comparison would pair
        unrelated rows (or be silently truncated by ``zip``).
    """
    i, j = ij
    df1, df2 = submissions[i], submissions[j]
    # Rows are compared by position, so both frames must list the same images
    # in the same order; `array_equal` is also False when the lengths differ.
    if not np.array_equal(df1.image_id.values, df2.image_id.values):
        raise ValueError(
            f"submissions {i} and {j} are not aligned on image_id"
        )
    return np.mean([
        Levenshtein.distance(x, y)
        for x, y in zip(df1.InChI, df2.InChI)
    ])


# One label per submission, in the same order as `submissions`, so the
# positional indices produced by `itertools.combinations` map straight to
# names. Reading the first `filename` of each frame avoids concatenating every
# submission just to recover the file names.
model_names = np.array(
    [sub.filename.iloc[0] for sub in submissions], dtype=object
)

# Accumulate distances in a float array (NaN = not computed yet, 0 on the
# diagonal). A DataFrame created from index/columns alone has object dtype;
# building it from this array keeps the matrix numeric.
n_models = len(model_names)
lev_values = np.full((n_models, n_models), np.nan)
np.fill_diagonal(lev_values, 0.0)

# Every unordered pair of submissions is evaluated once and mirrored, since
# the distance is symmetric.
iterator = list(itertools.combinations(range(len(submissions)), 2))
mapfunc = map(calc_levenshtein, iterator)
for (x, y), lev in zip(iterator, tqdm(mapfunc, total=len(iterator))):
    lev_values[x, y] = lev
    lev_values[y, x] = lev

lev_matrix = pd.DataFrame(lev_values, index=model_names, columns=model_names)
lev_matrix

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=91.0), HTML(value='')))




Unnamed: 0,CV073_LB074.csv,Final_LB054weight_tuned.csv,LB054_LB054_2.5*camaro.csv,LB054_LB055+camaro_predictions.csv,LB055_LB056+camaro*1.5.csv,LB055_LB056+full_candidates.csv,LB056.csv,LB056_LB055+bugfixed.csv,LB057_LB056+safe_candidates.csv,LB058_LB056+kyakaap*1.5.csv,LB058_exp084+full_kyakaap.csv,LB059+kf0527.csv,LB063_camaro_only.csv,LB085_LB063+camaro_predictions.csv
CV073_LB074.csv,0.0,0.43495,0.452226,0.426708,0.355714,0.40937,0.313811,0.330938,0.353224,0.288297,0.330546,0.32524,0.438854,0.759658
Final_LB054weight_tuned.csv,0.43495,0.0,0.0377958,0.0329353,0.16327,0.0630132,0.182641,0.16414,0.153219,0.209287,0.212112,0.22471,0.327837,0.575847
LB054_LB054_2.5*camaro.csv,0.452226,0.0377958,0.0,0.0422317,0.178779,0.0757512,0.203321,0.182123,0.172074,0.230668,0.227263,0.238209,0.334067,0.575525
LB054_LB055+camaro_predictions.csv,0.426708,0.0329353,0.0422317,0.0,0.151987,0.034197,0.171949,0.151415,0.138566,0.199314,0.209271,0.221073,0.333284,0.584937
LB055_LB056+camaro*1.5.csv,0.355714,0.16327,0.178779,0.151987,0.0,0.152463,0.0712354,0.0448355,0.118782,0.102711,0.117128,0.130834,0.259685,0.50886
LB055_LB056+full_candidates.csv,0.40937,0.0630132,0.0757512,0.034197,0.152463,0.0,0.150093,0.128974,0.115351,0.178115,0.193719,0.205014,0.316056,0.596012
LB056.csv,0.313811,0.182641,0.203321,0.171949,0.0712354,0.150093,0.0,0.0299145,0.0609564,0.0345936,0.0893159,0.101467,0.248096,0.565287
LB056_LB055+bugfixed.csv,0.330938,0.16414,0.182123,0.151415,0.0448355,0.128974,0.0299145,0.0,0.085938,0.0630565,0.0911115,0.103791,0.234398,0.552166
LB057_LB056+safe_candidates.csv,0.353224,0.153219,0.172074,0.138566,0.118782,0.115351,0.0609564,0.085938,0.0,0.0918213,0.140059,0.151656,0.289954,0.593104
LB058_LB056+kyakaap*1.5.csv,0.288297,0.209287,0.230668,0.199314,0.102711,0.178115,0.0345936,0.0630565,0.0918213,0.0,0.101956,0.111811,0.262062,0.586799


In [5]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook
# (the execution counts skip In [4]); presumably it was loaded in a cell that
# is no longer shown -- confirm before relying on Restart & Run All.
df

Unnamed: 0,image_id,InChI
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1..."
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)...
...,...,...
1616102,ffffcdb2e39e,InChI=1S/C21H28N2O2/c1-15(2)14-25-19-10-8-17(9...
1616103,ffffcfddd770,InChI=1S/C24H28N2O5/c1-5-30-19-9-8-17(13-16(19...
1616104,ffffe4ab06b2,InChI=1S/C17H17NO3/c19-12-6-7-14-13(10-12)16-1...
1616105,ffffec4033ec,"InChI=1S/C12H14F3NO3S/c1-2-3-4-9-16-20(17,18)1..."


In [7]:
# Stack all submissions into a single long frame with a fresh 0..N-1 index;
# the `filename` column added at load time identifies each row's source file.
merged_df = pd.concat(submissions, ignore_index=True)

In [10]:
# Keep only the rows that came from the LB056.csv submission.
lb056_df = merged_df.query("filename == 'LB056.csv'")

In [11]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook;
# presumably it comes from a cell that is no longer shown -- confirm its
# origin before relying on Restart & Run All.
df

Unnamed: 0,image_id,InChI
0,00000d2a601c,InChI=1S/C10H14BrN5S/c1-6-10(11)9(16(3)14-6)4-...
1,00001f7fc849,InChI=1S/C14H18ClN3/c1-2-7-16-9-13-10-17-14(18...
2,000037687605,InChI=1S/C16H13BrN2O/c1-11(20)12-6-7-13(9-18)1...
3,00004b6d55b6,"InChI=1S/C14H19FN4O/c1-14(2,3)12-13(16)17-18-1..."
4,00004df0fe53,InChI=1S/C9H12O2/c1-4-5-2-6-7(3-5)11-9(10)8(4)...
...,...,...
1616102,ffffcdb2e39e,InChI=1S/C21H28N2O2/c1-15(2)14-25-19-10-8-17(9...
1616103,ffffcfddd770,InChI=1S/C24H28N2O5/c1-5-30-19-9-8-17(13-16(19...
1616104,ffffe4ab06b2,InChI=1S/C17H17NO3/c19-12-6-7-14-13(10-12)16-1...
1616105,ffffec4033ec,"InChI=1S/C12H14F3NO3S/c1-2-3-4-9-16-20(17,18)1..."


In [19]:
# Count the rows on which the LB056 submission and `df` disagree.
lb056_inchi = lb056_df.InChI.values
reference_inchi = df.InChI.values

# The comparison below is positional, so it is only meaningful when both
# arrays have the same number of rows. Assert that explicitly instead of
# computing a length whose value is discarded.
# NOTE(review): this also assumes both frames are in the same image_id order
# -- confirm against how `df` was loaded.
assert len(lb056_inchi) == len(reference_inchi), "row counts differ"

(lb056_inchi != reference_inchi).sum()

0