In [1]:
from pathlib import Path

import click
import numpy as np
import pandas as pd
import Levenshtein
from tqdm.auto import tqdm
from loguru import logger

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project package are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Project-local preprocessing helpers. In the cells visible here neither name is
# called (disable_rdlogger only appears in a commented-out line).
from nncomp_molecule.preprocessors import normalize_inchi_batch, disable_rdlogger

## Load training dataset

In [3]:
# Directory that receives the de-duplicated candidate CSVs written by this notebook.
OUTDIR = Path("/work/input/kfujikawa/kf-bms-candidates-v2")

# Columns kept, in this order, when a candidate CSV is saved.
OUT_COLUMNS = ["image_id", "InChI", "levenshtein", "is_valid"]

# Keys (image_id concatenated with InChI) of every candidate registered so far;
# later cells use it to drop candidates that were already saved.
candidate_ids: set = set()

In [4]:
# NOTE: loading the preprocessed training frame is disabled; none of the cells
# visible in this notebook read `origin_df`.
# disable_rdlogger()
# origin_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl").set_index("image_id")

## Load: kf_0523, kf_0525, kf_0527, yokoo_0527, camaro_0525

In [5]:
# Candidate CSVs that are loaded with has_label=True.
VALID_CSVs = [
    "/work/input/kfujikawa/kf-bms-candidates/valid_kf_0523.csv",
    "/work/input/kfujikawa/kf-bms-candidates/valid_kf_0525.csv",
    "/work/input/kfujikawa/kf-bms-candidates/valid_kf_0527.csv",
    "/work/input/kfujikawa/kf-bms-candidates/valid_yokoo_0527.csv",
]
# Candidate CSVs that are loaded with has_label=False.
TEST_CSVs = [
    "/work/input/kfujikawa/kf-bms-candidates/test_kf_0523.csv",
    "/work/input/kfujikawa/kf-bms-candidates/test_kf_0525.csv",
    "/work/input/kfujikawa/kf-bms-candidates/test_kf_0527.csv",
    "/work/input/kfujikawa/kf-bms-candidates/test_yokoo_0527.csv",
    "/work/input/kfujikawa/kf-bms-candidates/test_camaro_0525.csv",
]


def _read_candidate_csvs(csv_paths, has_label):
    """Read each CSV and tag its rows with `has_label` and the source file name."""
    tagged_frames = []
    for csv_path in tqdm(csv_paths):
        source_name = csv_path.rsplit("/", 1)[-1]
        tagged_frames.append(
            pd.read_csv(csv_path).assign(has_label=has_label, filename=source_name)
        )
    return tagged_frames


# Labelled files are read first, so on duplicates their row is the one kept.
_first_batch_frames = _read_candidate_csvs(VALID_CSVs, True) + _read_candidate_csvs(TEST_CSVs, False)
merged_df = pd.concat(_first_batch_frames, ignore_index=True)
# One row per (image_id, InChI) pair; the first occurrence wins.
merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [6]:
def _summarize_candidates(candidates):
    """Aggregate candidates per image, then count images per (has_label, n_valid_InChIs).

    Stage 1 (per image_id): sum of `is_valid`, minimum `levenshtein`, max `has_label`.
    Stage 2 (per has_label / n_valid_InChIs): number of images and the mean of the
    per-image minimum levenshtein.
    """
    per_image = (
        candidates
        .groupby("image_id")
        .agg(
            n_valid_InChIs=("is_valid", "sum"),
            lowest_levenshtein=("levenshtein", "min"),
            has_label=("has_label", "max"),
        )
        .reset_index()
    )
    return per_image.groupby(["has_label", "n_valid_InChIs"]).agg(
        n_count=("image_id", "count"),
        lowest_levenshtein=("lowest_levenshtein", "mean"),
    )


# Same summary computed over all candidates and over the is_valid rows only.
summary_df = _summarize_candidates(merged_df)
valid_only_summary_df = _summarize_candidates(merged_df.query("is_valid"))
# Assignment aligns on the (has_label, n_valid_InChIs) index of summary_df.
summary_df["lowest_levenshtein (valid only)"] = valid_only_summary_df["lowest_levenshtein"]
summary_df

Unnamed: 0_level_0,Unnamed: 1_level_0,n_count,lowest_levenshtein,lowest_levenshtein (valid only)
has_label,n_valid_InChIs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,0,5152,,
False,1,286359,,
False,2,455685,,
False,3,335134,,
False,4,278219,,
False,5,115360,,
False,6,59810,,
False,7,22263,,
False,8,41002,,
False,9,8029,,


In [7]:
# Images for which at least one candidate row is flagged is_valid.
valid_image_ids = merged_df.query("is_valid").image_id.unique()
# Keep the is_valid rows, plus every row of images that have no is_valid row at all.
v2_merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
# Build the keys from the filtered frame on both sides. Pairing
# v2_merged_df.image_id with the unfiltered merged_df.InChI only produced the
# right keys through implicit index alignment inside str.cat.
candidate_ids |= set(v2_merged_df.image_id.str.cat(v2_merged_df.InChI))
print(len(candidate_ids))

# Lists the per-file groups; writing them to OUTDIR is currently disabled
# (kept below, commented out).
for filename, df in tqdm(v2_merged_df.groupby("filename")):
    print(filename)
#     df.pop("filename")
#     df = df.sort_values("image_id")
#     OUTDIR.mkdir(parents=True, exist_ok=True)
#     df.to_csv(OUTDIR / filename, index=False)

5751160


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9.0), HTML(value='')))

test_camaro_0525.csv
test_kf_0523.csv
test_kf_0525.csv
test_kf_0527.csv
test_yokoo_0527.csv
valid_kf_0523.csv
valid_kf_0525.csv
valid_kf_0527.csv
valid_yokoo_0527.csv



## yokoo 0531

In [9]:
outfile = "{datatype}_yokoo_0531.csv"
# Prediction CSVs merged into this batch, as (path, has_label) pairs, in load order.
# The paths are absolute and are read as-is: joining an absolute path onto OUTDIR
# with `/` discards OUTDIR, so that join had no effect.
prediction_sources = [
    # ("/work/input/yokoo/v54/valid_beam=1.csv", True),
    # ("/work/input/yokoo/v55/valid_beam=1.csv", True),
    ("/work/input/yokoo/v54/test_beam=1.csv", False),
    ("/work/input/yokoo/v55/test_beam=1.csv", False),
    ("/work/input/yokoo/v52/test_beam=32.csv", False),
    ("/work/input/yokoo/v54/test_beam=32.csv", False),
    ("/work/input/yokoo/v55/test_beam=32.csv", False),
]
# Regenerate when the test CSV is missing; otherwise ask interactively first.
if not (OUTDIR / outfile.format(datatype="test")).exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    merged_df = pd.concat(
        [pd.read_csv(path).assign(has_label=has_label) for path, has_label in prediction_sources],
        ignore_index=True,
    )
    # One row per (image_id, InChI) pair; the first occurrence wins.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])
    if "levenshtein" not in merged_df:
        # Inputs without a levenshtein column still need it for OUT_COLUMNS.
        merged_df["levenshtein"] = None

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)")
    merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
    merged_df = merged_df.reset_index(drop=True)

    # Save valid & test csvs (an empty split is not written, so an existing
    # file for that split is left untouched).
    OUTDIR.mkdir(parents=True, exist_ok=True)
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    if len(valid_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='valid')}")
        valid_df.to_csv(OUTDIR / outfile.format(datatype="valid"), index=False)
    if len(test_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='test')}")
        test_df.to_csv(OUTDIR / outfile.format(datatype="test"), index=False)

# Register the saved candidates of this batch. Only files that exist are read,
# because a split with no new rows is never written above.
saved_paths = [
    # OUTDIR / outfile.format(datatype="valid"),
    OUTDIR / outfile.format(datatype="test"),
]
saved_frames = [pd.read_csv(path) for path in saved_paths if path.exists()]
n_candidates_before = len(candidate_ids)
if saved_frames:
    merged_df = pd.concat(saved_frames, ignore_index=True)
    candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
else:
    logger.warning(f"No saved candidate CSV found for {outfile}; nothing registered")
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

Overwrite? [y/N]: 

 y


2021-05-31 16:37:02.082 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-05-31 16:37:19.501 | INFO     | __main__:<module>:29 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/test_yokoo_0531.csv
2021-05-31 16:37:23.995 | INFO     | __main__:<module>:38 - Add candidates: 5751160 -> 6165119 (+413959)


## camaro old_submissions

In [10]:
outfile = "{datatype}_camaro_old_submissions.csv"
# Prediction CSVs merged into this batch, as (path, has_label) pairs, in load order.
# The paths are absolute and are read as-is: joining an absolute path onto OUTDIR
# with `/` discards OUTDIR, so that join had no effect.
prediction_sources = [
    ("/work/input/camaro/submissions/exp072_cv0860_lb096.csv", False),
    ("/work/input/camaro/submissions/exp0842_cv0657_lb087.csv", False),
    ("/work/input/camaro/submissions/exp0843_cv0801_lb089.csv", False),
    ("/work/input/camaro/submissions/exp0844_cv0803_lb090.csv", False),
    ("/work/input/camaro/submissions/exp090_cv0763_lb097.csv", False),
]
# Regenerate when the test CSV is missing; otherwise ask interactively first.
if not (OUTDIR / outfile.format(datatype="test")).exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    merged_df = pd.concat(
        [pd.read_csv(path).assign(has_label=has_label) for path, has_label in prediction_sources],
        ignore_index=True,
    )
    # One row per (image_id, InChI) pair; the first occurrence wins.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])
    if "levenshtein" not in merged_df:
        # Inputs without a levenshtein column still need it for OUT_COLUMNS.
        merged_df["levenshtein"] = None

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)")
    merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
    merged_df = merged_df.reset_index(drop=True)

    # Save valid & test csvs (an empty split is not written, so an existing
    # file for that split is left untouched).
    OUTDIR.mkdir(parents=True, exist_ok=True)
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    if len(valid_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='valid')}")
        valid_df.to_csv(OUTDIR / outfile.format(datatype="valid"), index=False)
    if len(test_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='test')}")
        test_df.to_csv(OUTDIR / outfile.format(datatype="test"), index=False)

# Register the saved candidates of this batch. Only files that exist are read,
# because a split with no new rows is never written above.
saved_paths = [
    # OUTDIR / outfile.format(datatype="valid"),
    OUTDIR / outfile.format(datatype="test"),
]
saved_frames = [pd.read_csv(path) for path in saved_paths if path.exists()]
n_candidates_before = len(candidate_ids)
if saved_frames:
    merged_df = pd.concat(saved_frames, ignore_index=True)
    candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
else:
    logger.warning(f"No saved candidate CSV found for {outfile}; nothing registered")
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

Overwrite? [y/N]: 

 y


2021-05-31 16:37:34.618 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-05-31 16:38:04.100 | INFO     | __main__:<module>:27 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/test_camaro_old_submissions.csv
2021-05-31 16:38:04.563 | INFO     | __main__:<module>:36 - Add candidates: 6165119 -> 6206969 (+41850)


## kf 0531

In [16]:
outfile = "{datatype}_kf_0531.csv"
# Prediction CSVs merged into this batch, as (path, has_label) pairs, in load order
# (the order decides which row survives drop_duplicates).
# The paths are absolute and are read as-is: joining an absolute path onto OUTDIR
# with `/` discards OUTDIR, so that join had no effect.
prediction_sources = [
    ("/work/output/1124_swin_large_bert_384_pil_pseudo/valid_beam=1.csv", True),
    ("/work/output/1124_swin_large_bert_384_pil_pseudo/test_beam=1.csv", False),
    ("/work/output/1126_swin_large_bert_384_pil_pseudo_no-denoise/valid_beam=1.csv", True),
    ("/work/output/1126_swin_large_bert_384_pil_pseudo_no-denoise/test_beam=1.csv", False),
    ("/work/output/1127_vtnt_bert_512-1024_pseudo_no-denoise/valid_beam=1.csv", True),
    ("/work/output/1127_vtnt_bert_512-1024_pseudo_no-denoise/test_beam=1.csv", False),
]
# Regenerate when the test CSV is missing; otherwise ask interactively first.
if not (OUTDIR / outfile.format(datatype="test")).exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    merged_df = pd.concat(
        [pd.read_csv(path).assign(has_label=has_label) for path, has_label in prediction_sources],
        ignore_index=True,
    )
    # One row per (image_id, InChI) pair; the first occurrence wins.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])
    if "levenshtein" not in merged_df:
        # Inputs without a levenshtein column still need it for OUT_COLUMNS.
        merged_df["levenshtein"] = None

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)")
    merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
    merged_df = merged_df.reset_index(drop=True)

    # Save valid & test csvs (an empty split is not written, so an existing
    # file for that split is left untouched).
    OUTDIR.mkdir(parents=True, exist_ok=True)
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    if len(valid_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='valid')}")
        valid_df.to_csv(OUTDIR / outfile.format(datatype="valid"), index=False)
    if len(test_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='test')}")
        test_df.to_csv(OUTDIR / outfile.format(datatype="test"), index=False)

# Register the saved candidates of this batch (valid and test). Only files that
# exist are read, because a split with no new rows is never written above.
saved_paths = [
    OUTDIR / outfile.format(datatype="valid"),
    OUTDIR / outfile.format(datatype="test"),
]
saved_frames = [pd.read_csv(path) for path in saved_paths if path.exists()]
n_candidates_before = len(candidate_ids)
if saved_frames:
    merged_df = pd.concat(saved_frames, ignore_index=True)
    candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
else:
    logger.warning(f"No saved candidate CSV found for {outfile}; nothing registered")
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

2021-05-31 19:50:51.027 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-05-31 19:51:30.825 | INFO     | __main__:<module>:25 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/valid_kf_0531.csv
2021-05-31 19:51:30.866 | INFO     | __main__:<module>:28 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/test_kf_0531.csv
2021-05-31 19:51:31.184 | INFO     | __main__:<module>:37 - Add candidates: 6206969 -> 6235388 (+28419)


## camaro 0531

In [18]:
outfile = "{datatype}_camaro_0531.csv"
# Prediction CSVs merged into this batch, as (path, has_label) pairs, in load order.
# The paths are absolute and are read as-is: joining an absolute path onto OUTDIR
# with `/` discards OUTDIR, so that join had no effect.
prediction_sources = [
    ("/work/input/camaro/submissions/exp1031_cv0825_lb.csv", False),
]
# Regenerate when the test CSV is missing; otherwise ask interactively first.
if not (OUTDIR / outfile.format(datatype="test")).exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    merged_df = pd.concat(
        [pd.read_csv(path).assign(has_label=has_label) for path, has_label in prediction_sources],
        ignore_index=True,
    )
    # One row per (image_id, InChI) pair; the first occurrence wins.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])
    if "levenshtein" not in merged_df:
        # Inputs without a levenshtein column still need it for OUT_COLUMNS.
        merged_df["levenshtein"] = None

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)")
    merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
    merged_df = merged_df.reset_index(drop=True)

    # Save valid & test csvs (an empty split is not written, so an existing
    # file for that split is left untouched).
    OUTDIR.mkdir(parents=True, exist_ok=True)
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    if len(valid_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='valid')}")
        valid_df.to_csv(OUTDIR / outfile.format(datatype="valid"), index=False)
    if len(test_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='test')}")
        test_df.to_csv(OUTDIR / outfile.format(datatype="test"), index=False)

# Register the saved candidates of this batch. Only files that exist are read,
# because a split with no new rows is never written above.
saved_paths = [
    # OUTDIR / outfile.format(datatype="valid"),
    OUTDIR / outfile.format(datatype="test"),
]
saved_frames = [pd.read_csv(path) for path in saved_paths if path.exists()]
n_candidates_before = len(candidate_ids)
if saved_frames:
    merged_df = pd.concat(saved_frames, ignore_index=True)
    candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
else:
    logger.warning(f"No saved candidate CSV found for {outfile}; nothing registered")
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

2021-05-31 20:02:07.276 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-05-31 20:02:19.617 | INFO     | __main__:<module>:23 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/test_camaro_0531.csv
2021-05-31 20:02:19.766 | INFO     | __main__:<module>:32 - Add candidates: 6235388 -> 6246755 (+11367)


## yokoo 0601

In [20]:
outfile = "{datatype}_yokoo_0601.csv"
# Prediction CSVs merged into this batch, as (path, has_label) pairs, in load order.
# The paths are absolute and are read as-is: joining an absolute path onto OUTDIR
# with `/` discards OUTDIR, so that join had no effect.
prediction_sources = [
    ("/work/input/yokoo/v52/test_beam=16.csv", False),
    ("/work/input/yokoo/v54/test_beam=16.csv", False),
    ("/work/input/yokoo/v55/test_beam=16.csv", False),
]
# Regenerate when the test CSV is missing; otherwise ask interactively first.
if not (OUTDIR / outfile.format(datatype="test")).exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    merged_df = pd.concat(
        [pd.read_csv(path).assign(has_label=has_label) for path, has_label in prediction_sources],
        ignore_index=True,
    )
    # One row per (image_id, InChI) pair; the first occurrence wins.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])
    if "levenshtein" not in merged_df:
        # Inputs without a levenshtein column still need it for OUT_COLUMNS.
        merged_df["levenshtein"] = None

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)")
    merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
    merged_df = merged_df.reset_index(drop=True)

    # Save valid & test csvs (an empty split is not written, so an existing
    # file for that split is left untouched).
    OUTDIR.mkdir(parents=True, exist_ok=True)
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    if len(valid_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='valid')}")
        valid_df.to_csv(OUTDIR / outfile.format(datatype="valid"), index=False)
    if len(test_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='test')}")
        test_df.to_csv(OUTDIR / outfile.format(datatype="test"), index=False)

# Register the saved candidates of this batch. Only files that exist are read,
# because a split with no new rows is never written above.
saved_paths = [
    # OUTDIR / outfile.format(datatype="valid"),
    OUTDIR / outfile.format(datatype="test"),
]
saved_frames = [pd.read_csv(path) for path in saved_paths if path.exists()]
n_candidates_before = len(candidate_ids)
if saved_frames:
    merged_df = pd.concat(saved_frames, ignore_index=True)
    candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
else:
    logger.warning(f"No saved candidate CSV found for {outfile}; nothing registered")
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

2021-06-01 10:59:22.713 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-06-01 10:59:34.098 | INFO     | __main__:<module>:25 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/test_yokoo_0601.csv
2021-06-01 10:59:36.471 | INFO     | __main__:<module>:34 - Add candidates: 6246755 -> 6498794 (+252039)


## kf 0601

In [22]:
outfile = "{datatype}_kf_0601.csv"
# Prediction CSVs merged into this batch, as (path, has_label) pairs, in load order.
# The paths are absolute and are read as-is: joining an absolute path onto OUTDIR
# with `/` discards OUTDIR, so that join had no effect.
prediction_sources = [
    # ("/work/output/9008_1124+1127/valid_beam=1.csv", True),
    ("/work/output/9008_1124+1127/test_beam=1.csv", False),
]
# Regenerate when the test CSV is missing; otherwise ask interactively first.
if not (OUTDIR / outfile.format(datatype="test")).exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    merged_df = pd.concat(
        [pd.read_csv(path).assign(has_label=has_label) for path, has_label in prediction_sources],
        ignore_index=True,
    )
    # One row per (image_id, InChI) pair; the first occurrence wins.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])
    if "levenshtein" not in merged_df:
        # Inputs without a levenshtein column still need it for OUT_COLUMNS.
        merged_df["levenshtein"] = None

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)")
    merged_df = merged_df.query("is_valid | ~image_id.isin(@valid_image_ids)")
    merged_df = merged_df.reset_index(drop=True)

    # Save valid & test csvs (an empty split is not written, so an existing
    # file for that split is left untouched).
    OUTDIR.mkdir(parents=True, exist_ok=True)
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    if len(valid_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='valid')}")
        valid_df.to_csv(OUTDIR / outfile.format(datatype="valid"), index=False)
    if len(test_df) > 0:
        logger.info(f"Save: {OUTDIR / outfile.format(datatype='test')}")
        test_df.to_csv(OUTDIR / outfile.format(datatype="test"), index=False)

# Register the saved candidates of this batch. Only files that exist are read,
# because a split with no new rows is never written above.
saved_paths = [
    # OUTDIR / outfile.format(datatype="valid"),
    OUTDIR / outfile.format(datatype="test"),
]
saved_frames = [pd.read_csv(path) for path in saved_paths if path.exists()]
n_candidates_before = len(candidate_ids)
if saved_frames:
    merged_df = pd.concat(saved_frames, ignore_index=True)
    candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
else:
    logger.warning(f"No saved candidate CSV found for {outfile}; nothing registered")
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

2021-06-01 16:25:37.040 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-06-01 16:25:52.911 | INFO     | __main__:<module>:24 - Save: /work/input/kfujikawa/kf-bms-candidates-v2/test_kf_0601.csv
2021-06-01 16:25:52.958 | INFO     | __main__:<module>:33 - Add candidates: 6498794 -> 6501977 (+3183)
