In [1]:
from pathlib import Path

import click
import numpy as np
import pandas as pd
import Levenshtein
from tqdm.auto import tqdm
from loguru import logger

In [2]:
%load_ext autoreload
%autoreload 2
from nncomp_molecule.preprocessors import normalize_inchi_batch, disable_rdlogger

In [3]:
# Output directory: every candidate CSV produced by this notebook is written
# (and later re-read) under this path.
OUTDIR = Path("/work/input/kfujikawa/kf-bms-candidates")

## Load training dataset

In [4]:
# Columns (and their order) kept in every exported valid/test candidate CSV.
OUT_COLUMNS = ["image_id", "InChI", "levenshtein", "is_valid"]

In [5]:
# Project helper from nncomp_molecule.preprocessors; presumably silences RDKit
# log output — confirm against its implementation.
disable_rdlogger()

# Registry of candidates exported so far. Each key is image_id immediately
# followed by the InChI string (no separator); every section below extends it.
candidate_ids = set()

# Training table indexed by image_id, used to look up ground-truth InChI.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
origin_df = pd.read_pickle("/work/input/kfujikawa/bms-preprocess-v2/train.pkl")
origin_df = origin_df.set_index("image_id")

## base: kf_0523

In [6]:
# Baseline candidate set (kf_0523): export it to OUTDIR if needed, then seed
# the global candidate registry and remember which images are validation images.
outfile = "{datatype}_kf_0523.csv"
valid_path = OUTDIR / outfile.format(datatype="valid")
test_path = OUTDIR / outfile.format(datatype="test")

# Rebuild the CSVs when the test file is missing; otherwise ask before overwriting.
if not test_path.exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    # The inputs live outside OUTDIR, so they are read by their absolute paths.
    # (Joining an absolute path onto OUTDIR would silently discard OUTDIR.)
    merged_df = pd.concat([
        pd.read_csv("/work/input/kfujikawa/kf-bms-candidates-0523/valid_candidates.csv").assign(has_label=True),
        pd.read_csv("/work/input/kfujikawa/kf-bms-candidates-0523/test_candidates.csv").assign(has_label=False),
    ], ignore_index=True)

    # Discard the ground-truth column and export the normalized string as InChI.
    merged_df.pop("InChI_GT")
    merged_df["InChI"] = merged_df.pop("normed_InChI")
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    valid_df.to_csv(valid_path, index=False)
    test_df.to_csv(test_path, index=False)

# Always reload from disk so the registry reflects what was actually saved,
# whether or not the export above ran.
valid_df = pd.read_csv(valid_path)
test_df = pd.read_csv(test_path)
merged_df = pd.concat([valid_df, test_df], ignore_index=True)
n_candidates_before = len(candidate_ids)
candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
# Take the ids from the validation split only; the merged frame also contains
# test images, which must not be reported as validation image ids.
valid_image_ids = valid_df.image_id.unique()
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

Overwrite? [y/N]: 

 n


2021-05-27 22:13:32.563 | INFO     | __main__:<module>:22 - Add candidates: 0 -> 11156854 (+11156854)


## camaro_0525

In [7]:
# camaro_0525: add camaro's exp084 predictions that are not already registered,
# scoring validation rows against the ground truth and flagging valid InChIs.
outfile = "{datatype}_camaro_0525.csv"
valid_path = OUTDIR / outfile.format(datatype="valid")
test_path = OUTDIR / outfile.format(datatype="test")

# Rebuild the CSVs when the test file is missing; otherwise ask before overwriting.
if not test_path.exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    # The inputs live outside OUTDIR, so they are read by their absolute paths.
    # (Joining an absolute path onto OUTDIR would silently discard OUTDIR.)
    merged_df = pd.concat([
        pd.read_csv("/work/input/kfujikawa/bms-camaro-predictions/exp084_val_norm_score.csv").assign(has_label=True),
        pd.read_csv("/work/input/kfujikawa/bms-camaro-predictions/exp084_test_norm_score.csv").assign(has_label=False),
    ], ignore_index=True)

    # Save valid image ids shared by the kf baseline and camaro.
    # Sorted so the file content is identical from run to run (set order is not).
    shared_ids_path = OUTDIR / "shared_valid_image_ids_kf_camaro.csv"
    logger.info(f"Save: {shared_ids_path}")
    shared_valid_image_ids_kf_camaro_df = pd.DataFrame(
        sorted(set(valid_image_ids) & set(merged_df.query("has_label").image_id)),
        columns=["image_id"],
    )
    shared_valid_image_ids_kf_camaro_df.to_csv(shared_ids_path, index=False)

    # Filter only new candidates
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    merged_df = merged_df.query("~candidate_id.isin(@candidate_ids)").reset_index(drop=True)

    # Calc levenshtein for labeled rows only. Rows are selected with a boolean
    # mask so the result does not depend on labeled rows being stored first.
    labeled_mask = merged_df.has_label.to_numpy(dtype=bool)
    predicted_inchis = merged_df.loc[labeled_mask, "InChI"].values
    ground_truth_inchis = origin_df.loc[
        merged_df.loc[labeled_mask, "image_id"].tolist(), "InChI"
    ].values
    if "levenshtein" not in merged_df.columns:
        # Unlabeled (test) rows keep NaN: there is no ground truth to compare with.
        merged_df["levenshtein"] = np.nan
    merged_df.loc[labeled_mask, "levenshtein"] = [
        Levenshtein.distance(x, y)
        for x, y in tqdm(zip(predicted_inchis, ground_truth_inchis), total=len(predicted_inchis))
    ]

    # Calc is_valid: a candidate counts as valid when normalization returns a value.
    merged_df["normed_InChI"] = normalize_inchi_batch(merged_df.InChI).values
    merged_df["is_valid"] = ~merged_df.normed_InChI.isna()

    # Save valid & test csvs
    valid_df, test_df = merged_df.query("has_label")[OUT_COLUMNS], merged_df.query("~has_label")[OUT_COLUMNS]
    logger.info(f"Save: {valid_path}")
    valid_df.to_csv(valid_path, index=False)
    logger.info(f"Save: {test_path}")
    test_df.to_csv(test_path, index=False)

# Always reload from disk so the registry reflects what was actually saved.
merged_df = pd.concat([
    pd.read_csv(valid_path),
    pd.read_csv(test_path),
], ignore_index=True)
n_candidates_before = len(candidate_ids)
candidate_ids |= set(merged_df.image_id.str.cat(merged_df.InChI))
logger.info(f"Add candidates: {n_candidates_before} -> {len(candidate_ids)} (+{len(candidate_ids) - n_candidates_before})")

Overwrite? [y/N]: 

 n


2021-05-27 22:14:51.068 | INFO     | __main__:<module>:46 - Add candidates: 11156854 -> 11322778 (+165924)


## kf_0525

In [8]:
# kf_0525: add the not-yet-registered candidates found in the beam=1 and beam=4
# prediction files under /work/output/9007_1109+1113.
outfile = "{datatype}_kf_0525.csv"
valid_csv = OUTDIR / outfile.format(datatype="valid")
test_csv = OUTDIR / outfile.format(datatype="test")

# Export when the test CSV is absent; if it exists, ask before overwriting.
if not test_csv.exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    # Inputs are absolute paths outside OUTDIR; has_label marks validation rows.
    prediction_frames = [
        pd.read_csv("/work/output/9007_1109+1113/valid_beam=1.csv").assign(has_label=True),
        pd.read_csv("/work/output/9007_1109+1113/valid_beam=4.csv").assign(has_label=True),
        pd.read_csv("/work/output/9007_1109+1113/test_beam=1.csv").assign(has_label=False),
        pd.read_csv("/work/output/9007_1109+1113/test_beam=4.csv").assign(has_label=False),
    ]
    merged_df = pd.concat(prediction_frames, ignore_index=True)
    # Export the normalized string as InChI and keep one row per (image, InChI).
    merged_df["InChI"] = merged_df.pop("normed_InChI")
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])

    # Keep only candidates that no earlier section has registered.
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    already_registered = merged_df.candidate_id.isin(candidate_ids)
    merged_df = merged_df[~already_registered].reset_index(drop=True)

    # Split by label and write the two CSVs.
    labeled = merged_df.has_label
    valid_df = merged_df.loc[labeled, OUT_COLUMNS]
    test_df = merged_df.loc[~labeled, OUT_COLUMNS]
    logger.info(f"Save: {valid_csv}")
    valid_df.to_csv(valid_csv, index=False)
    logger.info(f"Save: {test_csv}")
    test_df.to_csv(test_csv, index=False)

# Reload what is on disk and fold it into the candidate registry.
merged_df = pd.concat([pd.read_csv(valid_csv), pd.read_csv(test_csv)], ignore_index=True)
n_candidates_before = len(candidate_ids)
candidate_ids.update(merged_df.image_id.str.cat(merged_df.InChI))
n_candidates_after = len(candidate_ids)
logger.info(f"Add candidates: {n_candidates_before} -> {n_candidates_after} (+{n_candidates_after - n_candidates_before})")

Overwrite? [y/N]: 

 n


2021-05-27 22:15:06.812 | INFO     | __main__:<module>:30 - Add candidates: 11322778 -> 13147362 (+1824584)


## yokoo_0527

In [11]:
# yokoo_0527: add the not-yet-registered candidates found in the beam=1
# prediction files under /work/input/yokoo/v52.
outfile = "{datatype}_yokoo_0527.csv"
valid_csv = OUTDIR / outfile.format(datatype="valid")
test_csv = OUTDIR / outfile.format(datatype="test")

# Export when the test CSV is absent; if it exists, ask before overwriting.
if not test_csv.exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    # Inputs are absolute paths outside OUTDIR; has_label marks validation rows.
    labeled_part = pd.read_csv("/work/input/yokoo/v52/valid_beam=1.csv").assign(has_label=True)
    unlabeled_part = pd.read_csv("/work/input/yokoo/v52/test_beam=1.csv").assign(has_label=False)
    merged_df = pd.concat([labeled_part, unlabeled_part], ignore_index=True)
    # Keep one row per (image, InChI); the InChI column is used as provided.
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])

    # Keep only candidates that no earlier section has registered.
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    already_registered = merged_df.candidate_id.isin(candidate_ids)
    merged_df = merged_df[~already_registered].reset_index(drop=True)

    # Split by label and write the two CSVs.
    labeled = merged_df.has_label
    valid_df = merged_df.loc[labeled, OUT_COLUMNS]
    test_df = merged_df.loc[~labeled, OUT_COLUMNS]
    logger.info(f"Save: {valid_csv}")
    valid_df.to_csv(valid_csv, index=False)
    logger.info(f"Save: {test_csv}")
    test_df.to_csv(test_csv, index=False)

# Reload what is on disk and fold it into the candidate registry.
merged_df = pd.concat([pd.read_csv(valid_csv), pd.read_csv(test_csv)], ignore_index=True)
n_candidates_before = len(candidate_ids)
candidate_ids.update(merged_df.image_id.str.cat(merged_df.InChI))
n_candidates_after = len(candidate_ids)
logger.info(f"Add candidates: {n_candidates_before} -> {n_candidates_after} (+{n_candidates_after - n_candidates_before})")

2021-05-27 22:48:29.481 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-05-27 22:48:48.991 | INFO     | __main__:<module>:16 - Save: /work/input/kfujikawa/kf-bms-candidates/valid_yokoo_0527.csv
2021-05-27 22:48:49.026 | INFO     | __main__:<module>:18 - Save: /work/input/kfujikawa/kf-bms-candidates/test_yokoo_0527.csv
2021-05-27 22:48:49.272 | INFO     | __main__:<module>:27 - Add candidates: 13147362 -> 13168028 (+20666)


## kf_0527

In [13]:
# kf_0527: add the not-yet-registered candidates found in the beam=8
# prediction files under /work/output/9007_1109+1113.
outfile = "{datatype}_kf_0527.csv"
valid_csv = OUTDIR / outfile.format(datatype="valid")
test_csv = OUTDIR / outfile.format(datatype="test")

# Export when the test CSV is absent; if it exists, ask before overwriting.
if not test_csv.exists() or click.confirm("Overwrite?"):
    logger.info("Load prediction datasets")
    # Inputs are absolute paths outside OUTDIR; has_label marks validation rows.
    labeled_part = pd.read_csv("/work/output/9007_1109+1113/valid_beam=8.csv").assign(has_label=True)
    unlabeled_part = pd.read_csv("/work/output/9007_1109+1113/test_beam=8.csv").assign(has_label=False)
    merged_df = pd.concat([labeled_part, unlabeled_part], ignore_index=True)
    # Export the normalized string as InChI and keep one row per (image, InChI).
    merged_df["InChI"] = merged_df.pop("normed_InChI")
    merged_df = merged_df.drop_duplicates(subset=["image_id", "InChI"])

    # Keep only candidates that no earlier section has registered.
    merged_df["candidate_id"] = merged_df.image_id.str.cat(merged_df.InChI)
    already_registered = merged_df.candidate_id.isin(candidate_ids)
    merged_df = merged_df[~already_registered].reset_index(drop=True)

    # Split by label and write the two CSVs.
    labeled = merged_df.has_label
    valid_df = merged_df.loc[labeled, OUT_COLUMNS]
    test_df = merged_df.loc[~labeled, OUT_COLUMNS]
    logger.info(f"Save: {valid_csv}")
    valid_df.to_csv(valid_csv, index=False)
    logger.info(f"Save: {test_csv}")
    test_df.to_csv(test_csv, index=False)

# Reload what is on disk and fold it into the candidate registry.
merged_df = pd.concat([pd.read_csv(valid_csv), pd.read_csv(test_csv)], ignore_index=True)
n_candidates_before = len(candidate_ids)
candidate_ids.update(merged_df.image_id.str.cat(merged_df.InChI))
n_candidates_after = len(candidate_ids)
logger.info(f"Add candidates: {n_candidates_before} -> {n_candidates_after} (+{n_candidates_after - n_candidates_before})")

2021-05-28 10:42:51.497 | INFO     | __main__:<module>:3 - Load prediction datasets
2021-05-28 10:44:48.931 | INFO     | __main__:<module>:17 - Save: /work/input/kfujikawa/kf-bms-candidates/valid_kf_0527.csv
2021-05-28 10:44:54.192 | INFO     | __main__:<module>:19 - Save: /work/input/kfujikawa/kf-bms-candidates/test_kf_0527.csv
2021-05-28 10:45:42.556 | INFO     | __main__:<module>:28 - Add candidates: 13168028 -> 18869462 (+5701434)
