# About This Notebook

This notebook is based on https://www.kaggle.com/konradb/model-train-efficientnet & https://www.kaggle.com/konradb/model-infer-efficientnet, with a final score of 8.90 achieved in the BMS competition.

# Import Libraries

In [None]:
import gc
import math
import os
import random
import re
import time
from logging import getLogger, INFO, FileHandler,  Formatter,  StreamHandler
from typing import Iterable

import cv2
import timm
import torch
import Levenshtein
import numpy as np
import pandas as pd
import torch.nn as nn
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.optim import Adam, SGD
import torchvision.models as models
from torch.nn.parameter import Parameter
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from albumentations import (
    Compose, OneOf, Normalize, Resize, RandomResizedCrop, RandomCrop, HorizontalFlip, VerticalFlip, 
    RandomBrightness, RandomContrast, RandomBrightnessContrast, Rotate, ShiftScaleRotate, Cutout, 
    IAAAdditiveGaussianNoise, Transpose, Blur
    )
from albumentations.pytorch import ToTensorV2
from albumentations import ImageOnlyTransform

In [None]:
# run on the GPU when CUDA is usable, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

# Read Input Data
> Import the train dataframe containing image IDs, InChI strings, their actual lengths and parsed sequences.

In [None]:
# read the train dataframe from the pickle file saved previously
# NOTE(review): unpickling can execute code embedded in the file, so only load
# pickles that were produced by a trusted step of this project
train_df = pd.read_pickle('../../data/train.pkl')

# Add File Paths
> Make the process of reading the input data more efficient by storing paths to files in the train dataframe.

In [None]:
def get_file_path(image_id: str) -> str:
    
    """
    Build the path of a train image from its ID.
    
    Train images sit three directory levels below the train folder; each level
    is named after one of the first three characters of the image ID.
    
    :param image_id: ID of the image
    :type  image_id: str
    :return:         path to image
    :rtype:          str
    """
    
    # the leading three characters select the nested sub-directories
    level_1, level_2, level_3 = image_id[0], image_id[1], image_id[2]
    
    return f'../../data/bms-molecular-translation/train/{level_1}/{level_2}/{level_3}/{image_id}.png'

In [None]:
# derive the on-disk location of every image from its ID
train_df['file_path'] = train_df['image_id'].map(get_file_path)

# persist the dataframe together with the new column
# NOTE(review): the index is written as an unnamed first column, so reading
# this file back yields an extra 'Unnamed: 0' column
train_df.to_csv('../../data/train_df.csv')

In [None]:
# load the dataframe (now including the 'file_path' column) back from the csv
# file written above
train_df = pd.read_csv('../../data/train_df.csv')

In [None]:
# restrict the data to the image IDs listed in the reduced dataset
# (roughly 300K images sub-selected by size 200-350, HxW)
small_dataset = pd.read_csv('../../data/bmssmalldataset/new_dataset.csv')
valid_ids = small_dataset['image_id']

# boolean mask marking the rows whose image ID is part of the reduced dataset
keep_mask = train_df['image_id'].isin(valid_ids)
train_df = train_df[keep_mask]
print(train_df.shape)

In [None]:
# renumber the rows 0..n-1 after filtering; since `drop` is not passed, the
# previous index is kept as a new 'index' column
train_df.reset_index(inplace=True)

In [None]:
# number of rows that go into the train split (80% of the data)
train_len = int(len(train_df) * 0.8)

# split by position: .iloc slices exclude the end point, so the row at
# position `train_len` lands only in the test split (label-based .loc slices
# include both end points, which would put that row in both splits)

# keep 20% as test dataset
test_df   = train_df.iloc[train_len:, :]

# keep 80% as train dataset
train_df  = train_df.iloc[:train_len, :]

# save as csv files
test_df.to_csv('../../data/test.csv')
train_df.to_csv('../../data/reduced_train.csv')

# Utilities
> This is a set of utility functions used throughout the computations.

In [None]:
def get_score(y_true: Iterable[str], y_pred: Iterable[str]) -> float:
    
    """
    Compute the mean Levenshtein distance between true and predicted labels.
    
    Both inputs are walked in lockstep: the i-th true label is compared with
    the i-th prediction and the per-pair distances are averaged.
    
    :param y_true: true InChI labels, one string per sample
    :type  y_true: Iterable[str]
    :param y_pred: predicted InChI labels, aligned with ``y_true``
    :type  y_pred: Iterable[str]
    :return:       mean Levenshtein distance over all (true, predicted) pairs
    :rtype:        float
    """
    
    # NOTE: zip stops at the shorter input, so surplus labels on either side
    # are silently left out of the average
    distances = [
        Levenshtein.distance(true, pred)
        for true, pred in zip(y_true, y_pred)
    ]
    
    # average over all pairs
    return np.mean(distances)

# Reload Predictions and Validation Labels for Each InChI Part

In [None]:
# load the saved predictions of the three InChI parts, one array per part
inchi_1_preds, inchi_2_preds, inchi_3_preds = map(
    np.load,
    (
        '../../data/inchi1-predictions.npy',
        '../../data/inchi2-predictions.npy',
        '../../data/inchi3-predictions.npy',
    ),
)

In [None]:
# load the saved validation labels of the three InChI parts, one array per part
inchi_1_valid_labels, inchi_2_valid_labels, inchi_3_valid_labels = map(
    np.load,
    (
        '../../data/inchi1-validations.npy',
        '../../data/inchi2-validations.npy',
        '../../data/inchi3-validations.npy',
    ),
)

# Build the Full InChI Strings by Joining the Three Parts

In [None]:
# get total InChI prediction by joining the three parts element-wise
# (assumes the loaded arrays hold strings; np.char.add is used because plain
# np.add has no loop for string arrays on NumPy 1.x and raises there)
interim_preds = np.char.add(inchi_1_preds, inchi_2_preds)
final_preds   = np.char.add(interim_preds, inchi_3_preds)

In [None]:
# get total InChI validation label by joining the three parts element-wise
# (assumes the loaded arrays hold strings; np.char.add is used because plain
# np.add has no loop for string arrays on NumPy 1.x and raises there)
interim_valid_labels = np.char.add(inchi_1_valid_labels, inchi_2_valid_labels)
final_valid_labels   = np.char.add(interim_valid_labels, inchi_3_valid_labels)

# Get Combined Mean Levenshtein Score

In [None]:
# mean Levenshtein distance between the full validation labels and the full
# predictions
total_ld_score = get_score(y_true=final_valid_labels, y_pred=final_preds)
print(total_ld_score)