# SUMMARY

This notebook normalizes model predictions by round-tripping each predicted InChI string through RDKit (`Chem.MolFromInchi` followed by `Chem.MolToInchi`). The path to the model predictions is specified in `orig_path`. To reproduce the submission, each model's predictions need to be normalized using the script below.

In [None]:
##### MODEL PREDICTIONS

# CSV file holding the model predictions to normalize; it is loaded with
# pd.read_csv in the "IMPORT PREDICTIONS" cell.
orig_path = '../input/bms-sub-v6/submission.csv'

# PREPARATIONS

In [1]:
##### LIBRARIES

!conda install -y -c rdkit rdkit

import pandas as pd
import Levenshtein
from tqdm import tqdm 
from pathlib import Path

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / done
Solving environment: \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | 

In [2]:
##### IMPORT PREDICTIONS

# Load the raw predictions and tag every row with a default 'error' event.
# The normalization script copies the first unread line of submission_orig.csv
# verbatim each time it starts, so a row copied that way keeps this default.
sub_df = pd.read_csv(orig_path).assign(event = 'error')

# local copy that normalize_inchis.py reads line by line
sub_df.to_csv('submission_orig.csv', index = False)

# preview, then the (rows, columns) shape as the cell result
display(sub_df.head())
sub_df.shape

Unnamed: 0,image_id,InChI,event
0,e5dca4c9bebd,InChI=1S/CH4O/c1-2-3/h4H,error
1,c2d86f19e139,InChI=1S/C3HCl3NS/c4-1-2(5)8-3(6)7-1,error
2,2f939a72af1e,InChI=1S/C5H5N/c1-2-4-6-5-3-1/h1-5H/i4D,error
3,d92f14b46849,"InChI=1S/C2F3NO2S/c3-2(4,5)9(6,7)1-6",error
4,e5b5c145b588,"InChI=1S/C2BrCl4NO/c3-1(4,5)2(6,7)8-9",error


(1616107, 3)

In [3]:
##### NORMALIZATION SCRIPT

'''Adapted https://www.kaggle.com/nofreewill/normalize-your-predictions'''

%%writefile normalize_inchis.py

# packages
from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from pathlib import Path

# normalization
def normalize_inchi(inchi):
    """Round-trip an InChI string through RDKit to obtain its normalized form.

    Parameters
    ----------
    inchi : str
        Predicted InChI string.

    Returns
    -------
    tuple
        ``(inchi_out, event)`` where ``event`` is one of

        * ``'valid'`` - ``Chem.MolFromInchi`` produced a molecule and
          ``inchi_out`` is ``Chem.MolToInchi`` of that molecule;
        * ``'none'``  - ``Chem.MolFromInchi`` returned ``None``; the input is
          returned unchanged;
        * ``'error'`` - the conversion raised an exception; the input is
          returned unchanged.
    """
    try:
        mol = Chem.MolFromInchi(inchi)
        if mol is None:
            return (inchi, 'none')
        return (Chem.MolToInchi(mol), 'valid')
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate and stop the script
    # instead of being recorded as an 'error' row.
    except Exception:
        return (inchi, 'error')

# paths
orig_path = Path('submission_orig.csv')
norm_path = Path('submission_norm.csv')

# Resume support: each line already present in the output file (header
# included) stands for one input line consumed by an earlier run.
N = norm_path.read_text().count('\n') if norm_path.exists() else 0
print(N, 'number of predictions already normalized')

# Context managers close both files even if the loop raises or is interrupted.
# The output is opened in append mode with line buffering (buffering = 1), so
# every finished row is flushed as soon as it is written and the line count
# above can tell a later run how far this one got.
with open(str(orig_path), 'r') as r, open(str(norm_path), 'a', buffering = 1) as w:

    # skip the input lines that were already handled
    for _ in range(N):
        r.readline()

    # Copy the next line through unchanged. On a fresh start this is the CSV
    # header; on a restart it is the first row missing from the output, which
    # is kept as-is (with the event value stored in the input file) instead of
    # being parsed again.
    line = r.readline()
    w.write(line)

    # loop through the remaining lines
    for line in tqdm(r):
        # rstrip removes only the newline, so a final line that lacks one does
        # not lose a real character
        splits   = line.rstrip('\n').split(',')
        image_id = splits[0]
        # the InChI may itself contain commas: rejoin every field between the
        # id and the trailing event column, then drop the CSV quotes
        inchi    = ','.join(splits[1:-1]).replace('"','')
        inchi_norm, inchi_event = normalize_inchi(inchi)
        w.write(f'{image_id},"{inchi_norm}","{inchi_event}"\n')

Writing normalize_inchis.py


# NORMALIZATION

In [4]:
##### RUN THE SCRIPT

# Re-run the script until it exits successfully (`&& break`). A crashed run
# leaves its finished rows in submission_norm.csv, and the script skips that
# many input lines on the next start, so each retry continues where the
# previous one stopped.
# NOTE(review): the loop retries on ANY non-zero exit status, not only on a
# segmentation fault - a failure that repeats on every start would loop forever.
!while [ 1 ]; do python normalize_inchis.py && break; done

0 number of predictions already normalized
212042it [02:14, 1477.66it/s]/bin/bash: line 1:  9529 Segmentation fault      (core dumped) python normalize_inchis.py
212165 number of predictions already normalized
90894it [01:03, 1404.06it/s]/bin/bash: line 1:  9531 Segmentation fault      (core dumped) python normalize_inchis.py
303201 number of predictions already normalized
316858it [04:01, 1247.24it/s]/bin/bash: line 1:  9533 Segmentation fault      (core dumped) python normalize_inchis.py
620118 number of predictions already normalized
84596it [01:09, 1205.33it/s]/bin/bash: line 1:  9535 Segmentation fault      (core dumped) python normalize_inchis.py
704825 number of predictions already normalized
34335it [00:28, 1189.45it/s]/bin/bash: line 1:  9537 Segmentation fault      (core dumped) python normalize_inchis.py
739261 number of predictions already normalized
46656it [00:39, 1138.14it/s]/bin/bash: line 1:  9539 Segmentation fault      (core dumped) python normalize_inchis

In [5]:
##### CHECK PREDICTIONS

# Tally how each prediction was handled by the normalization script:
#   valid - RDKit parsed the InChI and the re-generated string was written
#   none  - RDKit returned no molecule; the original string was kept
#   error - conversion raised, or the row was copied verbatim on a restart
norm_path   = Path('submission_norm.csv')
sub_norm_df = pd.read_csv(norm_path)
sub_norm_df.loc[:, 'event'].value_counts()

valid    1424052
none      192018
error         37
Name: event, dtype: int64

In [6]:
##### CHECK LEVENSHTEIN GAIN

# paths
# NOTE: this rebinds `orig_path` (the raw model output in the first cell) to
# the local copy written by the import cell.
orig_path = Path('submission_orig.csv')
norm_path = Path('submission_norm.csv')

# read preds
sub_df      = pd.read_csv(orig_path)
sub_norm_df = pd.read_csv(norm_path)

N = len(sub_df)

# rows are compared by position, so both files must line up one-to-one
assert len(sub_norm_df) == N, 'original and normalized submissions differ in length'

# compute distances
# Iterate over the two InChI columns directly: a per-row `.iloc[i][...]`
# lookup builds a full row Series for each of the N rows.
lev = 0
for inchi, inchi_norm in tqdm(zip(sub_df['InChI'], sub_norm_df['InChI']), total = N):
    lev += Levenshtein.distance(inchi, inchi_norm)

# mean edit distance between each original and normalized InChI string
print(lev/N)

100%|██████████| 1616107/1616107 [04:50<00:00, 5560.83it/s]

0.15851858818753956



