In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [2]:
# Competition tables: `train` and `test` are read with their header row,
# while the vocabulary file is read as header-less data.
train, test = (pd.read_csv(path) for path in ("data/train.csv", "data/test.csv"))
vocab = pd.read_csv("data/vocabulary.txt", header=None)

I used precomputed fingerprints from PubChem. To reproduce them, run `python download_data_from_pubchem.py`; note that the download takes some time.

In [3]:
# Precomputed PubChem fingerprints. The cells below join this table on its
# "SMILES" column and read its "fingerprint" column (a hex-encoded string).
fingerprints = pd.read_csv("pubchem_fingerprints.csv")

In [4]:
# Attach a fingerprint to every molecule via its SMILES string.
#
# The lookup table is de-duplicated on "SMILES" first (keeping the first
# occurrence): a repeated key would make the left merge emit extra rows, and the
# merged frames would then stop lining up row-for-row with `train` / `test`,
# which later cells rely on when they index through train_df.index and
# test_df.index. `validate` makes that one-fingerprint-per-SMILES assumption
# explicit and fails loudly if it is ever violated.
fingerprint_lookup = fingerprints.drop_duplicates(subset="SMILES")
train_df = train.merge(fingerprint_lookup, on="SMILES", how="left", validate="many_to_one")
test_df = test.merge(fingerprint_lookup, on="SMILES", how="left", validate="many_to_one")

# Molecules without a match keep a null fingerprint; report how many there are.
print(train_df.fingerprint.isnull().sum(), "train molecules have no associated fingerprint")
print(test_df.fingerprint.isnull().sum(), "test molecules have no associated fingerprint")

33 train molecules have no associated fingerprint
5 test molecules have no associated fingerprint


Only molecules with an available fingerprint can take part in the k-nearest-neighbours search, so I filter both the train and test data accordingly and compute the neighbours on the unpacked (bit-vector) fingerprints.

In [5]:
def to_bits(x):
    """Decode a hex-encoded fingerprint string into a flat array of bits.

    Parameters
    ----------
    x : str
        Fingerprint written as hexadecimal digits (two digits per byte).

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array of 0/1 values, eight per decoded byte, with the
        most significant bit of each byte first.

    Raises
    ------
    ValueError
        If ``x`` cannot be decoded as hexadecimal (wrong type or invalid
        digits). The message includes the offending value.
    """
    try:
        raw_bytes = bytes.fromhex(x)
    except (TypeError, ValueError) as e:
        # Fail with the offending value in the message instead of printing it
        # and then hitting an unrelated error on an unassigned result.
        raise ValueError(f"cannot decode fingerprint {x!r} as hex: {e}") from e

    return np.unpackbits(np.frombuffer(raw_bytes, dtype=np.uint8))


# Keep only the molecules that actually have a fingerprint; the rest cannot
# take part in the nearest-neighbour search.
train_df = train_df[train_df.fingerprint.notnull()]
test_df = test_df[test_df.fingerprint.notnull()]

# Decode every hex fingerprint into its bit vector and stack the vectors into
# one 2-D array per data set (one row per molecule, in data-frame order).
train_fingerprints = np.stack([to_bits(hex_string) for hex_string in train_df.fingerprint])
test_fingerprints = np.stack([to_bits(hex_string) for hex_string in test_df.fingerprint])

In [6]:
# Index the training fingerprints with a ball tree, then look up the 5 closest
# training rows for every test fingerprint. `neighbour_indices` holds row
# positions into train_fingerprints.
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nbrs.fit(train_fingerprints)
distances, neighbour_indices = nbrs.kneighbors(test_fingerprints)

In [7]:
# For each test molecule with a fingerprint, join the SENTENCE labels of its
# nearest training molecules with ";". Neighbour positions refer to rows of
# train_df, so they are translated to `train` labels through train_df.index.
train_sentences = train["SENTENCE"]
for test_label, neighbour_positions in zip(test_df.index, neighbour_indices):
    neighbour_labels = train_df.index[neighbour_positions]
    test.loc[test_label, "PREDICTIONS"] = ";".join(train_sentences.loc[neighbour_labels])

In [8]:
# Number of test rows that received no prediction from the neighbour search.
test["PREDICTIONS"].isna().sum()

5

We still need to fill in the predictions for the test molecules that have no fingerprint. For these we use the five most common scent labels from the train dataset.

In [9]:
# The five most frequent SENTENCE values in the training data, most common first.
train["SENTENCE"].value_counts().head(5)

odorless    57
mint        36
fruity      32
woody       28
oily        24
Name: SENTENCE, dtype: int64

In [10]:
# Fallback prediction: the five most frequent training SENTENCE values,
# most common first, joined with ";" like the neighbour-based predictions.
default_prediction = ";".join(train["SENTENCE"].value_counts().index[:5])

In [11]:
# Rows still lacking a prediction fall back to the default one.
test["PREDICTIONS"] = test["PREDICTIONS"].fillna(default_prediction)

In [12]:
# Write the submission without the row index. `index` is documented as a
# boolean, so pass False explicitly rather than relying on None being falsy.
test.to_csv("baseline_submission.csv", index=False)