In [1]:
# AIcrowd "Learning to Smell" challenge leaderboard:
# https://www.aicrowd.com/challenges/learning-to-smell/leaderboards

from google.colab import drive
# Mount Google Drive at /content/drive; the CSV and vocabulary files read by
# later cells live under this mount point. Only works inside Google Colab.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
# Folder on the mounted Drive that holds train.csv, test.csv and vocab.txt.
DATA_DIR = "/content/drive/My Drive/Colab Notebooks/smell-ident"

data_csv = pd.read_csv(DATA_DIR + "/train.csv")
data_test = pd.read_csv(DATA_DIR + "/test.csv")

# Raw model inputs (SMILES strings) and raw targets (comma-separated labels).
X_data = data_csv["SMILES"].values
y_data = data_csv["SENTENCE"].values

# Character vocabulary: every distinct character seen in the training SMILES.
X_all_text = "".join(X_data)
chars = sorted(set(X_all_text))

# NOTE(review): indices start at 0, and 0 is also the padding value passed to
# pad_sequences further down, so the first character in `chars` is
# indistinguishable from padding. Fixing it means shifting indices by one AND
# growing `vocab_size` by one in the cell that defines it.
chars_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_chars = {i: ch for ch, i in chars_to_idx.items()}

# Label vocabulary: one label per line; the line number is the class id.
class_labels = []
class_index = {}

with open(DATA_DIR + "/vocab.txt", "r") as lab_file:
  for label_id, line in enumerate(lab_file):
    # Strip only the line terminator. Slicing with [:-1] would cut off the
    # last character of a final line that has no trailing newline.
    label = line.rstrip("\n")
    class_index[label] = label_id
    class_labels.append(label)

In [4]:
def encode_labels(original_labels, label_index=None):
  """Multi-hot encode comma-separated label strings.

  Args:
    original_labels: iterable of strings, each a comma-separated list of
      label names (e.g. "fruity,sweet").
    label_index: optional mapping from label name to column index. Defaults
      to the notebook-level `class_index`.

  Returns:
    A float array with one row per input string and one column per entry in
    `label_index`; a cell is 1 when the row carries that label, else 0.

  Raises:
    KeyError: if a label is not present in `label_index`.
  """
  if label_index is None:
    label_index = class_index

  label_length = len(label_index)
  enc = []

  for row in original_labels:
    raw = np.zeros((label_length))
    for cls in row.split(','):
      raw[label_index[cls]] = 1
    enc.append(raw)

  return np.array(enc)

def prep_data(original_features, char_index=None):
  """Encode strings as arrays of character indices.

  Args:
    original_features: iterable of strings (SMILES in this notebook).
    char_index: optional mapping from character to integer index. Defaults
      to the notebook-level `chars_to_idx`.

  Returns:
    When all strings have the same length (including the single-string
    case), a 2-D array of shape (n_strings, length). Otherwise a 1-D object
    array whose elements are 1-D index arrays of differing lengths.

  Raises:
    KeyError: if a character is not present in `char_index`.
  """
  if char_index is None:
    char_index = chars_to_idx

  encoded = [np.array([char_index[c] for c in smile]) for smile in original_features]

  # Equal-length rows (or no rows) stack cleanly, same as before.
  if len({len(row) for row in encoded}) <= 1:
    return np.array(encoded)

  # Ragged rows: np.array() on them raises ValueError on NumPy >= 1.24, so
  # build the object array explicitly, one element per sequence.
  ragged = np.empty(len(encoded), dtype=object)
  for pos, row in enumerate(encoded):
    ragged[pos] = row
  return ragged


  

In [5]:
# Integer-encode the training SMILES and multi-hot encode their labels.
data = prep_data(X_data)
targets = encode_labels(y_data)

In [6]:
# Number of distinct characters, and the longest encoded training sequence.
vocab_size = len(chars_to_idx)
max_len = max(map(len, data))

In [32]:
# Right-pad every sequence with zeros to a fixed width of 150; longer
# sequences are truncated by pad_sequences.
input_data = tf.keras.preprocessing.sequence.pad_sequences(
  data,
  maxlen=150,
  padding="post",
  value=0,
)

In [33]:
# Sanity check: feature matrix shape, target matrix shape, one encoded target.
(
  input_data.shape,
  targets.shape,
  targets[1],
)

((4316, 150),
 (4316, 109),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.]))

In [34]:
embed_dim = 200  # size of each character embedding vector
lstm_out = 250   # units per direction in each recurrent (GRU) layer

# Embedding -> pooling -> two stacked bidirectional GRUs -> one sigmoid unit
# per label (independent per-label probabilities, trained with binary
# cross-entropy).
model = keras.models.Sequential([
  keras.layers.Embedding(vocab_size, embed_dim, input_length=input_data.shape[1]),

  # Currently disabled layers:
  #keras.layers.Conv1D(32, 3, activation='sigmoid', padding='same'),
  #keras.layers.Conv1D(64, 3, activation='sigmoid', padding='same'),
  keras.layers.MaxPooling1D(2),

  keras.layers.Bidirectional(
    keras.layers.GRU(lstm_out, dropout=0.2, recurrent_dropout=0, return_sequences=True)
  ),
  #keras.layers.Dropout(0.2),
  keras.layers.Bidirectional(
    keras.layers.GRU(lstm_out, dropout=0.2, recurrent_dropout=0)
  ),

  #keras.layers.Dropout(0.4),
  #keras.layers.Dense(100),
  #keras.layers.Dropout(0.2),
  keras.layers.Dense(len(class_labels), activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy')


In [35]:
# Hold out 20% of the samples for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
  input_data,
  targets,
  test_size=0.2,
  random_state=42,
)


In [37]:
# Train for 15 epochs, with a further 20% of the training split used for validation.
model.fit(
  x=X_train,
  y=y_train,
  epochs=15,
  batch_size=32,
  validation_split=0.2,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fd0573abdd8>

In [38]:
# Per-label probabilities for the training split and the held-out split.
train_predictions, test_predictions = (
  model.predict(split) for split in (X_train, X_test)
)

In [39]:
# Five highest-scoring labels for the first held-out sample, best first.
ranked = np.argsort(test_predictions[0])[::-1]
top_5 = ranked[:5]
for idx in top_5:
  print(class_labels[idx])

# Ground-truth labels of the same sample, for comparison.
res = np.where(y_test[0] == 1)

print("\n")
for idx in res[0]:
  print(class_labels[idx])


fruity
resinous
balsamic
berry
sweet


balsamic
cinnamon
fruity
powdery
sweet


In [40]:
# Spot-check one training molecule: print its SMILES, the model's five
# highest-scoring labels, and the true labels.
test_inp = X_data[112]
actual_op = y_data[112]
print(test_inp)

data_list = [test_inp]

enc_inp = prep_data(data_list)
# Pad to the width the model was trained on. A different width (this was
# 250) feeds the GRUs a different amount of padding than they saw in training.
padded_inp = tf.keras.preprocessing.sequence.pad_sequences(
  enc_inp, padding='post', value=0, maxlen=input_data.shape[1]
)

preds = model.predict(padded_inp)

top_ = np.argsort(preds[0])[::-1][:5]
for idx in top_:
  print(class_labels[idx])

print("\n")
print(actual_op)

CC(=CCCC(=CCCC(=O)C)C)C
fresh
fruity
citrus
floral
rose


fresh,rose,leaf,sweet,floral,aldehydic,fruity


In [42]:
X_test_data = data_test["SMILES"].values

TOP_K = 15      # labels kept per molecule
GROUP_SIZE = 3  # labels per comma-separated group


def format_prediction(scores, top_k=TOP_K, group_size=GROUP_SIZE):
  """Turn one row of label scores into the prediction string.

  Takes the `top_k` highest-scoring labels (best first), joins them with ","
  in consecutive groups of `group_size`, and joins the groups with ";".
  """
  ranked = np.argsort(scores)[::-1][:top_k]
  labels = [class_labels[j] for j in ranked]
  # Slicing tolerates a short final group, unlike fixed-width index loops.
  groups = [
    ",".join(labels[start:start + group_size])
    for start in range(0, len(labels), group_size)
  ]
  return ";".join(groups)


# Encode and pad all test molecules at once, to the width used for training,
# then run one batched predict instead of one predict call per molecule.
enc_inp = prep_data(X_test_data)
padded_inp = tf.keras.preprocessing.sequence.pad_sequences(
  enc_inp, padding='post', value=0, maxlen=input_data.shape[1]
)
preds = model.predict(padded_inp) if len(padded_inp) else []

out = [[smile, format_prediction(row)] for smile, row in zip(X_test_data, preds)]

df = pd.DataFrame(out, columns=["SMILES", "PREDICTIONS"])

In [43]:
# Write the predictions next to the input data, without the index column.
df.to_csv(
  path_or_buf="/content/drive/My Drive/Colab Notebooks/smell-ident/preds.csv",
  index=False,
)