# Learning to Smell with Polyssifier

Problem introduction: https://www.aicrowd.com/challenges/learning-to-smell

Author: Huaping Ding

This notebook shows all the experiments used to produce the best models with classic ML classifiers via Polyssifier.
The steps involved are:
* data loading
* feature engineering: use thousands of molecular descriptors as features
* data splitting
* model building and optimization
* prediction on test set

Requirements:
* Install conda
* Install rdkit, mordred: `conda install -c rdkit -c mordred-descriptor mordred`
* Install polyssifier: `pip install polyssifier`


In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from mordred import Calculator, descriptors
from polyssifier import poly
import numpy as np



In [2]:
# Load the training set; SENTENCE holds the comma-separated smell labels of
# each molecule, so also keep them as a Python list in SENTENCE_ARRAY.
raw_df = pd.read_csv('./dataset/train.csv')
raw_df['SENTENCE_ARRAY'] = raw_df['SENTENCE'].map(lambda sentence: sentence.split(','))
raw_df.head(9)

Unnamed: 0,SMILES,SENTENCE,SENTENCE_ARRAY
0,C/C=C/C(=O)C1CCC(C=C1C)(C)C,"fruity,rose","[fruity, rose]"
1,COC(=O)OC,"fresh,ethereal,fruity","[fresh, ethereal, fruity]"
2,Cc1cc2c([nH]1)cccc2,"resinous,animalic","[resinous, animalic]"
3,C1CCCCCCCC(=O)CCCCCCC1,"powdery,musk,animalic","[powdery, musk, animalic]"
4,CC(CC(=O)OC1CC2C(C1(C)CC2)(C)C)C,"coniferous,camphor,fruity","[coniferous, camphor, fruity]"
5,CCC[C@H](CCO)SC,tropicalfruit,[tropicalfruit]
6,CC(C)C(=O)OCc1ccco1,"fruity,coffee","[fruity, coffee]"
7,OC[C@H]1[C@H]2CC[C@H]3[C@@]1(C)CCCC([C@@H]23)(C)C,woody,[woody]
8,CCc1ccc(cc1)CC(C=O)(C)C,"clean,fresh","[clean, fresh]"


#### Create one-hot encoding for the labels

In [3]:
# The vocabulary file has no header row: each line is one smell label,
# stored here in a single column named SMELL.
vocabulary = pd.read_csv(
    './dataset/vocabulary.txt',
    names=['SMELL'],
    header=None,
)
vocabulary.head(3)

Unnamed: 0,SMELL
0,alcoholic
1,aldehydic
2,alliaceous


In [4]:
# Add one binary indicator column per vocabulary entry: class_<i> is 1 when the
# i-th smell of the vocabulary occurs in the row's SENTENCE_ARRAY, else 0.
for idx, smell in vocabulary['SMELL'].items():
    raw_df["class_{}".format(idx)] = raw_df['SENTENCE_ARRAY'].map(
        lambda labels: int(smell in labels)
    )

raw_df.head()

Unnamed: 0,SMILES,SENTENCE,SENTENCE_ARRAY,class_0,class_1,class_2,class_3,class_4,class_5,class_6,...,class_99,class_100,class_101,class_102,class_103,class_104,class_105,class_106,class_107,class_108
0,C/C=C/C(=O)C1CCC(C=C1C)(C)C,"fruity,rose","[fruity, rose]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,COC(=O)OC,"fresh,ethereal,fruity","[fresh, ethereal, fruity]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Cc1cc2c([nH]1)cccc2,"resinous,animalic","[resinous, animalic]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,C1CCCCCCCC(=O)CCCCCCC1,"powdery,musk,animalic","[powdery, musk, animalic]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC(CC(=O)OC1CC2C(C1(C)CC2)(C)C)C,"coniferous,camphor,fruity","[coniferous, camphor, fruity]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Feature engineering with molecular descriptors

In [5]:
# Quick look at the SMILES strings that the descriptors are computed from.
raw_df.loc[:, 'SMILES']

0                     C/C=C/C(=O)C1CCC(C=C1C)(C)C
1                                       COC(=O)OC
2                             Cc1cc2c([nH]1)cccc2
3                          C1CCCCCCCC(=O)CCCCCCC1
4                CC(CC(=O)OC1CC2C(C1(C)CC2)(C)C)C
                          ...                    
4311    CCOC(=O)[C@]12CCC[C@@H]2[C@H]2C[C@@H]1CC2
4312                       CC1(O)C2(C)CCCC1(C)CC2
4313                       C1CC(=N[C@@H]1C(=O)O)O
4314                            CCCc1c(C)ncc(C)n1
4315                          COC1C=CC2C1C1CCC2C1
Name: SMILES, Length: 4316, dtype: object

In [6]:
# Build a mordred calculator over the full descriptor set.
# ignore_3D=True presumably leaves out descriptors that need 3D coordinates
# (molecules here are parsed from SMILES only) -- confirm against mordred docs.
calc = Calculator(descriptors, ignore_3D=True)
all_descriptors = calc.descriptors
print("number of molecular descriptors: ", len(all_descriptors))
print("Top 10 descriptors: ", all_descriptors[:10])

number of molecular descriptors:  1613
Top 10 descriptors:  (mordred.ABCIndex.ABCIndex(), mordred.ABCIndex.ABCGGIndex(), mordred.AcidBase.AcidicGroupCount(), mordred.AcidBase.BasicGroupCount(), mordred.AdjacencyMatrix.AdjacencyMatrix('SpAbs'), mordred.AdjacencyMatrix.AdjacencyMatrix('SpMax'), mordred.AdjacencyMatrix.AdjacencyMatrix('SpDiam'), mordred.AdjacencyMatrix.AdjacencyMatrix('SpAD'), mordred.AdjacencyMatrix.AdjacencyMatrix('SpMAD'), mordred.AdjacencyMatrix.AdjacencyMatrix('LogEE'))


In [7]:
# Parse the first training molecule as a single-molecule example.
mol = Chem.MolFromSmiles(raw_df['SMILES'].iloc[0])
# Disabled example of running the calculator on just this molecule:
# res = calc(mol)

In [8]:
# calculate multiple molecules
raw_mols = [Chem.MolFromSmiles(smi) for smi in raw_df['SMILES']]

mols_desc_df = calc.pandas(raw_mols)
mols_desc_df.head()

100%|██████████| 4316/4316 [05:32<00:00, 12.15it/s]


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,10.355232,9.695963,0,0,15.9217,2.37344,4.74689,15.9217,1.13727,3.53381,...,9.474165,45.559427,192.151415,5.651512,305,19,68.0,76.0,6.64583,3.138889
1,3.644924,4.097495,0,0,6.89898,1.93185,3.8637,6.89898,1.14983,2.5958,...,7.280008,29.753427,90.031694,7.502641,31,4,20.0,19.0,3.61111,1.666667
2,7.847124,7.174621,0,0,13.043,2.36361,4.55714,13.043,1.3043,3.26103,...,9.085117,53.938362,131.073499,6.898605,108,11,52.0,60.0,2.83333,2.194444
3,12.130205,8.271213,0,0,21.3099,2.06235,4.12469,21.3099,1.25352,3.68802,...,8.551595,46.913417,238.229666,5.068716,592,18,70.0,71.0,4.86111,4.166667
4,13.314894,12.642147,0,0,20.1051,2.63153,4.9989,20.1051,1.18265,3.77866,...,10.15995,67.54691,238.19328,5.539379,505,27,94.0,115.0,7.81944,3.520833


In [None]:
# Put the label columns and the descriptor columns side by side, one row per molecule.
full_info_df = pd.concat(
    (raw_df, mols_desc_df),
    axis='columns',
)
full_info_df.head()

In [None]:
# Persist labels + descriptors so the expensive descriptor computation
# does not have to be repeated; the row index is not written.
output_file = './dataset/train-data-with-class-and-descriptors-v2.csv'
full_info_df.to_csv(
    path_or_buf=output_file,
    index=False,
)

#### Clean dataframe to start data splitting and model training

In [None]:
# List every column of the combined frame: the raw_df columns (SMILES,
# SENTENCE, SENTENCE_ARRAY, class_*) followed by the descriptor columns.
full_info_df.columns

In [None]:
# The descriptor columns were appended last, so the final
# len(calc.descriptors) columns of the combined frame are the features.
n_features = len(calc.descriptors)
feature_names = full_info_df.columns[-n_features:]
raw_data_df = full_info_df.loc[:, feature_names]
raw_data_df.head()

In [None]:
# Columns 0-2 are SMILES, SENTENCE and SENTENCE_ARRAY; the one-hot class_*
# columns (one per vocabulary row) start right after them.
first_label_col = 3
last_label_col = first_label_col + vocabulary.shape[0]
label_names = full_info_df.columns[first_label_col:last_label_col]
raw_label_df = full_info_df.loc[:, label_names]
raw_label_df.head()

### Training model with Polyssifier 

In [None]:
# Sanity check: which distinct values occur in the class_1 indicator column.
class_1_values = raw_label_df['class_1'].values
np.unique(class_1_values)

In [None]:
# Feature matrix (all descriptor columns) and a single binary target: class_10.
# NOTE(review): some descriptor columns are displayed with a different
# precision than others, which may indicate object dtype -- confirm that the
# matrix is fully numeric before it is scaled.
data = raw_data_df.values
label = raw_label_df['class_10'].values

print(data.shape)
print(label.shape)

# Run the polyssifier classifiers with 3-fold cross-validation, scored by AUC,
# using 5 concurrent workers; nothing is saved to disk.
report = poly(
    data,
    label,
    n_folds=3,
    scale=True,
    feature_selection=False,
    save=False,
    scoring='auc',
    concurrency=5,
    verbose=True,
)
report.plot_scores()

In [None]:
# Same polyssifier run on the same data/label, but with concurrency=1.
# Note that this overwrites the `report` produced by the previous run.
report = poly(
    data,
    label,
    n_folds=3,
    scale=True,
    feature_selection=False,
    save=False,
    scoring='auc',
    concurrency=1,
    verbose=True,
)