In [53]:
# Importing Libraries
import numpy as np
import pandas as pd
import dtale
import dtale.app as dtale_app
import matplotlib.pyplot as plt
from matplotlib import rcParams
from tqdm.notebook import tqdm
from pathlib import Path
import pickle
from rdkit.Chem import Descriptors
from rdkit import Chem
import seaborn as sns
from aux_functions.name_2lines import name_2lines

In [54]:
# Fixed seed value (not referenced by the cells visible in this section;
# presumably consumed by later, stochastic steps - TODO confirm)
seed = 41

# Load Dataset

In [28]:
# Read the pickled dataset into a DataFrame and print its summary.
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source.
dataset_path = '~/Tese/Dataset.pkl'
df = pd.read_pickle(dataset_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290042 entries, 0 to 290041
Columns: 132 entries, InChIKey to Mass
dtypes: float64(119), object(13)
memory usage: 292.1+ MB


In [13]:
# Open the whole DataFrame in an interactive D-Tale session (host passed as 'localhost')
dtale.show(df, host='localhost')







In [29]:
# Column labels as a plain Python list, printed for reference
columns = df.columns.tolist()
print(columns)

['InChIKey', 'Chemical Formula', 'Kingdom', 'Superclass', 'Class', 'Subclass', 'Level 5', 'Level 6', 'Level 7', 'Level 8', 'Level 9', 'Level 10', 'Level 11', 'C', 'H', 'O', 'P', 'N', 'S', 'Th', 'Cm', 'K', 'Na', 'Rb', 'Li', 'Cs', 'Fr', 'Ca', 'Mg', 'Be', 'Sr', 'Ba', 'Ce', 'La', 'Nd', 'Gd', 'Sm', 'Eu', 'Lu', 'Pr', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Te', 'Si', 'As', 'B', 'Ge', 'Sb', 'Al', 'Bi', 'Cr', 'Co', 'Cu', 'Fe', 'Mo', 'Mn', 'Zr', 'Ti', 'W', 'Ni', 'V', 'Ag', 'Hg', 'Cd', 'Au', 'Ta', 'Y', 'Ru', 'Pd', 'Pt', 'Re', 'Zn', 'Hf', 'Nb', 'Sc', 'Os', 'Ir', 'Cl', 'I', 'F', 'Br', 'He', 'Ar', 'Se', 'Ga', 'Sn', 'Tl', 'Pb', 'Xe', 'Rn', 'Ra', 'Pu', 'Kr', 'U', 'Tc', 'At', 'In', 'Po', 'Ne', 'Ac', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Rh', 'Mt', 'Ds', 'Rg', 'Pm', 'Yb', 'Pa', 'Np', 'Am', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Cn', 'Fl', 'Lv', 'Mc', 'Nh', 'Ts', 'Charge', 'Mass']


In [30]:
## Discard the taxonomy columns deeper than Subclass ('Level 5' to 'Level 11')
extra_levels = ['Level 5', 'Level 6', 'Level 7', 'Level 8', 'Level 9', 'Level 10', 'Level 11']
df = df.drop(labels=extra_levels, axis='columns')

In [32]:
# Move the two identifier columns to position 5, one after the other.
# With the column layout printed earlier this leaves the four taxonomy columns
# (Kingdom, Superclass, Class, Subclass) at positions 0-3.
for id_column in ('InChIKey', 'Chemical Formula'):
    df.insert(5, id_column, df.pop(id_column))

# Prepare Dataset

In [18]:
# Number of rows, then the number of distinct labels at each taxonomy level
taxonomy_levels = ['Kingdom', 'Superclass', 'Class', 'Subclass']
print('Total of compounds -> ', df.shape[0])

print(df[taxonomy_levels].nunique())

Total of compounds ->  290042
Kingdom         2
Superclass     28
Class         486
Subclass      876
dtype: int64


In [None]:
# Overview figure: how many compounds each category holds, one panel per taxonomy level.

# The 'seaborn-*' style sheets were renamed to 'seaborn-v0_8-*' in matplotlib 3.6 and the
# old names were removed afterwards, so use whichever name this installation provides.
whitegrid_style = 'seaborn-v0_8-whitegrid'
if whitegrid_style not in plt.style.available:
    whitegrid_style = 'seaborn-whitegrid'
plt.style.use(whitegrid_style)
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 12

fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(16,10), dpi=500)
fig.suptitle("Categories' size distribution", fontsize=20, y=0.91)

# Top-left panel: bar chart of compounds per Kingdom on a log-scaled y axis
df['Kingdom'].value_counts().plot.bar(ax=ax1)
ax1.set_xlabel('Kingdoms', size= 15)
ax1.set_ylabel('No. compounds (log scale)', size= 15)
ax1.set_yscale('log')
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=0, ha='center', va='top')

# Remaining panels: histogram of category sizes with log-spaced bins.
# Each entry is (axes, column, y label, upper bin edge, number of bin edges).
hist_panels = [
    (ax2, 'Superclass', 'No. Superclasses', 200000, 10),
    (ax3, 'Class', 'No. Classes', 125000, 25),
    (ax4, 'Subclass', 'No. Subclasses', 83000, 25),
]
for panel_ax, level, y_label, upper_edge, n_edges in hist_panels:
    panel_ax.set_xlabel('No. compounds (log scale)', size= 15)
    panel_ax.set_ylabel(y_label, size= 15)
    panel_ax.set_xscale('log')
    panel_ax.hist(df[level].value_counts().values,
                  bins=np.logspace(start=np.log10(1), stop=np.log10(upper_edge), num=n_edges))

# savefig does not create missing directories, so make sure the target folder exists
plot_path = Path('Plots/Categories_size_distribution.png')
plot_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(plot_path)
plt.show()

Replace each missing classification (`np.nan`) with the name of the lowest classified level followed by " - Unspecified", so that every compound has a label at every level of the hierarchy.<br>

In [19]:
# Example: the rows labelled 129654, 21266 and 1144, restricted to the four taxonomy
# columns, shown in D-Tale as they are in `df` (i.e. before any level is filled in)
dtale.show(df.loc[[129654, 21266, 1144]].loc[:,['Kingdom', 'Superclass', 'Class', 'Subclass']])







In [35]:
# Work on a copy so the frame loaded above is left untouched
new_df = df.copy()

lvls = ['Kingdom', 'Superclass', 'Class', 'Subclass']

# Walk down the hierarchy. Wherever a level is missing, that level and every level
# below it receive "<parent label> - Unspecified", where the parent is the level
# immediately above the first missing one. Rows filled at a higher level are no longer
# missing when the lower levels are visited, so each row is labelled from its first gap.
# Columns are addressed by name and rows by boolean mask, so the result does not depend
# on the column order or on the index being a pristine RangeIndex.
for i_lvl in range(1, len(lvls)):
    missing = new_df[lvls[i_lvl]].isna()
    if not missing.any():
        continue
    unspecified_label = new_df.loc[missing, lvls[i_lvl-1]] + ' - Unspecified'
    for lvl in lvls[i_lvl:]:
        new_df.loc[missing, lvl] = unspecified_label

# Sanity check: report any row that still lacks a label (e.g. its parent was missing too)
still_missing = new_df.loc[new_df[lvls].isna().any(axis=1), lvls]
if not still_missing.empty:
    print(still_missing)

  0%|          | 0/17991 [00:00<?, ?it/s]

In [36]:
# The rows labelled 129654, 21266 and 1144 of `new_df`, restricted to the four taxonomy columns
dtale.show(new_df.loc[[129654, 21266, 1144]].loc[:,['Kingdom', 'Superclass', 'Class', 'Subclass']])



In [37]:
# Non-null counts and dtypes of the four taxonomy columns (checks that no gaps remain)
new_df[['Kingdom', 'Superclass', 'Class', 'Subclass']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290042 entries, 0 to 290041
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Kingdom     290042 non-null  object
 1   Superclass  290042 non-null  object
 2   Class       290042 non-null  object
 3   Subclass    290042 non-null  object
dtypes: object(4)
memory usage: 8.9+ MB


Small categories are also relabelled as "Unspecified"; when several small categories share the same parent, they are thereby merged into a single, larger "Unspecified" category.<br>

Relabelling small subclasses as "Unspecified" is not always enough to give every subclass at least 10 compounds, and the ones that remain too small would otherwise have to be discarded. To get the most out of the data, the same merge is therefore repeated one level up: working from the bottom (Subclass) towards the top (Kingdom), categories with fewer than 10 compounds are merged into their parent's "Unspecified" category.

In [38]:
# Subclasses holding fewer than 10 compounds
subclass_sizes = new_df[['Subclass']].value_counts()
print(subclass_sizes[subclass_sizes < 10])

Subclass                                      
Dithianes - Unspecified                           9
Hydroxypyridines                                  9
Transition metal iodides                          9
Pyrimidodiazepines - Unspecified                  9
Rhoeadine alkaloids - Unspecified                 9
                                                 ..
Alkali metal hypochlorites                        1
Organotransition metal compounds - Unspecified    1
Tetrathianes - Unspecified                        1
Iodohydrins                                       1
Gamma sultones                                    1
Length: 589, dtype: int64


In [39]:
counts = new_df['Subclass'].value_counts()
#only select small subclasses that are not already "Unspecified" due to imputation
is_small = counts.values < 10
already_unspecified = counts.index.str.endswith(' - Unspecified')
small_subclasses = list(counts.index[is_small & ~already_unspecified])

#change small subclasses name to (class) + " - Unspecified".
#All small subclasses are relabelled in one pass: the new labels end in " - Unspecified"
#and can therefore never match another entry of small_subclasses, so this is equivalent
#to renaming them one at a time.
in_small_subclass = new_df['Subclass'].isin(small_subclasses)
parent_class = new_df.loc[in_small_subclass, 'Class'].str.replace(' - Unspecified', '', regex=False)
new_df.loc[in_small_subclass, 'Subclass'] = parent_class + ' - Unspecified'

  0%|          | 0/401 [00:00<?, ?it/s]

In [40]:
#To count small classes both levels have to be considered because there are compounds that can have the same class 
#but different subclass, and from this set of small Subclasses, we want the classes that are not yet "Unspecified" 
#due to imputation
# Size of every (Class, Subclass) pair; the small ones are printed for inspection
counts_CSc = new_df[['Class', 'Subclass']].value_counts()
pair_is_small = counts_CSc < 10
print(counts_CSc[pair_is_small])

# Keep the small pairs whose Class is a real class (not an imputed "... - Unspecified" one)
# and collect the Class label of each of them
pair_class = counts_CSc.index.get_level_values(0)
small_counts = counts_CSc[pair_is_small & ~pair_class.str.endswith(' - Unspecified')]
small_classes = small_counts.index.get_level_values(0)

Class                        Subclass                                 
Organobromides               Organobromides - Unspecified                 9
Tetrazines                   Tetrazines - Unspecified                     9
Dithianes                    Dithianes - Unspecified                      9
Brevetoxins and derivatives  Brevetoxins and derivatives - Unspecified    9
Transition metal salts       Transition metal salts - Unspecified         9
                                                                         ..
Isothiochromenes             Isothiochromenes - Unspecified               1
Isoxazolopyridines           Isoxazolopyridines - Unspecified             1
Actinide salts               Actinide salts - Unspecified                 1
Trialkylphosphites           Trialkylphosphites - Unspecified             1
Metal alkyl halides          Metal alkyl halides - Unspecified            1
Length: 239, dtype: int64


In [41]:
#Filter is now made on the Class level. classes on the Class level with less than 10 compounds get the 
#superclass unspecified designation
for small_class in tqdm(small_classes):
    new_df.loc[(new_df['Class']==small_class) & (new_df['Subclass']==new_df['Class'] + ' - Unspecified' ) , 'Class'] = new_df['Superclass'].str.replace(' - Unspecified', '') + ' - Unspecified'

#To propagate the changes into the Subclass level
new_df.loc[new_df['Class'].str[-14:]==' - Unspecified', 'Subclass'] = new_df.loc[new_df['Class'].str[-14:]==' - Unspecified', 'Class']

  0%|          | 0/234 [00:00<?, ?it/s]

In [42]:
# Size of every (Superclass, Class) pair; the small ones are printed for inspection
counts_ScC = new_df[['Superclass', 'Class']].value_counts()
pair_is_small = counts_ScC < 10
print(counts_ScC[pair_is_small])

# Keep the small pairs whose Superclass is not already an "... - Unspecified" label
# and collect the Superclass label of each of them
pair_superclass = counts_ScC.index.get_level_values(0)
small_counts = counts_ScC[pair_is_small & ~pair_superclass.str.endswith(' - Unspecified')]
small_superclasses = small_counts.index.get_level_values(0)

Superclass                       Class                                      
Organic salts                    Organic salts - Unspecified                    9
Organic 1,3-dipolar compounds    Organic 1,3-dipolar compounds - Unspecified    9
Hydrocarbons                     Hydrocarbons - Unspecified                     4
Inorganic salts                  Inorganic salts - Unspecified                  3
Allenes                          Allenes - Unspecified                          3
Organic compounds - Unspecified  Organic compounds - Unspecified                2
Organic cations                  Organic cations - Unspecified                  1
Organic anions                   Organic anions - Unspecified                   1
dtype: int64


In [43]:
# Inspect the rows that belong to a small superclass and whose Class is an
# "... - Unspecified" label (first four columns only)
in_small_superclass = new_df['Superclass'].isin(small_superclasses)
class_unspecified = new_df['Class'].str[-14:] == ' - Unspecified'
dtale.show(new_df[in_small_superclass & class_unspecified].iloc[:, :4])



In [44]:
#Rows of a small superclass whose Class is the superclass's own "<Superclass> - Unspecified"
#placeholder get the "<Kingdom> - Unspecified" designation. One mask covers all small
#superclasses: the new labels end in " - Unspecified" and so never match an entry of
#small_superclasses again.
superclass_is_small = new_df['Superclass'].isin(small_superclasses)
class_is_placeholder = new_df['Class'] == new_df['Superclass'] + ' - Unspecified'
to_merge = superclass_is_small & class_is_placeholder
parent_kingdom = new_df.loc[to_merge, 'Kingdom'].str.replace(' - Unspecified', '', regex=False)
new_df.loc[to_merge, 'Superclass'] = parent_kingdom + ' - Unspecified'

#To propagate the changes into the Class and Subclass levels
superclass_unspecified = new_df['Superclass'].str[-14:]==' - Unspecified'
for lower_level in ['Class', 'Subclass']:
    new_df.loc[superclass_unspecified, lower_level] = new_df.loc[superclass_unspecified, 'Superclass']

  0%|          | 0/7 [00:00<?, ?it/s]

In [45]:
# Full (Kingdom, Superclass, Class, Subclass) combinations that still have fewer than 10 compounds
lineage_sizes = new_df[['Kingdom', 'Superclass', 'Class', 'Subclass']].value_counts()
lineage_sizes[lineage_sizes < 10]

Kingdom              Superclass                         Class                              Subclass                         
Inorganic compounds  Inorganic compounds - Unspecified  Inorganic compounds - Unspecified  Inorganic compounds - Unspecified    3
dtype: int64

Three inorganic compounds still have no Superclass (they sit in "Inorganic compounds - Unspecified"), so they are removed.

In [46]:
# Keep every row except those labelled 'Inorganic compounds - Unspecified' at the
# Superclass level, then renumber the index from 0
has_superclass = new_df['Superclass'].ne('Inorganic compounds - Unspecified')
new_df = new_df.loc[has_superclass].reset_index(drop=True)

In [47]:
# Number of compounds in every remaining (Kingdom, Superclass, Class, Subclass) combination
new_df[['Kingdom', 'Superclass', 'Class', 'Subclass']].value_counts()

Kingdom            Superclass                                 Class                             Subclass                                 
Organic compounds  Lipids and lipid-like molecules            Glycerophospholipids              Glycerophosphoglycerophosphoglycerols        82829
                                                              Glycerolipids                     Triradylcglycerols                           31288
                   Organic acids and derivatives              Carboxylic acids and derivatives  Amino acids, peptides, and analogues         14068
                   Lipids and lipid-like molecules            Fatty Acyls                       Eicosanoids                                   9318
                   Organic oxygen compounds                   Organooxygen compounds            Carbohydrates and carbohydrate conjugates     7877
                                                                                                                               

In [48]:
# Number of distinct labels left at each taxonomy level after the merges
new_df[['Kingdom', 'Superclass', 'Class', 'Subclass']].nunique()

Kingdom         2
Superclass     26
Class         311
Subclass      724
dtype: int64

### Feature Engineering (Create new features from the data)

In [50]:
# Element groups behind the aggregated count features. The key is the name of the
# resulting column; the group order is kept so that flattening the groups yields the
# same `metals` list (same elements, same order) as before.
metal_groups = {
    'Alkaline Metals': ['Li', 'Na', 'K', 'Rb', 'Cs', 'Fr'],
    'Alkaline Earth Metals': ['Be', 'Mg', 'Ca', 'Sr', 'Ba', 'Ra'],
    'Transition Metals': ['Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
                          'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd',
                          'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
                          'Rf', 'Db', 'Sg', 'Bh', 'Hs'],
    'Post Transition Metals': ['Al', 'Ga', 'In', 'Sn', 'Tl', 'Pb', 'Bi', 'Po'],
    'Lanthanoids': ['La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er',
                    'Tm', 'Yb', 'Lu'],
    'Actinoids': ['Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm',
                  'Md', 'No', 'Lr'],
}
metals = [element for group in metal_groups.values() for element in group]

synthetic = ['Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts']

# Build every new column from `new_df` first and attach them in one step with `assign`,
# which returns a new frame: `new_df` itself is no longer modified by this cell.
# Insertion order of `features` fixes the order of the new columns.
features = {
    'Halogens': new_df[['F', 'Cl', 'Br', 'I']].sum(axis=1),
    'All Metals': new_df[metals].sum(axis=1),
}
for group_name, elements in metal_groups.items():
    features[group_name] = new_df[elements].sum(axis=1)
features['Synthetics'] = new_df[synthetic].sum(axis=1)

# Element ratios; a zero denominator yields inf or NaN, which is handled below
for numerator, denominator in [('O', 'C'), ('H', 'C'), ('N', 'C'), ('P', 'C'), ('N', 'P')]:
    features[numerator + '/' + denominator] = new_df[numerator] / new_df[denominator]

df = new_df.assign(**features)

#Data imputation -> 0 when the chemical element or one of them (ratios) are not in the formula
#Reset Index
df = df.replace([np.inf, -np.inf, np.nan], 0).reset_index(drop=True)

#Change back columns position: identifiers first
df.insert(0, 'InChIKey', df.pop('InChIKey'))
df.insert(1, 'Chemical Formula', df.pop('Chemical Formula'))
df.to_pickle('~/Tese/Dataset_Preprocessed.pkl')