In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
from aux_functions.name_2lines import name_2lines

In [None]:
# Random seed, passed as `random_state` to every RandomForestClassifier in this notebook
seed = 41

# Load data

In [None]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Feature names and the training matrix / label array
feature_names = _load_pickle('Dataset/feature_names.pkl')
X_train = _load_pickle('Dataset/X_train.pkl')
y_train = _load_pickle('Dataset/y_train.pkl')
# Nodes that the Class / Subclass sections skip
# (presumably superclasses / classes with a single child - TODO confirm)
sc_u = _load_pickle('Dataset/SC_wunique_child.pkl')
c_u = _load_pickle('Dataset/C_wunique_child.pkl')
# Previously stored feature-selection results; the file must already exist
feat_dic = _load_pickle('Results/selected_features.pkl')

# Feature Selection

In [None]:
## WARNING: uncommenting and running the line below resets the feature selection file to an empty dict
#pickle.dump({}, open('Results/selected_features.pkl', 'wb'))

## Kingdom

In [None]:
# Kingdom level: a single model fitted on the whole training set, target = label column 0
X_train_ = X_train
y_train_ = y_train[:, 0]

# Keep the features whose random-forest importance (MDI) reaches the 1e-5 threshold
sel = SelectFromModel(RandomForestClassifier(random_state=seed), threshold=1e-5)
sel.fit(X_train_, y_train_)
importances = sel.estimator_.feature_importances_
# Spread of each feature's importance across the individual trees of the forest
std = np.std([tree.feature_importances_ for tree in sel.estimator_.estimators_], axis=0)
mdi_f = list(sel.get_feature_names_out(feature_names))
print('Important features:', mdi_f)

# Correlation filter: a feature is dropped when its absolute Spearman correlation
# with any feature listed before it exceeds 0.95 (only the upper triangle is inspected)
f_index = [feature_names.index(feature) for feature in mdi_f]
cor_matrix = pd.DataFrame(X_train[:, f_index], columns=mdi_f).corr(method='spearman').abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
sel_f = [f for f in mdi_f if f not in to_drop]
print('To drop:', to_drop)
print('Selected features:', sel_f)

# Merge the Kingdom entry into the results file; `with` closes the handles
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)
feat_dic['Kingdom'] = {'Chemical entities': {'MDI': {'importances': importances, 'std': std, 'selected features': mdi_f},
                                             'correlation (to drop)': to_drop, 'selected features': sel_f}}
with open('Results/selected_features.pkl', 'wb') as fh:
    pickle.dump(feat_dic, fh)

In [None]:
importances = feat_dic['Kingdom']['Chemical entities']['MDI']['importances']
std = feat_dic['Kingdom']['Chemical entities']['MDI']['std']
    
%matplotlib inline
plt.style.use('seaborn')
forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots(dpi=300)
forest_importances.plot.barh(xerr=std, ax=ax)
plt.gca().invert_yaxis()
ax.set_title("Feature importance using MDI",size=25)
ax.set_xlabel("Mean decrease in impurity", size=20)
fig.tight_layout()
fig.set_size_inches(12, 25)
plt.savefig(f'Plots/FeatureImportance/Kingdom.png')
plt.show()

## Superclass

In [None]:
# Distinct values of label column 0, printed with their position in the sorted array
kings = np.unique(y_train[:, 0])
for position, kingdom in enumerate(kings):
    print(position, kingdom)

In [None]:
# Start the 'Superclass' section of the results file from an empty dict
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)
feat_dic['Superclass'] = {}
with open('Results/selected_features.pkl', 'wb') as fh:
    pickle.dump(feat_dic, fh)

# One model per kingdom: rows of that kingdom, target = label column 1
for i, king in enumerate(kings):
    print(i, king)
    king_mask = y_train[:, 0] == king  # computed once, reused for X and y
    X_train_ = X_train[king_mask, :]
    y_train_ = y_train[king_mask, 1]

    # Keep the features whose random-forest importance (MDI) reaches the 1e-5 threshold
    sel = SelectFromModel(RandomForestClassifier(random_state=seed), threshold=1e-5)
    sel.fit(X_train_, y_train_)
    importances = sel.estimator_.feature_importances_
    std = np.std([tree.feature_importances_ for tree in sel.estimator_.estimators_], axis=0)
    mdi_f = list(sel.get_feature_names_out(feature_names))
    print('Important features:', mdi_f)

    # Correlation filter: drop a feature when its absolute Spearman correlation with
    # any feature listed before it exceeds 0.95.
    # NOTE(review): correlations use the full X_train, not the subset X_train_ - confirm this is intended.
    f_index = [feature_names.index(feature) for feature in mdi_f]
    cor_matrix = pd.DataFrame(X_train[:, f_index], columns=mdi_f).corr(method='spearman').abs()
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
    sel_f = [f for f in mdi_f if f not in to_drop]
    print('To drop:', to_drop)
    print('Selected features:', sel_f)

    # Save after every kingdom so an interrupted run keeps the finished nodes
    feat_dic['Superclass'][king] = {'MDI': {'importances': importances, 'std': std, 'selected features': mdi_f},
                                    'correlation (to drop)': to_drop, 'selected features': sel_f}
    with open('Results/selected_features.pkl', 'wb') as fh:
        pickle.dump(feat_dic, fh)

In [None]:
%matplotlib inline
# The results file does not change while plotting, so it is read once
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)

# Matplotlib >= 3.6 ships the seaborn styles as 'seaborn-v0_8*'; fall back to the old name otherwise
style_name = 'seaborn-v0_8-notebook' if 'seaborn-v0_8-notebook' in plt.style.available else 'seaborn-notebook'
plt.style.use(style_name)

# One horizontal bar chart of MDI importances per kingdom-level model
for i, king in enumerate(kings):
    importances = feat_dic['Superclass'][king]['MDI']['importances']
    std = feat_dic['Superclass'][king]['MDI']['std']
    forest_importances = pd.Series(importances, index=feature_names)
    # The figure gets its final size up front so tight_layout() computes margins for it
    fig, ax = plt.subplots(figsize=(12, 25), dpi=300)
    forest_importances.plot.barh(xerr=std, ax=ax)
    ax.invert_yaxis()  # first feature at the top
    ax.set_title(f"Feature importance using MDI\n({king})", size=25)
    ax.set_xlabel("Mean decrease in impurity", size=20)
    fig.tight_layout()
    fig.savefig(f'Plots/FeatureImportance/Superclass_{i}.png')
    plt.show()
    plt.close(fig)  # release the large figure before the next iteration

### Binary classifiers

In [None]:
# Map every value of label column 0 to the distinct column-1 labels found among its rows
sclasses_bin = {}
for king in np.unique(y_train[:, 0]):
    rows_of_king = y_train[y_train[:, 0] == king]
    sclasses_bin[king] = np.unique(rows_of_king[:, 1])

for king, sclasses in sclasses_bin.items():
    print(king, sclasses)

In [None]:
# Start the 'Superclass_binary' section of the results file from an empty dict
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)
feat_dic['Superclass_binary'] = {}
with open('Results/selected_features.pkl', 'wb') as fh:
    pickle.dump(feat_dic, fh)

# One-vs-rest model per superclass, restricted to the rows of its kingdom
for king, sclasses in sclasses_bin.items():
    print(king)
    # The kingdom subset is the same for all of its superclasses: build it once
    king_mask = y_train[:, 0] == king
    X_king = X_train[king_mask, :]
    y_king = y_train[king_mask, 1]
    for i, sclass in enumerate(sclasses):
        print(i, sclass)
        is_sclass = y_king == sclass
        not_sclass = y_king != sclass
        # Positives first, then negatives; the labels below follow the same row order
        X_train_ = np.concatenate([X_king[is_sclass, :], X_king[not_sclass, :]], axis=0)
        y_train_ = np.array([1] * int(is_sclass.sum()) + [0] * int(not_sclass.sum()))

        # Keep the features whose random-forest importance (MDI) reaches the 1e-5 threshold
        sel = SelectFromModel(RandomForestClassifier(random_state=seed), threshold=1e-5)
        sel.fit(X_train_, y_train_)
        importances = sel.estimator_.feature_importances_
        std = np.std([tree.feature_importances_ for tree in sel.estimator_.estimators_], axis=0)
        mdi_f = list(sel.get_feature_names_out(feature_names))
        print('Important features:', mdi_f)

        # Correlation filter: drop a feature when its absolute Spearman correlation with
        # any feature listed before it exceeds 0.95.
        # NOTE(review): correlations use the full X_train, not the subset X_train_ - confirm this is intended.
        f_index = [feature_names.index(feature) for feature in mdi_f]
        cor_matrix = pd.DataFrame(X_train[:, f_index], columns=mdi_f).corr(method='spearman').abs()
        upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
        to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
        sel_f = [f for f in mdi_f if f not in to_drop]
        print('To drop:', to_drop)
        print('Selected features:', sel_f)

        # Save after every node so an interrupted run keeps the finished ones
        feat_dic['Superclass_binary'][sclass] = {'MDI': {'importances': importances, 'std': std, 'selected features': mdi_f},
                                                 'correlation (to drop)': to_drop, 'selected features': sel_f}
        with open('Results/selected_features.pkl', 'wb') as fh:
            pickle.dump(feat_dic, fh)

In [None]:
%matplotlib inline
# Read the results once and keep `feat_dic` bound to the complete results dict
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)

# Matplotlib >= 3.6 ships the seaborn styles as 'seaborn-v0_8*'; fall back to the old name otherwise
style_name = 'seaborn-v0_8-notebook' if 'seaborn-v0_8-notebook' in plt.style.available else 'seaborn-notebook'
plt.style.use(style_name)

# One horizontal bar chart of MDI importances per binary (one-vs-rest) superclass model
for king, sclasses in sclasses_bin.items():
    print(king)
    for i, sclass in enumerate(sclasses):
        print(i, sclass)
        mdi_results = feat_dic['Superclass_binary'][sclass]['MDI']
        importances = mdi_results['importances']
        std = mdi_results['std']
        forest_importances = pd.Series(importances, index=feature_names)
        # The figure gets its final size up front so tight_layout() computes margins for it
        fig, ax = plt.subplots(figsize=(12, 25), dpi=300)
        forest_importances.plot.barh(xerr=std, ax=ax)
        ax.invert_yaxis()  # first feature at the top
        ax.set_title(f"Feature importance using MDI\n(Binary Superclass - {sclass})", size=25)
        ax.set_xlabel("Mean decrease in impurity", size=20)
        fig.tight_layout()
        fig.savefig(f'Plots/FeatureImportance/Kingdom={king}_Binary_Superclass={i}.png')
        plt.show()
        plt.close(fig)  # release the large figure before the next iteration

## Class

In [None]:
# Distinct values of label column 1; those contained in sc_u are not printed
sclasses = np.unique(y_train[:, 1])
for i, sclass in enumerate(sclasses):
    if sclass not in sc_u:
        print(i, sclass)

In [None]:
# Start the 'Class' section of the results file from an empty dict
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)
feat_dic['Class'] = {}
with open('Results/selected_features.pkl', 'wb') as fh:
    pickle.dump(feat_dic, fh)

# One model per superclass not listed in sc_u: rows of that superclass, target = label column 2
for i, sclass in enumerate(sclasses):
    if sclass in sc_u:
        continue
    print(i, sclass)

    sclass_mask = y_train[:, 1] == sclass  # computed once, reused for X and y
    X_train_ = X_train[sclass_mask, :]
    y_train_ = y_train[sclass_mask, 2]

    # Keep the features whose random-forest importance (MDI) reaches the 1e-5 threshold
    sel = SelectFromModel(RandomForestClassifier(random_state=seed), threshold=1e-5)
    sel.fit(X_train_, y_train_)
    importances = sel.estimator_.feature_importances_
    std = np.std([tree.feature_importances_ for tree in sel.estimator_.estimators_], axis=0)
    mdi_f = list(sel.get_feature_names_out(feature_names))
    print('Important features:', mdi_f)

    # Correlation filter: drop a feature when its absolute Spearman correlation with
    # any feature listed before it exceeds 0.95.
    # NOTE(review): correlations use the full X_train, not the subset X_train_ - confirm this is intended.
    f_index = [feature_names.index(feature) for feature in mdi_f]
    cor_matrix = pd.DataFrame(X_train[:, f_index], columns=mdi_f).corr(method='spearman').abs()
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
    sel_f = [f for f in mdi_f if f not in to_drop]
    print('To drop:', to_drop)
    print('Selected features:', sel_f, '\n')

    # Save after every node so an interrupted run keeps the finished ones
    feat_dic['Class'][sclass] = {'MDI': {'importances': importances, 'std': std, 'selected features': mdi_f},
                                 'correlation (to drop)': to_drop, 'selected features': sel_f}
    with open('Results/selected_features.pkl', 'wb') as fh:
        pickle.dump(feat_dic, fh)

In [None]:
%matplotlib inline
# The results file does not change while plotting, so it is read once
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)

# Matplotlib >= 3.6 ships the seaborn styles as 'seaborn-v0_8*'; fall back to the old name otherwise
style_name = 'seaborn-v0_8-notebook' if 'seaborn-v0_8-notebook' in plt.style.available else 'seaborn-notebook'
plt.style.use(style_name)

# One horizontal bar chart of MDI importances per superclass-level model (nodes in sc_u are skipped)
for i, sclass in enumerate(sclasses):
    if sclass in sc_u:
        continue
    importances = feat_dic['Class'][sclass]['MDI']['importances']
    std = feat_dic['Class'][sclass]['MDI']['std']
    forest_importances = pd.Series(importances, index=feature_names)
    # The figure gets its final size up front so tight_layout() computes margins for it
    fig, ax = plt.subplots(figsize=(12, 25), dpi=300)
    forest_importances.plot.barh(xerr=std, ax=ax)
    ax.invert_yaxis()  # first feature at the top
    ax.set_title(f"Feature importance using MDI\n({sclass})", size=25)
    ax.set_xlabel("Mean decrease in impurity", size=20)
    fig.tight_layout()
    fig.savefig(f'Plots/FeatureImportance/Class_{i}.png')
    plt.show()
    plt.close(fig)  # release the large figure before the next iteration

## Subclass

In [None]:
# Distinct values of label column 2; those contained in c_u are not printed
classes = np.unique(y_train[:, 2])
for i, class_ in enumerate(classes):
    if class_ not in c_u:
        print(i, class_)

In [None]:
# Start the 'Subclass' section of the results file from an empty dict
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)
feat_dic['Subclass'] = {}
with open('Results/selected_features.pkl', 'wb') as fh:
    pickle.dump(feat_dic, fh)

# One model per class not listed in c_u: rows of that class, target = label column 3
for i, class_ in enumerate(classes):
    if class_ in c_u:
        continue
    print(i, class_)

    class_mask = y_train[:, 2] == class_  # computed once, reused for X and y
    X_train_ = X_train[class_mask, :]
    y_train_ = y_train[class_mask, 3]

    # Keep the features whose random-forest importance (MDI) reaches the 1e-5 threshold
    sel = SelectFromModel(RandomForestClassifier(random_state=seed), threshold=1e-5)
    sel.fit(X_train_, y_train_)
    importances = sel.estimator_.feature_importances_
    std = np.std([tree.feature_importances_ for tree in sel.estimator_.estimators_], axis=0)
    mdi_f = list(sel.get_feature_names_out(feature_names))
    print('Important features:', mdi_f)

    # Correlation filter: drop a feature when its absolute Spearman correlation with
    # any feature listed before it exceeds 0.95.
    # NOTE(review): correlations use the full X_train, not the subset X_train_ - confirm this is intended.
    f_index = [feature_names.index(feature) for feature in mdi_f]
    cor_matrix = pd.DataFrame(X_train[:, f_index], columns=mdi_f).corr(method='spearman').abs()
    upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]
    sel_f = [f for f in mdi_f if f not in to_drop]
    print('To drop:', to_drop)
    print('Selected features:', sel_f, '\n')

    # Save after every node so an interrupted run keeps the finished ones
    feat_dic['Subclass'][class_] = {'MDI': {'importances': importances, 'std': std, 'selected features': mdi_f},
                                    'correlation (to drop)': to_drop, 'selected features': sel_f}
    with open('Results/selected_features.pkl', 'wb') as fh:
        pickle.dump(feat_dic, fh)

In [None]:
%matplotlib inline
# The results file does not change while plotting, so it is read once
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)

# Matplotlib >= 3.6 ships the seaborn styles as 'seaborn-v0_8*'; fall back to the old name otherwise
style_name = 'seaborn-v0_8-notebook' if 'seaborn-v0_8-notebook' in plt.style.available else 'seaborn-notebook'
plt.style.use(style_name)

# One horizontal bar chart of MDI importances per class-level model (nodes in c_u are skipped)
for i, class_ in enumerate(classes):
    if class_ in c_u:
        continue
    importances = feat_dic['Subclass'][class_]['MDI']['importances']
    std = feat_dic['Subclass'][class_]['MDI']['std']
    forest_importances = pd.Series(importances, index=feature_names)
    # The figure gets its final size up front so tight_layout() computes margins for it
    fig, ax = plt.subplots(figsize=(12, 25), dpi=300)
    forest_importances.plot.barh(xerr=std, ax=ax)
    ax.invert_yaxis()  # first feature at the top
    ax.set_title(f"Feature importance using MDI\n({class_})", size=25)
    ax.set_xlabel("Mean decrease in impurity", size=20)
    fig.tight_layout()
    fig.savefig(f'Plots/FeatureImportance/Subclass_{i}.png')
    plt.show()
    plt.close(fig)  # release the large figure before the next iteration

## Importance heatmap

In [None]:
# Gather the stored MDI importances into one row per fitted parent node
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)
importances = []  # rows of the heatmap matrix (one list of importances per node)
y = []            # row labels, formatted as '(<level>) <parent node>'
for classif_level, classif_name in enumerate(['Kingdom', 'Superclass', 'Class', 'Subclass']):
    if classif_level == 0:
        importances.append(list(feat_dic[classif_name]['Chemical entities']['MDI']['importances']))
        y.append('(Kingdom) Chemical entities')
        continue
    # Models of a level are keyed by the labels of the previous label column (their parent nodes)
    for i in np.unique(y_train[:, classif_level - 1]):
        # Nodes listed in sc_u or c_u have no stored model and are skipped
        if i in sc_u or i in c_u:
            continue
        importances.append(list(feat_dic[classif_name][i]['MDI']['importances']))
        y.append(f'({classif_name}) {i}')

In [None]:
# Heatmap of every node's MDI importances over all features
fig, ax = plt.subplots(figsize=(16, 30), dpi=800)
# Draw explicitly on `ax` instead of relying on the implicit current axes
sns.heatmap(importances, cmap='Blues', xticklabels=feature_names, yticklabels=y,
            cbar_kws={"shrink": .40}, ax=ax)
ax.set_title("Mean Decrease in Gini Feature Importance", fontsize=20)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('(ChemOnt level) Parent node', fontsize=15)
ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize=6)
fig.savefig('Plots/MDI Feature Importances', bbox_inches='tight', transparent=True)
plt.show()

In [None]:
# Keep only the feature columns for which at least one node reports an importance >= 0.1
importances_w_threshold = [[] for _ in range(len(y))]
importances_w_threshold_features = []
for j, feature in enumerate(feature_names):
    reaches_threshold = any(importances[row][j] >= 0.1 for row in range(len(y)))  # threshold
    if not reaches_threshold:
        continue
    importances_w_threshold_features.append(feature)
    # Copy the whole column (every node's importance for this feature)
    for row in range(len(y)):
        importances_w_threshold[row].append(importances[row][j])

In [None]:
# Heatmap restricted to the features kept by the importance threshold.
# The rcParams change is global: it also applies to figures created after this cell.
rcParams['xtick.labelsize'] = 16
fig, ax = plt.subplots(figsize=(17, 30), dpi=800)
# Draw explicitly on `ax` instead of relying on the implicit current axes
sns.heatmap(importances_w_threshold, cmap='Blues',
            yticklabels=y, cbar_kws={'shrink': .4, 'pad': 0.02}, ax=ax)
ax.set_title("Mean Decrease in Gini Feature Importance", fontsize=20)
ax.set_xlabel('Features', fontsize=15)
# name_2lines is a project helper; from its name it presumably wraps long names over two lines - TODO confirm
ax.set_xticklabels(name_2lines(importances_w_threshold_features, 10), ha='center', va='top', rotation=90)
ax.set_ylabel('(ChemOnt level) Parent node', fontsize=15)
fig.savefig('Plots/MDI Feature Importances (w_threshold)', bbox_inches='tight', transparent=True)
plt.show()

In [None]:
# Turn every row of thresholded importances into integer ranks:
# rank 1 goes to the largest value, up to rank 10; whatever is left unranked becomes 11.
ranked_imp = []
for imp_class in importances_w_threshold:
    for rank in range(1, 11):
        highest_value = 0
        float_bool = False
        for i in imp_class:
            # Ranks already written into the row are ints >= 1, so `i<1` skips them;
            # only values strictly between highest_value and 1 can become the new maximum.
            if i<1 and i>highest_value:
                highest_value = i
            if type(i) is np.float64:  # At least one entry is still an np.float64 (not yet a rank): flag it
                float_bool = True
        if not float_bool:
            # No np.float64 entry is left in the row, so the entries holding the
            # previous rank (rank-1) are relabelled as the last rank (11)
            imp_class = list(map(lambda i: 11 if i==rank-1 else i, imp_class))
        # Every entry equal to the current maximum (ties included) receives this rank.
        # NOTE(review): when no remaining value is > 0, highest_value stays 0 and entries
        # equal to 0 receive this rank - confirm this is intended.
        imp_class = list(map(lambda i: rank if i==highest_value else i, imp_class))
    # Entries that are not plain Python ints after the 10 passes are mapped to 11
    ranked_imp.append(list(map(lambda i: 11 if type(i)!=int else i, imp_class)))

In [None]:
# Heatmap of the per-node importance ranks (1 = most important, 11 = unranked)
fig, ax = plt.subplots(figsize=(16, 30), dpi=800)
# Draw explicitly on `ax` instead of relying on the implicit current axes
sns.heatmap(ranked_imp, cmap=sns.color_palette('Blues_r', 11, as_cmap=True),
            xticklabels=importances_w_threshold_features, yticklabels=y,
            cbar_kws={"shrink": .40}, ax=ax)
ax.set_title("Mean Decrease in Gini Feature Importance")
ax.set_xlabel('Features')
ax.set_ylabel('(ChemOnt level) Parent node')
# Saved under Plots/ like the other two importance heatmaps
fig.savefig('Plots/MDI Feature Importances (Ranked)', bbox_inches='tight', transparent=True)
plt.show()

### Correlated features

In [None]:
# Print, for every stored node of every level, the features removed by the correlation filter
for level_results in feat_dic.values():
    for node_results in level_results.values():
        print(node_results['correlation (to drop)'])

### Selected features

In [None]:
# Export the final selected features of every node to a CSV table
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)

# Records are collected under their own name so that `df` only ever refers to the DataFrame
records = []
for key1, value1 in feat_dic.items():
    # Friendlier level name for the binary superclass classifiers
    level = 'Superclass (binary classifiers)' if key1 == 'Superclass_binary' else key1
    for key2, value2 in value1.items():
        records.append({'Level': level, 'Node': key2, 'Selected features': ", ".join(value2['selected features'])})
df = pd.DataFrame(records)
df.to_csv('Results/Selected_features.csv')

### N/C and Nitrogen correlation

In [None]:
# Reload the stored results; `with` closes the file handle after reading
with open('Results/selected_features.pkl', 'rb') as fh:
    feat_dic = pickle.load(fh)

In [None]:
# Report the nodes whose correlation filter removed a number of features other than 3
for key, value in feat_dic.items():
    for key2, value2 in value.items():
        if len(value2['correlation (to drop)']) == 3:
            continue
        print(key2)
        print(value2['MDI']['selected features'])
        print(value2['selected features'])
        print(value2['correlation (to drop)'])
        print()

In [None]:
%matplotlib inline
# Scatter of X_train column 4 against column -3 (labelled N/C and Nitrogen below)
x_values, y_values = X_train[:, 4], X_train[:, -3]
fig, ax = plt.subplots(figsize=(20, 10), dpi=800)
ax.plot(x_values, y_values, 'bo')
ax.set(xlabel='N/C', ylabel='Nitrogen')
plt.show()

In [None]:
# Scatter of X_train column 0 against column 1 (labelled Carbon and Hydrogen below)
x_values, y_values = X_train[:, 0], X_train[:, 1]
fig, ax = plt.subplots(figsize=(20, 20), dpi=800)
ax.plot(x_values, y_values, 'bo')
ax.set(xlabel='Carbon', ylabel='Hydrogen')
plt.show()

In [None]:
# Scatter of X_train column -15 against column 0 (labelled Mass and Carbon below)
x_values, y_values = X_train[:, -15], X_train[:, 0]
fig, ax = plt.subplots(figsize=(20, 20), dpi=800)
ax.plot(x_values, y_values, 'bo')
ax.set(xlabel='Mass', ylabel='Carbon')
plt.show()

In [None]:
# Scatter of X_train column -15 against column 1 (labelled Mass and Hydrogen below)
x_values, y_values = X_train[:, -15], X_train[:, 1]
fig, ax = plt.subplots(figsize=(20, 20), dpi=800)
ax.plot(x_values, y_values, 'bo')
ax.set(xlabel='Mass', ylabel='Hydrogen')
plt.show()