In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from sklearnex import patch_sklearn #Improves sklearn alghoritms performance
patch_sklearn()
import sklearn
print('scikit-learn version\n', sklearn.__version__)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,  f1_score, precision_score, classification_report, balanced_accuracy_score, confusion_matrix
import seaborn as sns
from aux_functions.name_2lines import name_2lines
from aux_functions.make_plots import plot_confusion_matrix, OOMFormatter
from aux_functions.prediction import get_class_w_prob
from itertools import product
import pickle
import os
from tqdm.notebook import tqdm
import dtale
import plotly.express as px

In [None]:
#Set seed
# NOTE(review): `seed` is not referenced in any of the cells visible here — confirm it is still needed.
seed=41

In [None]:
def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file handle afterwards."""
    # NOTE(review): unpickling can execute arbitrary code — only load trusted project artefacts.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


feature_names = _load_pickle('Dataset/feature_names.pkl')
X_test = _load_pickle('Dataset/X_test.pkl')
X_test_scal = _load_pickle('Dataset/X_test_scal.pkl')
y_test = _load_pickle('Dataset/y_test.pkl')
y_train = _load_pickle('Dataset/y_train.pkl')
all_categories = _load_pickle('Dataset/all_categories.pkl')  # all categories in the dataset, keyed by level
sc_u = _load_pickle('Dataset/SC_wunique_child.pkl')  # superclasses with a single child class
c_u = _load_pickle('Dataset/C_wunique_child.pkl')  # classes with a single child subclass

# Nodes present at each level of the hierarchy
kings = all_categories['Kingdom']
sclasses = all_categories['Superclass']
classes = all_categories['Class']
subclasses = all_categories['Subclass']

# For each level: the parent nodes whose children are predicted at that level ('classes'),
# and which of those parents have a single child and therefore no classifier ('uniques').
hierarchy = {
    'Kingdom': {'classes': ['Chemical entities'], 'uniques': []},
    'Superclass': {'classes': kings, 'uniques': []},
    'Class': {'classes': sclasses, 'uniques': sc_u},
    'Subclass': {'classes': classes, 'uniques': c_u},
}

### Join GSCV results from all multiclass classifiers into a DF

In [None]:
# Collect the grid-search CV scores of every multiclass classifier
# (one row per node / feature-selection mode / algorithm) into a single DataFrame.
results = []

for level in ['Kingdom', 'Superclass', 'Class', 'Subclass']:
    cls = hierarchy[level]['classes']
    uniques = hierarchy[level]['uniques']
    for i, class_ in enumerate(cls):
        if class_ in uniques:
            # Parent listed in 'uniques': skipped, no grid-search file is read for it
            continue
        if level == 'Kingdom':
            node_name = f'({level})'
            file_prefix = ''
        else:
            node_name = f'({level}) {class_}'
            # Model files of sub-levels are prefixed with the position of the parent node
            file_prefix = f'{i}_'
        for f_sel in ['all', 'sel']:
            for alg in ['RF', 'KNN', 'LR', 'SVM', 'NB']:
                # Context manager so the handle is closed right after loading
                with open(f'Models/{level}/{file_prefix}{alg}_{f_sel}_feat.pkl', 'rb') as model_file:
                    gs = pickle.load(model_file)
                cv_scores, best = gs.cv_results_, gs.best_index_
                results.append({'Node': node_name, 'Feature selection': f_sel, 'Algorithm': alg,
                                'F1-Score-Macro (Val)': cv_scores['mean_test_f1_macro'][best],
                                'F1-Score-Macro (Train)': cv_scores['mean_train_f1_macro'][best],
                                'F1-Score-Micro (Val)': cv_scores['mean_test_f1_micro'][best],
                                'F1-Score-Micro (Train)': cv_scores['mean_train_f1_micro'][best]})
results = pd.DataFrame(results)
results.to_pickle('Results/Multiclass_GSCV_scores.pkl')

### Class vs Subclass classifiers performance

In [None]:
# Test-set labels as a DataFrame with one column per hierarchy level, sorted level by level
h = pd.DataFrame(y_test, columns=['Kingdom', 'Superclass', 'Class', 'Subclass']).sort_values(by=['Kingdom', 'Superclass', 'Class', 'Subclass'])

In [None]:
# Mean number of distinct Classes per Superclass among the test labels
h.drop_duplicates(subset=['Superclass', 'Class'])['Superclass'].value_counts().mean()

In [None]:
# Mean number of distinct Subclasses per Class among the test labels
h.drop_duplicates(subset=['Class', 'Subclass'])['Class'].value_counts().mean()

In [None]:
# Mean number of test samples per Superclass
h['Superclass'].value_counts().mean()

In [None]:
# Mean number of test samples per Class
h['Class'].value_counts().mean()

## Compare algorithms' performance

In [None]:
# For every node, list the algorithm(s) reaching the best validation macro-F1;
# `count` accumulates them so that ties are credited to every tied algorithm.
results = pd.read_pickle('Results/Multiclass_GSCV_scores.pkl')
count = []
for node in results['Node'].drop_duplicates().values:
    print(node)
    filter_ = results[results['Node']==node]
    best_row = filter_.loc[filter_['F1-Score-Macro (Val)'].idxmax(), :]
    alg = best_row['Algorithm']
    score = best_row['F1-Score-Macro (Val)']
    # Every row of this node tied at the best score
    filter_score = filter_.loc[filter_['F1-Score-Macro (Val)']==score]
    tied_algorithms = filter_score['Algorithm'].drop_duplicates().values
    print(tied_algorithms)
    count.extend(tied_algorithms)

In [None]:
# Number of distinct nodes that have grid-search results
total = len(results['Node'].unique())

In [None]:
# Number of nodes for which each algorithm is (one of) the best; ties count once per tied algorithm
np.unique(count, return_counts=True)

In [None]:
# Percentage of nodes. NOTE(review): 95 is hard-coded — presumably a count copied from the
# np.unique(count, ...) output of an earlier run; confirm which algorithm it belongs to.
95*100/total

In [None]:
# Percentage of nodes. NOTE(review): 37 is hard-coded — presumably a count copied from the
# np.unique(count, ...) output of an earlier run; confirm which algorithm it belongs to.
37*100/total

In [None]:
# Percentage of nodes. NOTE(review): 24 is hard-coded — presumably a count copied from the
# np.unique(count, ...) output of an earlier run; confirm which algorithm it belongs to.
24*100/total

In [None]:
# Percentage of nodes. NOTE(review): 18 is hard-coded — presumably a count copied from the
# np.unique(count, ...) output of an earlier run; confirm which algorithm it belongs to.
18*100/total

In [None]:
# Percentage of nodes. NOTE(review): 7 is hard-coded — presumably a count copied from the
# np.unique(count, ...) output of an earlier run; confirm which algorithm it belongs to.
7*100/total

In [None]:
### NB is taken out because there is always another algorithm with the same score
### SVM does not return probability of prediction estimates
###Superclass - Inorganic
###75.13 - macro RF
###91.75 - micro RF
###76.13 - macro SVM
###93.72 - micro SVM
###Subclass
###Azolydines 0.8088/0.8061 (LR)
###Linear 1,3-diarylpropanoids 0.6030/0.5995 (KNN)
###Miscellaneous mixed metals/non-metals equal to the 4th decimal
###Triazines 0.6012/0.5935 (LR)

## Select classifiers

All algorithms are considered except NB and SVM: NB always has another algorithm with the same score, and SVM does not output prediction probabilities.
For nodes where SVM is the best algorithm and no other algorithm has the same score, the next best algorithm is chosen.
When choosing between tied algorithms, priority is given in the order RF, KNN, LR, to reduce heterogeneity among the selected classifiers.

In [None]:
# Pick, for every node, the best algorithm / feature-selection combination (by validation
# macro-F1), save its fitted best estimator, and aggregate the CV scores per level.
results = pd.read_pickle('Results/Multiclass_GSCV_scores.pkl')
# Only RF, KNN and LR are eligible
results = results[results['Algorithm'].isin(['RF', 'KNN', 'LR'])]
level_scores = {}
# Number of training samples carrying each category label (all levels flattened together)
category_size = pd.Series(y_train.flat).value_counts()
for level in ['Kingdom', 'Superclass', 'Class', 'Subclass']:
    cls = hierarchy[level]['classes']
    uniques = hierarchy[level]['uniques']
    print(level)
    # Running weighted sums and weights for the level-wide averages
    macro = 0
    macro_n = 0
    micro = 0
    micro_n = 0
    for i, class_ in enumerate(cls):
        if class_ in uniques:
            continue
        elif level == 'Kingdom':
            node_name = f'({level})'
            load_i = ''
            dump_i = ''
        else:
            node_name = f'({level}) {class_}'
            load_i = f'{i}_'
            dump_i = f'_{i}'
        print(class_)
        #Chooses the best algorithm/f_sel combination
        node_rows = results[results['Node']==node_name]
        score = node_rows.loc[node_rows['F1-Score-Macro (Val)'].idxmax(), 'F1-Score-Macro (Val)']
        filter_ = node_rows[node_rows['F1-Score-Macro (Val)']==score]
        #If there is more than one combination
        if len(filter_) > 1:
            #Priority to algorithms, by order RF, KNN, LR
            algs = filter_['Algorithm'].drop_duplicates().values
            for alg in ['RF', 'KNN', 'LR']:
                if alg in algs:
                    #Priority to no feature selection == 'all'
                    if 'all' in filter_[filter_['Algorithm']==alg]['Feature selection'].values:
                        f_sel = 'all'
                    else:
                        f_sel = 'sel'
                    print(alg, f_sel)
                    break
        else:
            alg = filter_.iloc[0].loc['Algorithm']
            f_sel = filter_.iloc[0].loc['Feature selection']

        # Context managers guarantee both files are closed (and the dump flushed) immediately
        with open(f'Models/{level}/{load_i}{alg}_{f_sel}_feat.pkl', 'rb') as model_file:
            gs = pickle.load(model_file)
        with open(f'Selected_classifiers/{level}{dump_i}.pkl', 'wb') as selected_file:
            pickle.dump(gs.best_estimator_, selected_file)

        scores = results[(results['Node']==node_name) & (results['Algorithm']==alg) & (results['Feature selection']==f_sel)].iloc[0]
        # Macro score is weighted by the number of child categories of the node
        n_children = len(gs.best_estimator_.classes_)
        macro += scores['F1-Score-Macro (Val)']*n_children
        macro_n += n_children
        # Micro score is weighted by the number of training samples under the node
        if class_ == 'Chemical entities':
            # The root is not a label in y_train: its size is the sum of both kingdoms
            n_samples = category_size['Organic compounds'] + category_size['Inorganic compounds']
        else:
            n_samples = category_size[class_]
        micro += scores['F1-Score-Micro (Val)']*n_samples
        micro_n += n_samples
    level_scores[level] = {'macro_f1_score':macro/macro_n, 'micro_f1_score':micro/micro_n}

In [None]:
# Performance of the selected classifiers at each level: weighted averages of the
# per-node cross-validation scores (one column per level, rows = macro / micro F1)
pd.DataFrame(level_scores)

## Validation on Test

### Load the classifiers

In [None]:
# Load every selected classifier together with the column indices of the features it expects.
# cfs[level][parent_node] -> (fitted classifier, list of feature indices)
cfs = {}
selected_features = None  # loaded once, and only if some classifier used feature selection
for level in ['Kingdom', 'Superclass', 'Class', 'Subclass']:
    print(level)
    cls = hierarchy[level]['classes']
    uniques = hierarchy[level]['uniques']
    cfs[level] = {}
    for i, class_ in tqdm(enumerate(cls), total=len(cls)):
        if class_ in uniques:
            continue
        file_suffix = '' if level == 'Kingdom' else f'_{i}'
        with open(f'Selected_classifiers/{level}{file_suffix}.pkl', 'rb') as f:
            cf = pickle.load(f)
        # NOTE(review): 133 is presumably the full feature count (len(feature_names)) — confirm.
        if cf.n_features_in_ != 133:
            if selected_features is None:
                with open('Results/selected_features.pkl', 'rb') as f:
                    selected_features = pickle.load(f)
            if level == 'Kingdom':
                sel_f = selected_features[level]['selected features']
            else:
                sel_f = selected_features[level][class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
        else:
            f_index = list(range(len(feature_names)))
        cfs[level][class_] = (cf, f_index)

### Predict

In [None]:
# Top-down hierarchical prediction on the test set: each level is predicted by the
# classifier of the parent node predicted at the previous level.
levels = ['Kingdom', 'Superclass', 'Class', 'Subclass']

#Kingdom prediction
cf, fi = cfs['Kingdom']['Chemical entities']
#first level prediction is joined into a df
#decision of scaling or not is done inside this function (based if it is RF or not)
y_pred = get_class_w_prob(cf, 'Kingdom', X_test[:, fi], X_test_scal[:, fi])

#Sublevels prediction
for i, level in enumerate(levels):
    if i==0:
        continue
    print(level)
    parent_pred_col = f'{levels[i-1]} pred'
    #classes that do not need classifier
    uniques = hierarchy[level]['uniques']
    #per-parent prediction frames of the whole level, concatenated once after the loops
    level_preds = []
    level_cfs = cfs[level]
    #iterate each parent node to make predictions on the whole level
    for class_ in tqdm(level_cfs.keys(), desc='Classifiers prediction'):
        cf, fi = level_cfs[class_]
        #index of the samples to be classified (the ones with the parent node predicted at previous level)
        index = y_pred[y_pred[parent_pred_col]==class_].index
        if len(index) == 0: #There is nothing to predict
            continue
        level_preds.append(get_class_w_prob(cf, level, X_test[index][:, fi], X_test_scal[index][:, fi], index=index))
    if len(uniques)>0:
        for class_ in tqdm(uniques, desc='Unique classes atribution'):
            index = y_pred[y_pred[parent_pred_col]==class_].index
            if len(index) == 0:
                # No sample reached this parent: skip the lookup below, which would raise
                # IndexError when the parent never occurs in y_test
                continue
            # NOTE(review): the only child is looked up in the test labels; this still fails
            # if a predicted parent is absent from y_test.
            class_pred = y_test[y_test[:,i-1]==class_][0][i] #Look up for the only class (might be unspecified or not)
            level_preds.append(pd.DataFrame([[class_pred, 1]]*len(index), columns=[f'{level} pred', f'{level} prob'], index=index))
    new_pred = pd.concat(level_preds, axis=0) if level_preds else pd.DataFrame()
    y_pred = y_pred.join(new_pred)
    #probability of a node = probability of its parent * local probability
    y_pred[f'{level} prob'] = y_pred[f'{levels[i-1]} prob']*y_pred[f'{level} prob']
y_pred.to_pickle('Results/test_validation/pred_df.pkl') #Results from prediction on the 4 levels

### Blocking

In [None]:
# Apply blocking: for each threshold, discard the prediction of a level (and of every level
# below it) whenever its probability is under the threshold.
y_pred = pd.read_pickle('Results/test_validation/pred_df.pkl') #Real prediction on test results
thresholded = []
for threshold in tqdm([0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]):
    res_thr = y_pred.copy()
    res_thr.loc[res_thr['Kingdom prob']<threshold] = np.nan
    res_thr.loc[res_thr['Superclass prob']<threshold, ['Superclass pred', 'Superclass prob', 'Class pred', 'Class prob', 'Subclass pred', 'Subclass prob']] = np.nan
    res_thr.loc[res_thr['Class prob']<threshold, ['Class pred', 'Class prob', 'Subclass pred', 'Subclass prob']] = np.nan
    res_thr.loc[res_thr['Subclass prob']<threshold, ['Subclass pred', 'Subclass prob']] = np.nan
    res_thr['Threshold'] = threshold
    # Columns 0, 2, 4, 6 are read as the four '<level> pred' columns
    # (assumes pred/prob columns alternate in y_pred — TODO confirm)
    res_thr['Number predicted levels'] = res_thr.iloc[:, [0, 2, 4, 6]].count(axis=1)
    thresholded.append(res_thr.reset_index().rename(columns={'index':'sample index'}))
# Single concatenation instead of growing the frame inside the loop
block_df = pd.concat(thresholded, ignore_index=True)
#Join true classes to the DF
block_df = block_df.join(pd.DataFrame(y_test, columns=['Kingdom', 'Superclass', 'Class', 'Subclass']), on='sample index')
block_df.to_pickle('Results/test_validation/pred_w_blocking.pkl')

### Number of compounds by leaf level after blocking

In [None]:
%matplotlib inline
# Bar chart of how many levels are predicted per sample, for every threshold >= 0.5,
# with each bar annotated by its share of the samples at that threshold.
block_df = pd.read_pickle('Results/test_validation/pred_w_blocking.pkl')
sns.set_style('white')

rcParams['figure.figsize'] = 16, 10
rcParams['figure.dpi'] = 800
rcParams['axes.titlesize'] = 20
rcParams['xtick.labelsize'] = 15
rcParams['ytick.labelsize'] = 13
rcParams['axes.labelsize'] = 15
rcParams['font.size'] = 10

ax = sns.countplot(x="Number predicted levels", hue='Threshold', data=block_df[block_df['Threshold']>=0.5], 
                   palette=sns.color_palette("Blues"), dodge=True)

class_order = [0, 1, 2, 3, 4] 
hue_order = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
# Pairs each drawn bar with a (threshold, number of levels) combination.
# NOTE(review): assumes ax.patches is ordered threshold-major with one patch per
# combination — confirm for the installed seaborn version.
bar_order = product(hue_order, class_order)
spots = zip(ax.patches, bar_order)

for bar, (bar_thr, n_levels) in spots:
    # Named so that it does not overwrite the notebook-level `total` (number of nodes)
    n_thr_samples = len(block_df[block_df['Threshold']==bar_thr])
    level_total = len(block_df[(block_df['Number predicted levels']==n_levels) & 
        (block_df['Threshold']==bar_thr)])
    height = bar.get_height()
    if np.isnan(height):
        height=0
    ax.text(bar.get_x()+0.07, height+ 1e3, '{:1.0f}%'.format(level_total*100/n_thr_samples), ha='center', va='center')

ax.set_ylabel('Number of samples')
ax.set_xlabel('')
ax.set_xticklabels(['No prediction', 'Kingdom', 'Kingdom &\n Superclass', 'Kingdom, Superclass,\n& Class', 
                    'Kingdom, Superclass,\nClass, & Subclass'])
ax.set(yscale='linear')
ax.yaxis.set_major_formatter(OOMFormatter(3, '%1.0f'))
plt.legend(title='Threshold', fontsize='large', loc='upper left')

plt.savefig(f'Plots/Test prediction(n of levels).png', bbox_inches='tight', transparent=True, dpi=800)
plt.show()

In [None]:
block_df = pd.read_pickle('Results/test_validation/pred_w_blocking.pkl')
# Number of compounds predicted down to Subclass for which the Class and/or Subclass was not
# produced by a classifier but assigned directly because the predicted parent has a single child.
for thr in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]:
    print('Threshold - ', thr)
    thr_results = block_df[block_df['Threshold']==thr]
    thr_results = thr_results[thr_results['Number predicted levels']==4]
    # Vectorised membership test; also prints 0 (not an empty Series) when no row matches
    a = thr_results['Class pred'].isin(list(c_u)) | thr_results['Superclass pred'].isin(list(sc_u))
    print(a.sum())

In [None]:
##Number of compounds with Class leaf prediction not using an actual classifier for Class prediction, 
#from direct interpolation from parents with unique childs
#0 thr not included because prediction will never end on Class level
for thr in [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]:
    print('Threshold - ', thr)
    thr_results = block_df[block_df['Threshold']==thr]
    # Compounds whose prediction stops at the Class level
    thr_results = thr_results[thr_results['Number predicted levels']==3]
    # Vectorised membership test; also prints 0 (not an empty Series) when no row matches
    a = thr_results['Superclass pred'].isin(list(sc_u))
    print(a.sum())

#number increases and then decreases because this is restricted to compounds with a leaf on Class level. Increasing
#threshold will make more difficult for compounds to be further predicted and pass 0.7, confidence of prediction of 
# these compounds is progressively lost

### Compute scores and classification reports

When blocking is applied, samples without a prediction at a given level are filtered out before that level is scored.

In [None]:
# Micro/macro F1 and per-category classification reports for every level and threshold,
# computed only on the samples that still have a prediction at that level.
block_df = pd.read_pickle('Results/test_validation/pred_w_blocking.pkl')
scores = []
clf_reports = []
# Reference for the coverage percentage: samples predicted on all 4 levels without blocking
total = len(block_df[(block_df['Threshold']==0) & (block_df['Number predicted levels']==4)])
for i, level in enumerate(['Kingdom', 'Superclass', 'Class', 'Subclass']):
    for thr in tqdm([0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95], desc=f'Threshold lvl {i+1}'):
        results_thr = block_df[block_df['Threshold']==thr]
        has_pred = results_thr[level + ' pred'].notna()
        y_pred = results_thr.loc[has_pred, level + ' pred']
        y_true = results_thr.loc[has_pred, level]
        clf_report = classification_report(y_true, y_pred, labels=all_categories[level], output_dict=True)
        # The report holds either an 'accuracy' entry or a 'micro avg' entry; test for the
        # key explicitly rather than hiding unrelated errors behind a bare except.
        if 'accuracy' in clf_report:
            micro = clf_report['accuracy']
        else:
            micro = clf_report['micro avg']['f1-score']
        n = len(block_df[(block_df['Threshold']==thr) & (block_df['Number predicted levels']>=i+1)])
        scores.append({'Level': level, 'Threshold': thr,
                        'F1 score (micro)': np.round(micro, 4),
                        'F1 score (macro)': np.round(clf_report['macro avg']['f1-score'], 4),
                      'Compound coverage': f'{n} ({np.round(n*100/total, 1)}%)'})
        # Long format: one row per (category, metric); the aggregate entries are skipped
        for key, value in clf_report.items():
            if key in ['accuracy', 'macro avg', 'weighted avg', 'micro avg']:
                continue
            for key2, value2 in value.items():
                clf_reports.append({'Level':level, 'Threshold': thr, 'Categorie':key, 'metric_name':key2, 'value':value2})
pd.DataFrame(scores).to_pickle('Results/test_validation/scores.pkl')
pd.DataFrame(clf_reports).to_pickle('Results/test_validation/classification_report.pkl')

In [None]:
# Reload the per-level, per-threshold scores from disk and display them
test_scores = pd.read_pickle('Results/test_validation/scores.pkl')
test_scores

In [None]:
# Pivot the scores to one row per threshold, with three columns per level
# (macro F1, micro F1, compound coverage), and export the table to Excel.
df = {thr: {} for thr in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]}
for _, score_row in test_scores.iterrows():
    thr_entry = df[score_row['Threshold']]
    lvl = score_row['Level']
    thr_entry[f'{lvl} - macro'] = score_row['F1 score (macro)']
    thr_entry[f'{lvl} - micro'] = score_row['F1 score (micro)']
    thr_entry[f'{lvl} - compound coverage'] = score_row['Compound coverage']
df = pd.DataFrame.from_dict(df, orient='index')
df.to_excel('Results/test_validation/scores.xlsx')

In [None]:
# Display the flattened scores table (one row per threshold)
df

### Confusion matrices

In [None]:
# Reload the predictions with blocking (all thresholds stacked, true labels joined)
block_df = pd.read_pickle('Results/test_validation/pred_w_blocking.pkl')

In [None]:
# Confusion matrix of the Kingdom predictions without blocking (threshold 0)
level = 'Kingdom'
results_thr = block_df.loc[block_df['Threshold'] == 0]
y_true, y_pred = results_thr[level], results_thr[f'{level} pred']

plot_confusion_matrix(
    y_true, y_pred, all_categories['Kingdom'],
    annot=True, font_scale=0.5, annot_size=6, dpi=800, figsize=(2*1.2, 2),
    title='Kingdom level\n(no blocking)', title_size=8, ticklabels_size=4,
    xticklabel_rotation=0, yticklabel_rotation=0, label_size=6,
    x_ha='center', x_va='top', y_ha='right', y_va='center',
    cbar_kws={"shrink": 1, "pad": 0.02},
    save_dir='Plots/ConfusionM/Kingdom.png',
)

In [None]:
# Confusion matrix of the Kingdom predictions kept at threshold 0.95
level = 'Kingdom'
results_thr = block_df.loc[block_df['Threshold'] == 0.95]
# Only the samples that still have a prediction at this level are compared
has_pred = results_thr[f'{level} pred'].notna()
y_true = results_thr.loc[has_pred, level]
y_pred = results_thr.loc[has_pred, f'{level} pred']

plot_confusion_matrix(
    y_true, y_pred, all_categories['Kingdom'],
    annot=True, font_scale=0.5, annot_size=6, dpi=800, figsize=(2*1.2, 2),
    title='Kingdom level\n(threshold=0.95)', title_size=8, ticklabels_size=4,
    xticklabel_rotation=0, yticklabel_rotation=0, label_size=6,
    x_ha='center', x_va='top', y_ha='right', y_va='center',
    cbar_kws={"shrink": 1, "pad": 0.02},
    save_dir='Plots/ConfusionM/Kingdom_thr=0.95.png',
)

In [None]:
# Confusion matrix of the Superclass predictions without blocking (threshold 0)
level = 'Superclass'
results_thr = block_df.loc[block_df['Threshold'] == 0]
y_true, y_pred = results_thr[level], results_thr[f'{level} pred']

plot_confusion_matrix(
    y_true, y_pred, all_categories['Superclass'],
    annot=True, font_scale=0.5, annot_size=3, dpi=800, figsize=(7, 5),
    title='Superclass level\n(no blocking)', title_size=8, ticklabels_size=4,
    xticklabel_rotation=90, yticklabel_rotation=0, label_size=4,
    x_ha='center', x_va='top', y_ha='right', y_va='center',
    cbar_kws={'shrink': 0.5, 'pad': 0.01},
    save_dir='Plots/ConfusionM/Superclass.png',
)

In [None]:
# Confusion matrix of the Superclass predictions kept at threshold 0.95
level = 'Superclass'
results_thr = block_df.loc[block_df['Threshold'] == 0.95]
# Only the samples that still have a prediction at this level are compared
has_pred = results_thr[f'{level} pred'].notna()
y_true = results_thr.loc[has_pred, level]
y_pred = results_thr.loc[has_pred, f'{level} pred']

plot_confusion_matrix(
    y_true, y_pred, all_categories['Superclass'],
    annot=True, font_scale=0.5, annot_size=3, dpi=800, figsize=(7, 5),
    title='Superclass level\n(threshold=0.95)', title_size=8, ticklabels_size=4,
    xticklabel_rotation=90, yticklabel_rotation=0, label_size=4,
    x_ha='center', x_va='top', y_ha='right', y_va='center',
    cbar_kws={'shrink': 0.5, 'pad': 0.01},
    save_dir='Plots/ConfusionM/Superclass_thr=0.95.png',
)

In [None]:
# Confusion matrix of the Superclass predictions kept at threshold 0.5
level = 'Superclass'
results_thr = block_df.loc[block_df['Threshold'] == 0.5]
# Only the samples that still have a prediction at this level are compared
has_pred = results_thr[f'{level} pred'].notna()
y_true = results_thr.loc[has_pred, level]
y_pred = results_thr.loc[has_pred, f'{level} pred']

plot_confusion_matrix(
    y_true, y_pred, all_categories['Superclass'],
    annot=True, font_scale=0.5, annot_size=3, dpi=800, figsize=(7, 5),
    title='Superclass level\n(threshold=0.5)', title_size=8, ticklabels_size=4,
    xticklabel_rotation=90, yticklabel_rotation=0, label_size=4,
    x_ha='center', x_va='top', y_ha='right', y_va='center',
    cbar_kws={'shrink': 0.5, 'pad': 0.01},
    save_dir='Plots/ConfusionM/Superclass_thr=0.5.png',
)

Confusion matrices for the Class and Subclass levels would be too large to display, so per-category precision and recall are plotted instead.

In [None]:
# Long-format classification report (one row per level / threshold / category / metric).
# pd.read_pickle opens and closes the file itself, unlike pickle.load(open(...)).
clf_reports = pd.read_pickle('Results/test_validation/classification_report.pkl')

### Class level

In [None]:
# Precision/recall bar charts of the Class level (no blocking), split over several tall figures.
sns.set_theme(style='whitegrid')

rcParams.update({
    'figure.figsize': (3.5/2.54, 42/2.54),  # centimetres converted to inches
    'figure.dpi': 800,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 7,
    'font.size': 7.5,
})

# Rows shared by every figure: threshold 0, Class level, precision and recall only
class_pr = clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') &
                       (clf_reports['metric_name'].isin(['precision', 'recall']))]

for i, classes_parts in enumerate(np.split(classes, [52, 104, 156, 208, 260])):
    data = class_pr[class_pr['Categorie'].isin(classes_parts)]
    ax = sns.barplot(x="value", y="Categorie", hue='metric_name', data=data)
    ax.set(ylabel=None, xlabel=None)
    # Tick labels follow classes_parts, so the bars are expected to appear in that same order
    ax.set_yticklabels(name_2lines(classes_parts, 30), ha='right', va='center')
    ax.set_xticks([0, 0.5, 1])
    ax.set(xlim=(0, 1))
    sns.move_legend(ax, "lower center", bbox_to_anchor=(0, 1), ncol=2, title=None, fontsize=9)
    plt.savefig(f'Plots/Class_test_classf_report_{i}', bbox_inches='tight', transparent=True, dpi=800)
    plt.show()

### No blocking

In [None]:
# Total number of Class-level categories in the dataset
len(classes)

#### number of classes without any prediction

In [None]:
# Number of Class-level categories whose f1-score is exactly 0 without blocking (threshold 0)
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') & 
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']==0)])

Of which inorganic

In [None]:
# Distinct Class labels (column 2 of y_test) of the test samples whose Kingdom is 'Inorganic compounds'
ino_classes = np.unique(y_test[y_test[:, 0]=='Inorganic compounds'][:, 2])
len(ino_classes)

In [None]:
# Inorganic Class-level categories whose f1-score is exactly 0 without blocking
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') & 
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']==0) & (clf_reports['Categorie'].isin(ino_classes))])

#### number of classes with recall lower than 0.2 and different than 0

In [None]:
# Class-level categories with 0 < recall <= 0.2 without blocking. The zero-recall categories
# are excluded by the filter itself instead of subtracting a count (74) copied from another cell.
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') &
                      (clf_reports['metric_name']=='recall') &
                      (clf_reports['value']>0) & (clf_reports['value']<=0.2)])

of which inorganic

In [None]:
# Inorganic Class-level categories with 0 < f1-score <= 0.2 without blocking. The zero-score
# categories are excluded by the filter itself instead of subtracting a hard-coded count (2).
# NOTE(review): the section heading refers to recall but this cell filters on f1-score — confirm.
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') &
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']>0) & (clf_reports['value']<=0.2) &
                      (clf_reports['Categorie'].isin(ino_classes))])

#### number of classes with recall higher than 0.7

In [None]:
# Number of Class-level categories with recall >= 0.7 without blocking
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') & 
                      (clf_reports['metric_name']=='recall') &
                      (clf_reports['value']>=0.7)])

In [None]:
# Inorganic Class-level categories with f1-score >= 0.7 without blocking
# NOTE(review): the section heading refers to recall but this cell filters on f1-score — confirm.
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Class') & 
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']>=0.7) & (clf_reports['Categorie'].isin(ino_classes))])

### Blocking

#### number of classes without any prediction (0.5 thr)

In [None]:
# Number of Class-level categories whose f1-score is exactly 0 at threshold 0.5
len(clf_reports[(clf_reports['Threshold']==0.5) & (clf_reports['Level']=='Class') & 
                (clf_reports['metric_name']=='f1-score') &
                (clf_reports['value']==0)])

#### number of classes without any prediction (0.95 thr)

In [None]:
# Number of Class-level categories whose f1-score is exactly 0 at threshold 0.95
len(clf_reports[(clf_reports['Threshold']==0.95) & (clf_reports['Level']=='Class') & 
                (clf_reports['metric_name']=='f1-score') &
                (clf_reports['value']==0)])

### Subclass level

In [None]:
# Subclass-level categories whose f1-score is exactly 0 without blocking
no_classified = clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') & (clf_reports['metric_name'] == 'f1-score') & (clf_reports['value']==0)]['Categorie'].drop_duplicates().values

In [None]:
# Subclass-level categories with a non-zero f1-score without blocking (the ones plotted below)
classified = clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') & (clf_reports['metric_name'] == 'f1-score') & (clf_reports['value']!=0)]['Categorie'].drop_duplicates().values

In [None]:
# Number of Subclass-level categories with f1-score == 0
len(no_classified)

In [None]:
# Number of Subclass-level categories with f1-score != 0
len(classified)

Take out all classes with score==0 to decrease figures size

In [None]:
# Precision/recall bar charts of the Subclass level (no blocking), restricted to the
# categories in `classified` and split over several tall figures.
sns.set_theme(style='whitegrid')

rcParams.update({
    'figure.figsize': (3.5/2.54, 42/2.54),  # centimetres converted to inches
    'figure.dpi': 800,
    'axes.titlesize': 12,
    'xtick.labelsize': 9,
    'ytick.labelsize': 7,
    'font.size': 7.5,
})

# Rows shared by every figure: threshold 0, Subclass level, precision and recall only
subclass_pr = clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') &
                          (clf_reports['metric_name'].isin(['precision', 'recall']))]

for i, classes_parts in enumerate(np.split(classified, [59, 118, 177, 236, 295, 354, 413, 472])):
    data = subclass_pr[subclass_pr['Categorie'].isin(classes_parts)]
    ax = sns.barplot(x="value", y="Categorie", hue='metric_name', data=data)
    ax.set(ylabel=None, xlabel=None)
    # Tick labels follow classes_parts, so the bars are expected to appear in that same order
    ax.set_yticklabels(name_2lines(classes_parts, 30), ha='right', va='center')
    ax.set_xticks([0, 0.5, 1])
    ax.set(xlim=(0, 1))
    sns.move_legend(ax, "lower center", bbox_to_anchor=(0, 1), ncol=2, title=None, fontsize=9)
    plt.savefig(f'Plots/Subclass_test_classf_report_{i}', bbox_inches='tight', transparent=True, dpi=800)
    plt.show()

#### Analysis

##### No blocking

#### number of classes without any prediction

In [None]:
# Total number of Subclass-level categories in the dataset
len(subclasses)

In [None]:
# Number of Subclass-level categories whose f1-score is exactly 0 without blocking (threshold 0)
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') & 
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']==0)])

Of which inorganic

In [None]:
# Distinct Subclass labels (column 3 of y_test) of the test samples whose Kingdom is 'Inorganic compounds'
ino_subclasses = np.unique(y_test[y_test[:, 0]=='Inorganic compounds'][:, 3])
len(ino_subclasses)

In [None]:
# Inorganic Subclass-level categories whose f1-score is exactly 0 without blocking
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') & 
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']==0) & (clf_reports['Categorie'].isin(ino_subclasses))])

#### number of classes with recall lower than 0.2 and different than 0

In [None]:
# Subclass-level categories with 0 < recall <= 0.2 without blocking. The zero-recall categories
# are excluded by the filter itself instead of subtracting a count (192) copied from another cell.
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') &
                      (clf_reports['metric_name']=='recall') &
                      (clf_reports['value']>0) & (clf_reports['value']<=0.2)])

of which inorganic

In [None]:
# Inorganic Subclass-level categories with 0 < f1-score <= 0.2 without blocking. The zero-score
# categories are excluded by the filter itself instead of subtracting a hard-coded count (5).
# NOTE(review): the section heading refers to recall but this cell filters on f1-score — confirm.
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') &
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']>0) & (clf_reports['value']<=0.2) &
                      (clf_reports['Categorie'].isin(ino_subclasses))])

#### number of classes with recall higher than 0.7

In [None]:
# Number of Subclass-level categories with recall >= 0.7 without blocking
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') & 
                      (clf_reports['metric_name']=='recall') &
                      (clf_reports['value']>=0.7)])

of which inorganic

In [None]:
# Inorganic Subclass-level categories with f1-score >= 0.7 without blocking
# NOTE(review): the section heading refers to recall but this cell filters on f1-score — confirm.
len(clf_reports[(clf_reports['Threshold']==0) & (clf_reports['Level']=='Subclass') & 
                      (clf_reports['metric_name']=='f1-score') &
                      (clf_reports['value']>=0.7) & (clf_reports['Categorie'].isin(ino_subclasses))])

### Blocking

#### number of classes without any prediction (0.5 thr)

In [None]:
# Number of Subclass-level categories whose f1-score is exactly 0 at threshold 0.5
len(clf_reports[(clf_reports['Threshold']==0.5) & (clf_reports['Level']=='Subclass') & 
                (clf_reports['metric_name']=='f1-score') &
                (clf_reports['value']==0)])

#### number of classes without any prediction (0.95 thr)

In [None]:
# Number of Subclass-level categories whose f1-score is exactly 0 at threshold 0.95
len(clf_reports[(clf_reports['Threshold']==0.95) & (clf_reports['Level']=='Subclass') & 
                (clf_reports['metric_name']=='f1-score') &
                (clf_reports['value']==0)])

## Biological Validation

In [None]:
# Load the reference database, the two MS peak tables and the fitted scaler.
db = pd.read_pickle('../Dataset_Preprocessed.pkl')
yeast_data = pd.read_excel('Results/bio_validation/Yeast MS data.xlsx')
fing_data = pd.read_excel('Results/bio_validation/Fingerprint MS data.xlsx')
# Context manager so the scaler file handle is closed right after loading
with open('Dataset/fitted_scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
# 'Bucket label': keep the first whitespace-separated token, turn the decimal comma into a
# point and parse it as float; the resulting number becomes the index of the peak table.
yeast_data['Bucket label'] = yeast_data['Bucket label'].str.split(expand=True)[0].str.replace(',', '.').astype(float)
yeast_data = yeast_data.set_index('Bucket label')
fing_data['Bucket label'] = fing_data['Bucket label'].str.split(expand=True)[0].str.replace(',', '.').astype(float)
fing_data = fing_data.set_index('Bucket label')

In [None]:
# Presence table: one boolean column per yeast sample, one row per peak.
# A peak counts as present in a sample when fewer than 2 of that sample's replicate columns
# are 0 (i.e. at least 2 replicates, assuming 3 replicates per sample — TODO confirm).
sample_presence = {}
for sample in ['BY0', 'GRE3', 'ENO1', 'GLO1', 'GLO2']:
    replicate_cols = yeast_data.columns[yeast_data.columns.str.startswith(sample)]
    # Vectorised count of zero-intensity replicates per peak
    sample_presence[sample] = yeast_data[replicate_cols].eq(0).sum(axis=1) < 2
yeast_df = pd.DataFrame(sample_presence)
# Keep the peaks present in at least one sample; copy so later cells can add columns safely
yeast_df = yeast_df[yeast_df.any(axis=1)].copy()
yeast_df

In [None]:
# Annotate every yeast peak with the database entries closest in mass (within 1 ppm).
all_matches = []
for peak in tqdm(yeast_df.index, total=len(yeast_df)):
    matches = []
    # Relative deviation between every database mass and the peak, in parts per million
    ppm_dev = abs((db['Mass']-peak)/peak)*10**6
    ppm_dev_below_1 = ppm_dev[ppm_dev<1]
    # NaN when no database entry lies within 1 ppm
    min_ = ppm_dev_below_1.min()
    if not np.isnan(min_):
        # All database rows tied at the smallest deviation
        ppm_dev_min = ppm_dev[ppm_dev==min_].index
        # NOTE(review): redundant, `matches` was already reset at the top of the loop
        matches = []
        # Columns from position 6 onward of the first best match — presumably the model features; TODO confirm
        features = db.loc[ppm_dev_min[0], :].iloc[6:]
        # NOTE(review): writes through .loc with feature column labels one peak at a time
        yeast_df.loc[peak, features.index] = features.values
        # Distinct taxonomies (Kingdom..Subclass) among the tied best matches
        for i, row in db.loc[ppm_dev_min, ['Kingdom', 'Superclass', 'Class', 'Subclass']].drop_duplicates().iterrows():
            matches.append([row['Kingdom'], row['Superclass'], row['Class'], row['Subclass']])
        all_matches.append(matches)
    else:
        all_matches.append(np.nan)
yeast_df.loc[:, 'Matches'] = pd.Series(all_matches, index=yeast_df.index)
# Move 'Matches' to column position 5
yeast_df.insert(5, 'Matches', yeast_df.pop('Matches'))
# Drops every row containing a NaN, which includes the peaks without a match
yeast_df = yeast_df.dropna()
yeast_df.to_pickle('Results/bio_validation/yeast_annotation.pkl')

In [None]:
# Presence table: one boolean column per fingerprint sample, one row per peak.
# A peak counts as present in a sample when fewer than 2 of that sample's replicate columns
# are 0 (i.e. at least 2 replicates, assuming 3 replicates per sample — TODO confirm).
sample_presence = {}
for sample in ['V1', 'V2', 'V3', 'V4', 'V5', 'V6']:
    replicate_cols = fing_data.columns[fing_data.columns.str.startswith(sample)]
    # Vectorised count of zero-intensity replicates per peak
    sample_presence[sample] = fing_data[replicate_cols].eq(0).sum(axis=1) < 2
fing_df = pd.DataFrame(sample_presence)
# Keep the peaks present in at least one sample; copy so later cells can add columns safely
fing_df = fing_df[fing_df.any(axis=1)].copy()
fing_df

In [None]:
# Annotate every peak with the database entries closest in mass (deviation below 1 ppm)
taxonomy_cols = ['Kingdom', 'Superclass', 'Class', 'Subclass']
all_matches = []
for peak in tqdm(fing_df.index, total=len(fing_df)):
    ppm_dev = abs((db['Mass']-peak)/peak)*10**6
    best_dev = ppm_dev[ppm_dev<1].min()
    if np.isnan(best_dev):
        # no database mass lies within 1 ppm of this peak
        all_matches.append(np.nan)
        continue
    closest = ppm_dev[ppm_dev==best_dev].index
    # feature values (db columns from position 6 on) come from the first closest entry
    features = db.loc[closest[0], :].iloc[6:]
    fing_df.loc[peak, features.index] = features.values
    # distinct classifications among all equally close entries
    all_matches.append(db.loc[closest, taxonomy_cols].drop_duplicates().values.tolist())
fing_df.loc[:, 'Matches'] = pd.Series(all_matches, index=fing_df.index)
# place 'Matches' right after the 6 sample columns
fing_df.insert(6, 'Matches', fing_df.pop('Matches'))
# rows with any missing value (this includes unmatched peaks) are discarded
fing_df = fing_df.dropna()
fing_df.to_pickle('Results/bio_validation/fing_annotation.pkl')

In [None]:
# Reload the annotation tables from the pickles written by the two matching cells
yeast_df = pd.read_pickle('Results/bio_validation/yeast_annotation.pkl')
fing_df = pd.read_pickle('Results/bio_validation/fing_annotation.pkl')

In [None]:
# Number of annotated peaks flagged in each yeast sample, then the overall row count
for sample in ['BY0', 'GRE3', 'ENO1', 'GLO1', 'GLO2']:
    flagged = yeast_df[sample] == True
    print(flagged.sum())
print(yeast_df.shape[0])

In [None]:
# Number of annotated peaks flagged in each fingerprint sample, then the overall row count
for sample in ['V1', 'V2', 'V3', 'V4', 'V5', 'V6']:
    flagged = fing_df[sample] == True
    print(flagged.sum())
print(fing_df.shape[0])

### Make prediction

### Load the classifiers

In [None]:
# Build cfs[level][parent class] = (classifier, indices of the feature columns it uses)
N_ALL_FEATURES = 133  # a classifier with this many inputs is given every feature column
selected_features = None  # loaded lazily and only once, if some classifier needs it
cfs = {}
for level in ['Kingdom', 'Superclass', 'Class', 'Subclass']:
    print(level)
    cls = hierarchy[level]['classes']
    uniques = hierarchy[level]['uniques']
    cfs[level] = {}
    for i, class_ in tqdm(enumerate(cls), total=len(cls)):
        if class_ in uniques:
            # parent with a single child: it has no classifier to load
            continue
        # the Kingdom classifier file has no numeric suffix; the others are numbered by position
        suffix = '' if level == 'Kingdom' else f'_{i}'
        with open(f'Selected_classifiers/{level}{suffix}.pkl', 'rb') as f:
            cf = pickle.load(f)
        if cf.n_features_in_ != N_ALL_FEATURES:
            if selected_features is None:
                # previously reopened (and never closed) for every classifier
                with open('Results/selected_features.pkl', 'rb') as f:
                    selected_features = pickle.load(f)
            if level == 'Kingdom':
                sel_f = selected_features[level]['selected features']
            else:
                sel_f = selected_features[level][class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
        else:
            f_index = list(range(len(feature_names)))
        cfs[level][class_] = (cf, f_index)

### Predict

##### Yeast

In [None]:
df = pd.read_pickle('Results/bio_validation/yeast_annotation.pkl')
# every column from position 6 on is passed to the classifiers as a feature
X = df.iloc[:, 6:].values
X_scal = scaler.transform(X)

levels = ['Kingdom', 'Superclass', 'Class', 'Subclass']

#Kingdom prediction
cf, fi = cfs['Kingdom']['Chemical entities']
#first level prediction is joined into a df
#decision of scaling or not is done inside this function (based if it is RF or not)
y_pred = get_class_w_prob(cf, 'Kingdom', X[:, fi], X_scal[:, fi])

#Sublevels prediction
for i, level in enumerate(levels):
    if i==0:
        continue
    print(level)
    parent_level = levels[i-1]
    #classes that do not need classifier
    uniques = hierarchy[level]['uniques']
    #per-parent predictions of this level, concatenated once after the loops
    level_preds = []
    #iterate each parent node to make predictions on the whole level
    for class_, (cf, fi) in tqdm(cfs[level].items(), desc='Classifiers prediction'):
        #index of the samples to be classified (the ones with the parent node predicted at previous level)
        index = y_pred[y_pred[f'{parent_level} pred']==class_].index
        #the emptiness check used to index X_test, an unrelated array, with these indices
        if len(index) == 0: #There is nothing to predict
            continue
        level_preds.append(get_class_w_prob(cf, level, X[index][:, fi], X_scal[index][:, fi], index=index))
    if len(uniques)>0:
        for class_ in tqdm(uniques, desc='Unique classes atribution'):
            index = y_pred[y_pred[f'{parent_level} pred']==class_].index
            class_pred = y_test[y_test[:,i-1]==class_][0][i] #Look up for the only class (might be unspecified or not)
            level_preds.append(pd.DataFrame([[class_pred, 1]]*len(index), columns=[f'{level} pred', f'{level} prob'], index=index))
    new_pred = pd.concat(level_preds, axis=0) if level_preds else pd.DataFrame()
    y_pred = y_pred.join(new_pred)
    #probability of a level is multiplied by the probability of its parent level
    y_pred[f'{level} prob'] = y_pred[f'{parent_level} prob'].astype(float) * y_pred[f'{level} prob'].astype(float)

In [None]:
#Join pred to original DF
#(.values is assigned by position, so y_pred rows must be in the same order as df rows)
df.loc[:, y_pred.columns] = y_pred.values

In [None]:
# Choose, for every peak, which of its candidate database classifications is
# stored in the Kingdom/Superclass/Class/Subclass columns
true_cols = ['Kingdom', 'Superclass', 'Class', 'Subclass']
pred_cols = [f'{col} pred' for col in true_cols]
for _, row in df.iterrows():
    pred = row[pred_cols].values
    matches = np.array(row['Matches'])
    if np.shape(matches)[0] == 1:
        # a single candidate: nothing to choose
        df.loc[row.name, true_cols] = matches[0]
        continue
    exact = [match for match in matches if np.array_equal(match, pred)]
    if exact:
        # An exact match wins. Previously `continue` only advanced the inner loop,
        # so the fallback below still ran and could overwrite the exact match.
        df.loc[row.name, true_cols] = exact[0]
        continue
    # Otherwise take the first candidate agreeing with the prediction at the
    # deepest possible level: Class, then Superclass, then Kingdom
    for lvl in [2, 1, 0]:
        agrees = matches[:, lvl] == pred[lvl]
        if np.any(agrees):
            df.loc[row.name, true_cols] = matches[agrees][0]
            break
# info() prints its own summary and returns None, so it is not wrapped in print()
df[true_cols].info()
df = df[['BY0', 'GRE3', 'ENO1', 'GLO1', 'GLO2', 'Kingdom', 'Kingdom pred', 'Kingdom prob', 
                 'Superclass', 'Superclass pred', 'Superclass prob', 'Class', 'Class pred', 'Class prob', 
                 'Subclass', 'Subclass pred', 'Subclass prob']]
df = df.reset_index()
df.to_pickle('Results/bio_validation/yeast_pred_df.pkl')

##### Fingerprint

In [None]:
df = pd.read_pickle('Results/bio_validation/fing_annotation.pkl')
# every column from position 7 on is passed to the classifiers as a feature
X = df.iloc[:, 7:].values
X_scal = scaler.transform(X)

levels = ['Kingdom', 'Superclass', 'Class', 'Subclass']

#Kingdom prediction
cf, fi = cfs['Kingdom']['Chemical entities']
#first level prediction is joined into a df
#decision of scaling or not is done inside this function (based if it is RF or not)
y_pred = get_class_w_prob(cf, 'Kingdom', X[:, fi], X_scal[:, fi])

#Sublevels prediction
for i, level in enumerate(levels):
    if i==0:
        continue
    print(level)
    parent_level = levels[i-1]
    #classes that do not need classifier
    uniques = hierarchy[level]['uniques']
    #per-parent predictions of this level, concatenated once after the loops
    level_preds = []
    #iterate each parent node to make predictions on the whole level
    for class_, (cf, fi) in tqdm(cfs[level].items(), desc='Classifiers prediction'):
        #index of the samples to be classified (the ones with the parent node predicted at previous level)
        index = y_pred[y_pred[f'{parent_level} pred']==class_].index
        #the emptiness check used to index X_test, an unrelated array, with these indices
        if len(index) == 0: #There is nothing to predict
            continue
        level_preds.append(get_class_w_prob(cf, level, X[index][:, fi], X_scal[index][:, fi], index=index))
    if len(uniques)>0:
        for class_ in tqdm(uniques, desc='Unique classes atribution'):
            index = y_pred[y_pred[f'{parent_level} pred']==class_].index
            class_pred = y_test[y_test[:,i-1]==class_][0][i] #Look up for the only class (might be unspecified or not)
            level_preds.append(pd.DataFrame([[class_pred, 1]]*len(index), columns=[f'{level} pred', f'{level} prob'], index=index))
    new_pred = pd.concat(level_preds, axis=0) if level_preds else pd.DataFrame()
    y_pred = y_pred.join(new_pred)
    #probability of a level is multiplied by the probability of its parent level
    y_pred[f'{level} prob'] = y_pred[f'{parent_level} prob'].astype(float) * y_pred[f'{level} prob'].astype(float)

In [None]:
#Join pred to original DF
#(.values is assigned by position, so y_pred rows must be in the same order as df rows)
df.loc[:, y_pred.columns] = y_pred.values

In [None]:
# Choose, for every peak, which of its candidate database classifications is
# stored in the Kingdom/Superclass/Class/Subclass columns
true_cols = ['Kingdom', 'Superclass', 'Class', 'Subclass']
pred_cols = [f'{col} pred' for col in true_cols]
for _, row in df.iterrows():
    pred = row[pred_cols].values
    matches = np.array(row['Matches'])
    if np.shape(matches)[0] == 1:
        # a single candidate: nothing to choose
        df.loc[row.name, true_cols] = matches[0]
        continue
    exact = [match for match in matches if np.array_equal(match, pred)]
    if exact:
        # An exact match wins. Previously `continue` only advanced the inner loop,
        # so the fallback below still ran and could overwrite the exact match.
        df.loc[row.name, true_cols] = exact[0]
        continue
    # Otherwise take the first candidate agreeing with the prediction at the
    # deepest possible level: Class, then Superclass, then Kingdom
    for lvl in [2, 1, 0]:
        agrees = matches[:, lvl] == pred[lvl]
        if np.any(agrees):
            df.loc[row.name, true_cols] = matches[agrees][0]
            break
# info() prints its own summary and returns None, so it is not wrapped in print()
df[true_cols].info()
df = df[['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'Kingdom', 'Kingdom pred', 'Kingdom prob', 
                 'Superclass', 'Superclass pred', 'Superclass prob', 'Class', 'Class pred', 'Class prob', 
                 'Subclass', 'Subclass pred', 'Subclass prob']]
df = df.reset_index()
df.to_pickle('Results/bio_validation/fing_pred_df.pkl')

### Blocking

#### Yeast

In [None]:
# Blocking: for each threshold, blank every prediction whose probability is below
# the threshold, together with the predictions of all deeper levels
y_pred = pd.read_pickle('Results/bio_validation/yeast_pred_df.pkl') #Predictions for the yeast peaks
sub_levels = ['Superclass', 'Class', 'Subclass']
pred_cols = ['Kingdom pred', 'Superclass pred', 'Class pred', 'Subclass pred']
thresholded = []
for threshold in tqdm([0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]):
    res_thr = y_pred.copy()
    # a blocked Kingdom blanks the entire row
    res_thr.loc[res_thr['Kingdom prob']<threshold] = np.nan
    for depth, level in enumerate(sub_levels):
        # pred/prob columns of this level and of every level below it
        blocked_cols = [f'{lvl} {kind}' for lvl in sub_levels[depth:] for kind in ('pred', 'prob')]
        res_thr.loc[res_thr[f'{level} prob']<threshold, blocked_cols] = np.nan
    res_thr['Threshold'] = threshold
    # count the surviving predictions by column name instead of by column position
    res_thr['Number predicted levels'] = res_thr[pred_cols].count(axis=1)
    thresholded.append(res_thr.reset_index().rename(columns={'index':'sample index'}))
# a single concat replaces growing block_df inside the loop
block_df = pd.concat(thresholded, ignore_index=True)
block_df.to_pickle('Results/bio_validation/yeast_pred_w_blocking.pkl')

#### Fingerprint

In [None]:
# Blocking: for each threshold, blank every prediction whose probability is below
# the threshold, together with the predictions of all deeper levels
y_pred = pd.read_pickle('Results/bio_validation/fing_pred_df.pkl') #Predictions for the fingerprint peaks
sub_levels = ['Superclass', 'Class', 'Subclass']
pred_cols = ['Kingdom pred', 'Superclass pred', 'Class pred', 'Subclass pred']
thresholded = []
for threshold in tqdm([0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]):
    res_thr = y_pred.copy()
    # a blocked Kingdom blanks the entire row
    res_thr.loc[res_thr['Kingdom prob']<threshold] = np.nan
    for depth, level in enumerate(sub_levels):
        # pred/prob columns of this level and of every level below it
        blocked_cols = [f'{lvl} {kind}' for lvl in sub_levels[depth:] for kind in ('pred', 'prob')]
        res_thr.loc[res_thr[f'{level} prob']<threshold, blocked_cols] = np.nan
    res_thr['Threshold'] = threshold
    # count the surviving predictions by column name instead of by column position
    res_thr['Number predicted levels'] = res_thr[pred_cols].count(axis=1)
    thresholded.append(res_thr.reset_index().rename(columns={'index':'sample index'}))
# a single concat replaces growing block_df inside the loop
block_df = pd.concat(thresholded, ignore_index=True)
block_df.to_pickle('Results/bio_validation/fing_pred_w_blocking.pkl')

### Compute scores and classification reports

When blocking is applied, samples that have no prediction at a given level are excluded from that level's scores.

##### Yeast

In [None]:
# Score the yeast predictions for every hierarchy level and blocking threshold
block_df = pd.read_pickle('Results/bio_validation/yeast_pred_w_blocking.pkl')
thresholds = [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
scores = []
clf_reports = []
# coverage denominator: rows with all 4 levels predicted at threshold 0
total = len(block_df[(block_df['Threshold']==0) & (block_df['Number predicted levels']==4)])
for i, level in enumerate(['Kingdom', 'Superclass', 'Class', 'Subclass']):
    for thr in tqdm(thresholds, desc=f'Threshold lvl {i+1}'):
        results_thr = block_df[block_df['Threshold']==thr]
        # rows without a prediction at this level are left out of the report
        predicted = results_thr[results_thr[level + ' pred'].notna()]
        y_pred = predicted[level + ' pred']
        y_true = predicted[level]
        clf_report = classification_report(y_true, y_pred, labels=all_categories[level], output_dict=True)
        # the report carries either 'accuracy' or 'micro avg'; test for the key
        # explicitly instead of hiding the lookup behind a bare except
        if 'accuracy' in clf_report:
            micro = clf_report['accuracy']
        else:
            micro = clf_report['micro avg']['f1-score']
        n = len(block_df[(block_df['Threshold']==thr) & (block_df['Number predicted levels']>=i+1)])
        scores.append({'Level': level, 'Threshold': thr,
                        'F1 score (micro)': np.round(micro, 4),
                        'F1 score (macro)': np.round(clf_report['macro avg']['f1-score'], 4),
                      'Compound coverage': f'{n} ({np.round(n*100/total, 1)}%)'})
        # one long-format record per (category, metric); the averaged rows are skipped
        for key, value in clf_report.items():
            if key in ['accuracy', 'macro avg', 'weighted avg', 'micro avg']:
                continue
            for key2, value2 in value.items():
                clf_reports.append({'Level':level, 'Threshold': thr, 'Categorie':key, 'metric_name':key2, 'value':value2})
pd.DataFrame(scores).to_pickle('Results/bio_validation/yeast_scores.pkl')
pd.DataFrame(clf_reports).to_pickle('Results/bio_validation/yeast_classification_report.pkl')

In [None]:
#Flatten the scores: one row per threshold, one column per level/metric pair
test_scores = pd.read_pickle('Results/bio_validation/yeast_scores.pkl')
df = {thr: {} for thr in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]}
for _, score_row in test_scores.iterrows():
    lvl = score_row['Level']
    by_threshold = df[score_row['Threshold']]
    by_threshold[f'{lvl} - macro'] = score_row['F1 score (macro)']
    by_threshold[f'{lvl} - micro'] = score_row['F1 score (micro)']
    by_threshold[f'{lvl} - compound coverage'] = score_row['Compound coverage']
df = pd.DataFrame.from_dict(df, orient='index')
df.to_excel('Results/bio_validation/yeast_scores.xlsx')

In [None]:
# Reload the yeast predictions with blocking from the pickle written by the blocking cell
block_df = pd.read_pickle('Results/bio_validation/yeast_pred_w_blocking.pkl')

In [None]:
# Percentage of each Superclass among the threshold-0 rows. The denominator is
# the number of those rows rather than a hard-coded sample count.
unblocked = block_df[block_df['Threshold']==0]
unblocked['Superclass'].value_counts()*100/len(unblocked)

In [None]:
# Class labels and Class predictions of the threshold-0 rows
is_unblocked = block_df['Threshold']==0
true = block_df.loc[is_unblocked, 'Class']
pred = block_df.loc[is_unblocked, 'Class pred']

In [None]:
# Long format for the count plot: all true classes followed by all predicted ones
df = pd.DataFrame(pd.concat([true, pred]), columns=['Category'])
# Labels are sized from the data and assigned by position, so no hard-coded
# sample count or index alignment is needed
df['Set'] = ['True']*len(true) + ['Pred']*len(pred)

In [None]:
# Unique (Superclass, Class) pairs of the training labels, sorted by Superclass and then Class
h = pd.DataFrame(y_train, columns=['Kingdom', 'Superclass', 'Class', 'Subclass'])[['Superclass', 'Class']].sort_values(by=['Superclass', 'Class']).drop_duplicates()

In [None]:
# Keep only the classes that occur in df['Category']
h = h[h['Class'].isin(np.unique(df['Category']))]

In [None]:
# Split the hierarchy rows in two parts (first 34 rows / remainder), used by the two plots
h1, h2 = h.iloc[:34], h.iloc[34:]
class_order_1 = h1['Class'].unique()
class_order_2 = h2['Class'].unique()
# number of rows per Superclass, listed in order of first appearance
super_size_1 = h1['Superclass'].value_counts().loc[h1['Superclass'].unique()]
super_size_2 = h2['Superclass'].value_counts().loc[h2['Superclass'].unique()]

In [None]:
%matplotlib inline
rcParams['figure.figsize'] = 10, 20
rcParams['figure.dpi'] = 800
rcParams['axes.titlesize'] = 12
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 14
rcParams['font.size'] = 7.5

sns.set_theme(style='whitegrid')

ax = sns.countplot(data=df[df['Category'].isin(class_order_1)], y='Category', hue='Set', order=class_order_1, orient='v')
ax.set_yticklabels(name_2lines(class_order_1, 39), ha='right', va='center', rotation=0, fontsize=14)
ax.set_ylabel(None)
ax.set_xlabel(None)

before=0
for y, size in super_size_1.iteritems():
    plt.text(-40, before-0.5 + size/2, name_2lines(y), ha='right', va='center', fontsize=14, weight='bold')
    before += size
plt.xlim(0, 90)
plt.legend(fontsize='x-large', loc='lower right')
plt.savefig(f'Plots/yeast_validation_1.png', bbox_inches='tight', transparent=True, dpi=800)
plt.show()

In [None]:
rcParams['figure.figsize'] = 10, 20
rcParams['figure.dpi'] = 800
rcParams['axes.titlesize'] = 12
rcParams['xtick.labelsize'] = 10
rcParams['ytick.labelsize'] = 14
rcParams['font.size'] = 7.5

sns.set_theme(style='whitegrid')

# True vs predicted counts per Class, for the classes listed in class_order_2
ax = sns.countplot(data=df[df['Category'].isin(class_order_2)], y='Category', hue='Set', order=class_order_2, orient='v')
ax.set_yticklabels(name_2lines(class_order_2, 40), ha='right', va='center', rotation=0, fontsize=14)
ax.set_ylabel(None)
ax.set_xlabel(None)

# Write each Superclass name left of the axis, centred on its group of classes
before=0
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for superclass, size in super_size_2.items():
    plt.text(-40, before-0.5 + size/2, name_2lines(superclass), ha='right', va='center', fontsize=14, weight='bold')
    before += size
plt.xlim(0, 90)
plt.legend(fontsize='x-large', loc='lower right')
plt.savefig('Plots/yeast_validation_2.png', bbox_inches='tight', transparent=True, dpi=800)
plt.show()

##### Fingerprint

In [None]:
# Score the fingerprint predictions for every hierarchy level and blocking threshold
block_df = pd.read_pickle('Results/bio_validation/fing_pred_w_blocking.pkl')
thresholds = [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
scores = []
clf_reports = []
# coverage denominator: rows with all 4 levels predicted at threshold 0
total = len(block_df[(block_df['Threshold']==0) & (block_df['Number predicted levels']==4)])
for i, level in enumerate(['Kingdom', 'Superclass', 'Class', 'Subclass']):
    for thr in tqdm(thresholds, desc=f'Threshold lvl {i+1}'):
        results_thr = block_df[block_df['Threshold']==thr]
        # rows without a prediction at this level are left out of the report
        predicted = results_thr[results_thr[level + ' pred'].notna()]
        y_pred = predicted[level + ' pred']
        y_true = predicted[level]
        clf_report = classification_report(y_true, y_pred, labels=all_categories[level], output_dict=True)
        # the report carries either 'accuracy' or 'micro avg'; test for the key
        # explicitly instead of hiding the lookup behind a bare except
        if 'accuracy' in clf_report:
            micro = clf_report['accuracy']
        else:
            micro = clf_report['micro avg']['f1-score']
        n = len(block_df[(block_df['Threshold']==thr) & (block_df['Number predicted levels']>=i+1)])
        scores.append({'Level': level, 'Threshold': thr,
                        'F1 score (micro)': np.round(micro, 4),
                        'F1 score (macro)': np.round(clf_report['macro avg']['f1-score'], 4),
                      'Compound coverage': f'{n} ({np.round(n*100/total, 1)}%)'})
        # one long-format record per (category, metric); the averaged rows are skipped
        for key, value in clf_report.items():
            if key in ['accuracy', 'macro avg', 'weighted avg', 'micro avg']:
                continue
            for key2, value2 in value.items():
                clf_reports.append({'Level':level, 'Threshold': thr, 'Categorie':key, 'metric_name':key2, 'value':value2})
pd.DataFrame(scores).to_pickle('Results/bio_validation/fing_scores.pkl')
pd.DataFrame(clf_reports).to_pickle('Results/bio_validation/fing_classification_report.pkl')

In [None]:
#Flatten the scores: one row per threshold, one column per level/metric pair
test_scores = pd.read_pickle('Results/bio_validation/fing_scores.pkl')
df = {thr: {} for thr in [0, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]}
for _, score_row in test_scores.iterrows():
    lvl = score_row['Level']
    by_threshold = df[score_row['Threshold']]
    by_threshold[f'{lvl} - macro'] = score_row['F1 score (macro)']
    by_threshold[f'{lvl} - micro'] = score_row['F1 score (micro)']
    by_threshold[f'{lvl} - compound coverage'] = score_row['Compound coverage']
df = pd.DataFrame.from_dict(df, orient='index')
df.to_excel('Results/bio_validation/fing_scores.xlsx')

In [None]:
# Reload the fingerprint predictions with blocking from the pickle written by the blocking cell
block_df = pd.read_pickle('Results/bio_validation/fing_pred_w_blocking.pkl')

In [None]:
# Percentage of each Superclass among the threshold-0 rows. The denominator is
# the number of those rows rather than a hard-coded sample count.
unblocked = block_df[block_df['Threshold']==0]
unblocked['Superclass'].value_counts()*100/len(unblocked)

In [None]:
# Class labels and Class predictions of the threshold-0 rows
is_unblocked = block_df['Threshold']==0
true = block_df.loc[is_unblocked, 'Class']
pred = block_df.loc[is_unblocked, 'Class pred']

In [None]:
# Long format for the count plot: all true classes followed by all predicted ones
df = pd.DataFrame(pd.concat([true, pred]), columns=['Category'])
# Labels are sized from the data and assigned by position, so no hard-coded
# sample count or index alignment is needed
df['Set'] = ['True']*len(true) + ['Pred']*len(pred)

In [None]:
# Unique (Superclass, Class) pairs of the training labels, sorted by Superclass and then Class.
# NOTE(review): this cell used to read the pairs from a frame named `a`, which is not
# defined in this part of the notebook; it is rebuilt from y_train here, as in the
# yeast section. Confirm that `a` held the same labels.
h = pd.DataFrame(y_train, columns=['Kingdom', 'Superclass', 'Class', 'Subclass'])[['Superclass', 'Class']].sort_values(by=['Superclass', 'Class']).drop_duplicates()

In [None]:
# Keep only the classes that occur in df['Category']
h = h[h['Class'].isin(np.unique(df['Category']))]

In [None]:
# Split the hierarchy rows in two parts (first 16 rows / remainder), used by the two plots
h1, h2 = h.iloc[:16], h.iloc[16:]
class_order_1 = h1['Class'].unique()
class_order_2 = h2['Class'].unique()
# number of rows per Superclass, listed in order of first appearance
super_size_1 = h1['Superclass'].value_counts().loc[h1['Superclass'].unique()]
super_size_2 = h2['Superclass'].value_counts().loc[h2['Superclass'].unique()]

In [None]:
%matplotlib inline
rcParams['figure.figsize'] = 10, 20
rcParams['figure.dpi'] = 800
rcParams['axes.titlesize'] = 12
rcParams['xtick.labelsize'] = 12
rcParams['ytick.labelsize'] = 14
rcParams['font.size'] = 7.5

sns.set_theme(style='whitegrid')

ax = sns.countplot(data=df[df['Category'].isin(class_order_1)], y='Category', hue='Set', order=class_order_1, orient='v')
ax.set_yticklabels(name_2lines(class_order_1, 30), ha='right', va='center', rotation=0, fontsize=14)
ax.set_ylabel(None)
ax.set_xlabel(None)

before=0
for y, size in super_size_1.iteritems():
    plt.text(-25, before-0.5 + size/2, name_2lines(y), ha='right', va='center', fontsize=14, weight='bold')
    before += size
plt.xlim(0, 60)
plt.legend(fontsize='x-large', loc='lower right')
plt.savefig(f'Plots/fing_validation_1.png', bbox_inches='tight', transparent=True, dpi=800)
plt.show()

In [None]:
rcParams['figure.figsize'] = 10, 20
rcParams['figure.dpi'] = 800
rcParams['axes.titlesize'] = 12
rcParams['xtick.labelsize'] = 10
rcParams['ytick.labelsize'] = 14
rcParams['font.size'] = 7.5

sns.set_theme(style='whitegrid')

# True vs predicted counts per Class, for the classes listed in class_order_2
ax = sns.countplot(data=df[df['Category'].isin(class_order_2)], y='Category', hue='Set', order=class_order_2, orient='v')
ax.set_yticklabels(name_2lines(class_order_2, 30), ha='right', va='center', rotation=0, fontsize=14)
ax.set_ylabel(None)
ax.set_xlabel(None)

# Write each Superclass name left of the axis, centred on its group of classes
before=0
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0
for superclass, size in super_size_2.items():
    plt.text(-25, before-0.5 + size/2, name_2lines(superclass), ha='right', va='center', fontsize=14, weight='bold')
    before += size
plt.xlim(0, 60)
plt.legend(fontsize='x-large', loc='lower right')
plt.savefig('Plots/fing_validation_2.png', bbox_inches='tight', transparent=True, dpi=800)
plt.show()

In [None]:
# Load the pickled search object of the flat Subclass RF model; the context
# manager closes the file handle that a bare open() call would leak
with open('Models/Subclass_flat_RF.pkl', 'rb') as f:
    gs = pickle.load(f)

In [None]:
# Mean cross-validation F1 scores (macro, then micro; test and train) at the best parameter index
for metric in ['f1_macro', 'f1_micro']:
    for split in ['test', 'train']:
        print(f'{metric}_{split} ->', gs.cv_results_[f'mean_{split}_{metric}'][gs.best_index_])