In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
from sklearnex import patch_sklearn  # Improves the performance of scikit-learn algorithms
# The patch is applied here, ahead of the sklearn imports that follow.
patch_sklearn()
import sklearn
print('scikit-learn version\n', sklearn.__version__)
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.model_selection import GridSearchCV
from aux_functions.binary_classifiers import binary_data_sampling
import pickle
from tqdm.notebook import tqdm
import os

In [None]:
# Random seed; passed as `seed` to binary_data_sampling in the grid-search cell below.
seed=41

In [None]:
# Classification level: 1 -> Superclass.
# Used below as a column index into y_train / y_test: column `classif_level`
# holds the target label and column `classif_level-1` the parent-level label.
classif_level = 1

# Load Data

In [None]:
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file afterwards.

    NOTE: only use on trusted project artefacts; unpickling untrusted data can
    execute arbitrary code.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Feature names plus the train/test feature matrices and label arrays.
feature_names = _load_pickle('Dataset/feature_names.pkl')
X_train = _load_pickle('Dataset/X_train.pkl')
X_test = _load_pickle('Dataset/X_test.pkl')
y_train = _load_pickle('Dataset/y_train.pkl')
y_test = _load_pickle('Dataset/y_test.pkl')

In [None]:
# Number of feature names loaded from Dataset/feature_names.pkl.
len(feature_names)

In [None]:
# Keep only the samples whose parent-level label (column `classif_level-1`)
# is 'Organic compounds'; the targets are the labels in column `classif_level`.
parent_col = classif_level - 1
train_is_org = y_train[:, parent_col] == 'Organic compounds'
test_is_org = y_test[:, parent_col] == 'Organic compounds'

X_train_org = X_train[train_is_org, :]
X_test_org = X_test[test_is_org, :]
y_train_org = y_train[train_is_org][:, classif_level]
y_test_org = y_test[test_is_org][:, classif_level]

In [None]:
# All classes: sorted unique labels found in the organic training targets.
# The position of a class in this array is the index `i` used in the saved model file names.
org_sclasses = np.unique(y_train_org)

In [None]:
# Print every class next to its position in `org_sclasses`.
for position, class_name in enumerate(org_sclasses):
    print(position, class_name)

In [None]:
# Grid-search summary for the binary random forests: one row per combination of
# feature selection, sampling strategy, multiplying factor and superclass.
#
# Fitted GridSearchCV objects are cached on disk: an existing model file is
# loaded, a missing one is trained and saved. Delete a file to force retraining.

# Per-superclass selected features, loaded once instead of once per iteration.
# NOTE: pickles here are trusted project artefacts; never unpickle untrusted data.
with open('Results/selected_features.pkl', 'rb') as fh:
    selected_features = pickle.load(fh)['Superclass_binary']

records = []
for f_sel in [False, True]:
    for samp_strat in [None, 'stratf', 'ObyO']:
        # With no sampling strategy the multiplying factor is set to None and
        # the configuration is evaluated exactly once.
        mult_factors = [None] if samp_strat is None else [1.3, 1.5, 2, 3, 4, 5, 6]
        for mult_factor in mult_factors:
            for i, sclass_ in enumerate(org_sclasses):
                print(f_sel)
                print(samp_strat)
                print(mult_factor)
                print(sclass_)

                if f_sel is True:
                    sel_f = selected_features[sclass_]['selected features']
                    f_index = [feature_names.index(feature) for feature in sel_f]
                else:
                    sel_f = None
                    f_index = None

                # Dedicated names so the loaded X_train / y_train / X_test / y_test
                # are not overwritten (earlier cells stay re-runnable).
                (X_train_bin, y_train_bin, X_test_bin, y_test_bin,
                 train_pos_size, train_neg_size) = binary_data_sampling(
                    X_train_org, y_train_org,
                    X_test_org, y_test_org,
                    sclass_, samp_strat,
                    mult_factor, seed=seed,
                    feature_selection=f_sel,
                    features_index=f_index)

                model_path = f'Models/Superclass/Org_RF_Binary/Feature_selection={f_sel}/Sampl_strategy={samp_strat}/{i}_multfactor={mult_factor}.pkl'
                if os.path.exists(model_path):
                    with open(model_path, 'rb') as fh:
                        gs = pickle.load(fh)
                else:
                    if f_sel is True:
                        # Try every possible number of (selected) features.
                        max_features = range(1, np.shape(X_train_bin)[1]+1)
                    else:
                        max_features = [1, 2, 3, 5, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 133]
                    grid = {'class_weight': [None, 'balanced', 'balanced_subsample'],
                            'criterion' : ['gini', 'entropy'],
                            'max_features' : max_features}
                    estimator = RandomForestClassifier(random_state=seed, n_jobs=-1)
                    # return_train_score=True is required for 'mean_train_score' below.
                    gs = GridSearchCV(estimator, param_grid=grid, scoring='balanced_accuracy', cv=3, verbose=100,
                                      pre_dispatch=True, error_score='raise', return_train_score=True, n_jobs=-1)
                    gs.fit(X_train_bin, y_train_bin)
                    os.makedirs(os.path.dirname(model_path), exist_ok=True)
                    with open(model_path, 'wb') as fh:
                        pickle.dump(gs, fh)

                print(gs.best_score_)
                estimator = gs.best_estimator_
                cval_results = gs.cv_results_
                print('Pos size=',train_pos_size)
                print('Neg size=',train_neg_size)
                score_train = cval_results['mean_train_score'][gs.best_index_]
                score_val = cval_results['mean_test_score'][gs.best_index_]
                print('Balanced accuracy (cv-train) ->', score_train)
                print('Balanced accuracy (cv-val) ->', score_val)
                # Score the refit best estimator on the binary test split.
                y_pred = estimator.predict(X_test_bin)
                score_test = balanced_accuracy_score(y_test_bin, y_pred)
                print('Balanced accuracy (test) ->', score_test)
                # The sampling strategy is stored as a string, so "no sampling"
                # appears as the literal 'None' in the results table.
                records.append({'Superclass': sclass_, 'Feature selection':f_sel, 'Sampling strategy': str(samp_strat),
                                'Multiplying factor': mult_factor,
                                'Train positive size': train_pos_size, 'Train negative size':train_neg_size,
                                'Balanced accuracy (cv-train)':score_train,
                                'Balanced accuracy (cv-val)':score_val,
                                'Balanced accuracy (test)':score_test})

results = pd.DataFrame(records)
results.replace({True:'Yes', False:'No'}).to_csv('Results/GS_Binary_RF_(Superclass-Org compounds).csv')
results.to_pickle('Results/GS_Binary_RF_(Superclass-Org compounds).pkl')

# Prediction

In [None]:
# Reload the grid-search summary table that the grid-search cell wrote to this path.
df = pd.read_pickle('Results/GS_Binary_RF_(Superclass-Org compounds).pkl')

### Select classifiers from all sampling strategies

In [None]:
# For each superclass, take the configuration with the highest cross-validated
# balanced accuracy over all rows of `df` and load its fitted estimator.
# cfs maps superclass -> (best estimator, whether feature selection was used).
cfs = {}
for i, sclass_ in enumerate(org_sclasses):
    c_results = df[df['Superclass']==sclass_]
    row = c_results.loc[c_results['Balanced accuracy (cv-val)'].idxmax()]
    samp_strat = row['Sampling strategy']
    f_sel = row['Feature selection']
    if samp_strat == 'None':
        # "No sampling" is stored as the string 'None'; its model files use multfactor=None.
        mult_factor = None
    else:
        mult_factor = row['Multiplying factor']
        # Whole factors were saved as ints in the file names (2, not 2.0).
        if mult_factor in [1, 2, 3, 4, 5, 6]:
            mult_factor = int(mult_factor)
    model_path = f'Models/Superclass/Org_RF_Binary/Feature_selection={f_sel}/Sampl_strategy={samp_strat}/{i}_multfactor={mult_factor}.pkl'
    # Context manager so the model file is closed after loading.
    with open(model_path, 'rb') as fh:
        cfs[sclass_] = (pickle.load(fh).best_estimator_, f_sel)

In [None]:
# Score every retained test sample with each selected binary classifier.
# Selected features are loaded once (and the file closed) instead of per classifier.
with open('Results/selected_features.pkl', 'rb') as fh:
    selected_features = pickle.load(fh)['Superclass_binary']

new_pred = {}
# Keep only test samples whose label is one of the classes seen in training.
index = np.where(np.isin(y_test_org, org_sclasses))[0]
y_test = y_test_org[index]
for sclass_ in tqdm(cfs):
    cf, f_sel = cfs[sclass_]
    X_test_org_ = X_test_org
    if f_sel:
        # This classifier was trained on its own feature subset.
        sel_f = selected_features[sclass_]['selected features']
        f_index = [feature_names.index(feature) for feature in sel_f]
        X_test_org_ = X_test_org_[:, f_index]
    # Column 1 of predict_proba is the second entry of cf.classes_
    # (presumably the positive label; verify against binary_data_sampling).
    new_pred[sclass_] = cf.predict_proba(X_test_org_[index])[:, 1]

In [None]:
# One column per superclass (binary classifier), one row per retained test sample.
new_pred = pd.DataFrame(new_pred)

In [None]:
# Predicted superclass = the column whose classifier gave the highest score in each row.
y_pred = new_pred.idxmax(axis=1).values

In [None]:
# Macro-averaged F1 of the combined binary classifiers on the retained test samples.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Micro-averaged F1 of the combined binary classifiers on the retained test samples.
f1_score(y_test, y_pred, average='micro')

In [None]:
# Per-class metrics, returned as a dict (output_dict=True) rather than formatted text.
classification_report(y_test, y_pred, output_dict=True)

### Select classifiers trained without sampling

In [None]:
# For each superclass, pick the best configuration (by cross-validated balanced
# accuracy) among those trained WITHOUT sampling and load its fitted estimator.
# cfs maps superclass -> (best estimator, whether feature selection was used).
cfs = {}
# The results table stores the sampling strategy as str(samp_strat), so "no
# sampling" is the string 'None', which .isnull() never matches. Matching only
# on .isnull() selects no rows and makes idxmax() fail on an empty Series.
# Both encodings are accepted here.
no_sampling = df['Sampling strategy'].isnull() | (df['Sampling strategy'] == 'None')
for i, sclass_ in enumerate(org_sclasses):
    c_results = df[(df['Superclass']==sclass_) & no_sampling]
    row = c_results.loc[c_results['Balanced accuracy (cv-val)'].idxmax()]
    # Model files for this setting were saved with Sampl_strategy=None and multfactor=None.
    samp_strat = None
    mult_factor = None
    f_sel = row['Feature selection']
    model_path = f'Models/Superclass/Org_RF_Binary/Feature_selection={f_sel}/Sampl_strategy={samp_strat}/{i}_multfactor={mult_factor}.pkl'
    # Context manager so the model file is closed after loading.
    with open(model_path, 'rb') as fh:
        cfs[sclass_] = (pickle.load(fh).best_estimator_, f_sel)

In [None]:
# Score every retained test sample with each selected binary classifier.
# Selected features are loaded once (and the file closed) instead of per classifier.
with open('Results/selected_features.pkl', 'rb') as fh:
    selected_features = pickle.load(fh)['Superclass_binary']

new_pred = {}
# Keep only test samples whose label is one of the classes seen in training.
index = np.where(np.isin(y_test_org, org_sclasses))[0]
y_test = y_test_org[index]
for sclass_ in tqdm(cfs):
    cf, f_sel = cfs[sclass_]
    X_test_org_ = X_test_org
    if f_sel:
        # This classifier was trained on its own feature subset.
        sel_f = selected_features[sclass_]['selected features']
        f_index = [feature_names.index(feature) for feature in sel_f]
        X_test_org_ = X_test_org_[:, f_index]
    # Column 1 of predict_proba is the second entry of cf.classes_
    # (presumably the positive label; verify against binary_data_sampling).
    new_pred[sclass_] = cf.predict_proba(X_test_org_[index])[:, 1]

In [None]:
# One column per superclass (binary classifier), one row per retained test sample.
new_pred = pd.DataFrame(new_pred)

In [None]:
# Predicted superclass = the column whose classifier gave the highest score in each row.
y_pred = new_pred.idxmax(axis=1).values

In [None]:
# Macro-averaged F1 of the no-sampling classifiers on the retained test samples.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Micro-averaged F1 of the no-sampling classifiers on the retained test samples.
f1_score(y_test, y_pred, average='micro')

### Select classifiers trained with 'stratf' sampling

In [None]:
# For each superclass, pick the best configuration (by cross-validated balanced
# accuracy) among those trained with 'stratf' sampling and load its estimator.
# cfs maps superclass -> (best estimator, whether feature selection was used).
cfs = {}
samp_strat = 'stratf'
for i, sclass_ in enumerate(org_sclasses):
    c_results = df[(df['Superclass']==sclass_) & (df['Sampling strategy']==samp_strat)]
    row = c_results.loc[c_results['Balanced accuracy (cv-val)'].idxmax()]
    mult_factor = row['Multiplying factor']
    f_sel = row['Feature selection']
    # Whole factors were saved as ints in the file names (2, not 2.0).
    if mult_factor in [1, 2, 3, 4, 5, 6]:
        mult_factor = int(mult_factor)
    model_path = f'Models/Superclass/Org_RF_Binary/Feature_selection={f_sel}/Sampl_strategy={samp_strat}/{i}_multfactor={mult_factor}.pkl'
    # Context manager so the model file is closed after loading.
    with open(model_path, 'rb') as fh:
        cfs[sclass_] = (pickle.load(fh).best_estimator_, f_sel)

In [None]:
# Score every retained test sample with each selected binary classifier.
# Selected features are loaded once (and the file closed) instead of per classifier.
with open('Results/selected_features.pkl', 'rb') as fh:
    selected_features = pickle.load(fh)['Superclass_binary']

new_pred = {}
# Keep only test samples whose label is one of the classes seen in training.
index = np.where(np.isin(y_test_org, org_sclasses))[0]
y_test = y_test_org[index]
for sclass_ in tqdm(cfs):
    cf, f_sel = cfs[sclass_]
    X_test_org_ = X_test_org
    if f_sel:
        # This classifier was trained on its own feature subset.
        sel_f = selected_features[sclass_]['selected features']
        f_index = [feature_names.index(feature) for feature in sel_f]
        X_test_org_ = X_test_org_[:, f_index]
    # Column 1 of predict_proba is the second entry of cf.classes_
    # (presumably the positive label; verify against binary_data_sampling).
    new_pred[sclass_] = cf.predict_proba(X_test_org_[index])[:, 1]

In [None]:
# One column per superclass (binary classifier), one row per retained test sample.
new_pred = pd.DataFrame(new_pred)

In [None]:
# Predicted superclass = the column whose classifier gave the highest score in each row.
y_pred = new_pred.idxmax(axis=1).values

In [None]:
# Macro-averaged F1 of the 'stratf' classifiers on the retained test samples.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Micro-averaged F1 of the 'stratf' classifiers on the retained test samples.
f1_score(y_test, y_pred, average='micro')

### Select classifiers trained with 'ObyO' sampling

In [None]:
# For each superclass, pick the best configuration (by cross-validated balanced
# accuracy) among those trained with 'ObyO' sampling and load its estimator.
# cfs maps superclass -> (best estimator, whether feature selection was used).
cfs = {}
samp_strat = 'ObyO'
for i, sclass_ in enumerate(org_sclasses):
    c_results = df[(df['Superclass']==sclass_) & (df['Sampling strategy']==samp_strat)]
    row = c_results.loc[c_results['Balanced accuracy (cv-val)'].idxmax()]
    mult_factor = row['Multiplying factor']
    f_sel = row['Feature selection']
    # Whole factors were saved as ints in the file names (2, not 2.0).
    if mult_factor in [1, 2, 3, 4, 5, 6]:
        mult_factor = int(mult_factor)
    model_path = f'Models/Superclass/Org_RF_Binary/Feature_selection={f_sel}/Sampl_strategy={samp_strat}/{i}_multfactor={mult_factor}.pkl'
    # Context manager so the model file is closed after loading.
    with open(model_path, 'rb') as fh:
        cfs[sclass_] = (pickle.load(fh).best_estimator_, f_sel)

In [None]:
# Score every retained test sample with each selected binary classifier.
# Selected features are loaded once (and the file closed) instead of per classifier.
with open('Results/selected_features.pkl', 'rb') as fh:
    selected_features = pickle.load(fh)['Superclass_binary']

new_pred = {}
# Keep only test samples whose label is one of the classes seen in training.
index = np.where(np.isin(y_test_org, org_sclasses))[0]
y_test = y_test_org[index]
for sclass_ in tqdm(cfs):
    cf, f_sel = cfs[sclass_]
    X_test_org_ = X_test_org
    if f_sel:
        # This classifier was trained on its own feature subset.
        sel_f = selected_features[sclass_]['selected features']
        f_index = [feature_names.index(feature) for feature in sel_f]
        X_test_org_ = X_test_org_[:, f_index]
    # Column 1 of predict_proba is the second entry of cf.classes_
    # (presumably the positive label; verify against binary_data_sampling).
    new_pred[sclass_] = cf.predict_proba(X_test_org_[index])[:, 1]

In [None]:
# One column per superclass (binary classifier), one row per retained test sample.
new_pred = pd.DataFrame(new_pred)

In [None]:
# Predicted superclass = the column whose classifier gave the highest score in each row.
y_pred = new_pred.idxmax(axis=1).values

In [None]:
# Macro-averaged F1 of the 'ObyO' classifiers on the retained test samples.
f1_score(y_test, y_pred, average='macro')

In [None]:
# Micro-averaged F1 of the 'ObyO' classifiers on the retained test samples.
f1_score(y_test, y_pred, average='micro')

### Prediction on test using the multiclass RF

In [None]:
# Cross-validation F1 of the two saved multiclass grid searches
# ('all' and 'sel' feature variants, per the model file names).
# The mean_test_* entries of cv_results_ are held-out-fold scores from cross-validation,
# not scores on X_test.
for f_sel in ['all', 'sel']:
    print(f_sel)
    # Context manager so the model file is closed after loading.
    with open(f'Models/Superclass/1_RF_{f_sel}_feat.pkl', 'rb') as fh:
        gs = pickle.load(fh)
    cv = gs.cv_results_
    best = gs.best_index_
    print('f1_macro_test ->', cv['mean_test_f1_macro'][best])
    print('f1_macro_train ->', cv['mean_train_f1_macro'][best])
    print('f1_micro_test ->', cv['mean_test_f1_micro'][best])
    print('f1_micro_train ->', cv['mean_train_f1_micro'][best])

In [None]:
# Best one is with feature selection (f1 macro test).
# Load the multiclass model and its selected features, closing both files.
with open('Models/Superclass/1_RF_sel_feat.pkl', 'rb') as fh:
    gs = pickle.load(fh)
with open('Results/selected_features.pkl', 'rb') as fh:
    sel_f = pickle.load(fh)['Superclass']['Organic compounds']['selected features']
# Map the selected feature names to column positions in the feature matrix.
f_index = [feature_names.index(feature) for feature in sel_f]
y_pred = gs.best_estimator_.predict(X_test_org[:, f_index])

In [None]:
# Macro-averaged F1 of the multiclass random forest on all organic test samples.
f1_score(y_test_org, y_pred, average='macro')

In [None]:
# Micro-averaged F1 of the multiclass random forest on all organic test samples.
f1_score(y_test_org, y_pred, average='micro')