In [None]:
# Importing Libraries
from sklearnex import patch_sklearn #Improves sklearn algorithms performance
patch_sklearn()
import sklearn
print('scikit-learn version\n', sklearn.__version__)
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import OrderedDict
from aux_functions.name_2lines import name_2lines

In [None]:
# Random seed handed to every estimator that accepts `random_state`.
seed = 41

In [None]:
# Hierarchy level handled by this notebook: column `classif_level - 1` of
# y_train is used to select the samples of a parent class, and column
# `classif_level` holds the labels the classifiers are trained on.
classif_level = 3

# Load Data

In [None]:
def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # Only project-generated files are read here; pickle must not be used on untrusted data.
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Names of the feature columns; looked up by name when a feature subset is selected.
feature_names = _load_pickle('Dataset/feature_names.pkl')
# Unscaled training matrix (used by the Random Forest cell).
X_train = _load_pickle('Dataset/X_train.pkl')
# Scaled training matrix (used by the KNN, LR, SVM and NB cells).
X_train_scal = _load_pickle('Dataset/X_train_scal.pkl')
# Label matrix, indexed as y_train[:, level].
y_train = _load_pickle('Dataset/y_train.pkl')
# All classes
classes = _load_pickle('Dataset/all_categories.pkl')['Class']
# Classes with a unique child (skipped by every training cell)
c_u = _load_pickle('Dataset/C_wunique_child.pkl')

In [None]:
# List every class next to its positional index; the same enumerate() index is
# used as the numeric prefix of the model files written under Models/Subclass/.
for i, class_ in enumerate(classes):
    print(i, class_)

## Train and Tune classifiers

### Random Forests

In [None]:
# Tune one Random Forest per class (skipping classes listed in c_u), once on
# all features ('all') and once on the pre-selected subset ('sel'), and pickle
# each fitted GridSearchCV under Models/Subclass/.

# The selected-feature table does not change inside the loop: read it once.
with open('Results/selected_features.pkl', 'rb') as features_file:
    selected_features = pickle.load(features_file)['Subclass']

for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        print('Only one subchild')
        continue

    # Rows whose label at the previous level equals this class.
    class_mask = y_train[:, classif_level-1] == class_
    y_train_ = y_train[class_mask][:, classif_level]

    for f_sel in ['all', 'sel']:
        print(f_sel)
        if f_sel == 'sel':
            sel_f = selected_features[class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
            X_train_ = X_train[class_mask][:, f_index]
            # Try every possible value from 1 up to the number of selected features.
            max_features = range(1, len(sel_f)+1)
        else:
            X_train_ = X_train[class_mask]
            # NOTE(review): the last value (133) is presumably the total number
            # of features in X_train - confirm if the dataset changes.
            max_features = [1, 2, 3, 5, 7, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 133]

        grid = {'class_weight': [None, 'balanced', 'balanced_subsample'],
                'criterion': ['gini', 'entropy'],
                'max_features': max_features}

        estimator = RandomForestClassifier(random_state=seed, n_jobs=-1, n_estimators=200)
        gs = GridSearchCV(estimator, param_grid=grid, scoring=['f1_macro', 'f1_micro'], refit='f1_macro', cv=3, verbose=3,
                          error_score='raise', return_train_score=True, n_jobs=-1)
        gs.fit(X_train_, y_train_)

        # Close the file deterministically so the model is fully flushed to disk.
        with open(f'Models/Subclass/{i}_RF_{f_sel}_feat.pkl', 'wb') as model_file:
            pickle.dump(gs, model_file)

In [None]:
# Print, for every tuned Random Forest, the mean cross-validated F1 scores
# (validation and training folds) of the best parameter combination.
for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        continue
    for f_sel in ['all', 'sel']:
        print(f_sel)
        with open(f'Models/Subclass/{i}_RF_{f_sel}_feat.pkl', 'rb') as model_file:
            gs = pickle.load(model_file)
        cv_results = gs.cv_results_
        best = gs.best_index_  # row of cv_results_ chosen by refit='f1_macro'
        print('f1_macro_val ->', cv_results['mean_test_f1_macro'][best])
        print('f1_macro_train ->', cv_results['mean_train_f1_macro'][best])
        print('f1_micro_val ->', cv_results['mean_test_f1_micro'][best])
        print('f1_micro_train ->', cv_results['mean_train_f1_micro'][best])

### KNN

In [None]:
# Hyper-parameter space explored for KNN: neighbourhood size, vote weighting
# and distance metric.
grid = dict(
    n_neighbors=range(1, 31),
    weights=['uniform', 'distance'],
    metric=['manhattan', 'euclidean', 'cosine'],
)

In [None]:
# Tune one KNN classifier per class (skipping classes listed in c_u) on the
# scaled features, with and without feature selection, and pickle each fitted
# GridSearchCV under Models/Subclass/.

# The selected-feature table does not change inside the loop: read it once.
with open('Results/selected_features.pkl', 'rb') as features_file:
    selected_features = pickle.load(features_file)['Subclass']

for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        print('Only one subchild')
        continue

    # Rows whose label at the previous level equals this class.
    class_mask = y_train[:, classif_level-1] == class_
    y_train_ = y_train[class_mask][:, classif_level]

    # Two thirds of the samples, i.e. the size of a training fold under cv=3;
    # n_neighbors may not exceed it.
    n_samples = int(len(y_train_)*2/3)
    # Work on a per-class copy so the shared `grid` dict is left untouched.
    knn_grid = dict(grid, n_neighbors=range(1, min(30, n_samples) + 1))

    for f_sel in ['all', 'sel']:
        print(f_sel)
        if f_sel == 'sel':
            sel_f = selected_features[class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
            X_train_ = X_train_scal[class_mask][:, f_index]
        else:
            X_train_ = X_train_scal[class_mask]

        estimator = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)
        gs = GridSearchCV(estimator, param_grid=knn_grid, scoring=['f1_macro', 'f1_micro'], refit='f1_macro', cv=3, verbose=3,
                          error_score='raise', return_train_score=True, n_jobs=10)
        gs.fit(X_train_, y_train_)

        # Close the file deterministically so the model is fully flushed to disk.
        with open(f'Models/Subclass/{i}_KNN_{f_sel}_feat.pkl', 'wb') as model_file:
            pickle.dump(gs, model_file)

In [None]:
# Print, for every tuned KNN model, the mean cross-validated F1 scores
# (held-out and training folds) of the best parameter combination.
for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        continue
    for f_sel in ['all', 'sel']:
        print(f_sel)
        with open(f'Models/Subclass/{i}_KNN_{f_sel}_feat.pkl', 'rb') as model_file:
            gs = pickle.load(model_file)
        cv_results = gs.cv_results_
        best = gs.best_index_  # row of cv_results_ chosen by refit='f1_macro'
        print('f1_macro_test ->', cv_results['mean_test_f1_macro'][best])
        print('f1_macro_train ->', cv_results['mean_train_f1_macro'][best])
        print('f1_micro_test ->', cv_results['mean_test_f1_micro'][best])
        print('f1_micro_train ->', cv_results['mean_train_f1_micro'][best])

### Logistic Regression

In [None]:
def _lr_subgrid(penalty, solvers, regularised=True, **extra):
    """Build one Logistic Regression sub-grid for a single penalty type.

    `solvers` lists the solvers tried with that penalty, `regularised` controls
    whether the C values are searched, and `extra` adds penalty-specific
    parameters (e.g. l1_ratio).
    """
    subgrid = {'penalty': [penalty]}
    if regularised:
        subgrid['C'] = [0.01, 0.1, 1, 10, 100]
    subgrid['fit_intercept'] = [True, False]
    subgrid['class_weight'] = [None, 'balanced']
    subgrid['multi_class'] = ['ovr', 'multinomial']
    subgrid.update(extra)
    subgrid['solver'] = list(solvers)
    return subgrid


# One sub-grid per penalty so that only valid penalty/solver pairs are searched.
# NOTE(review): newer scikit-learn releases expect None instead of the string
# 'none' - confirm against the version printed by the import cell.
grid = [
    _lr_subgrid('l1', ['saga']),
    _lr_subgrid('elasticnet', ['saga'], l1_ratio=[0.25, 0.5, 0.75]),
    _lr_subgrid('l2', ['sag', 'saga']),
    _lr_subgrid('none', ['sag', 'saga'], regularised=False),
]

In [None]:
# Tune one Logistic Regression per class (skipping classes listed in c_u) on
# the scaled features, with and without feature selection, and pickle each
# fitted GridSearchCV under Models/Subclass/.

# Class indices trained with a much smaller iteration budget.
# NOTE(review): the reason is not recorded here - presumably runtime or
# convergence problems with 5000 iterations; confirm.
low_iter_class_indices = [42, 74, 103, 111, 112]

# The selected-feature table does not change inside the loop: read it once.
with open('Results/selected_features.pkl', 'rb') as features_file:
    selected_features = pickle.load(features_file)['Subclass']

for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        print('Only one subchild')
        continue
    max_iter = 100 if i in low_iter_class_indices else 5000

    # Rows whose label at the previous level equals this class.
    class_mask = y_train[:, classif_level-1] == class_
    y_train_ = y_train[class_mask][:, classif_level]

    for f_sel in ['all', 'sel']:
        print(f_sel)
        if f_sel == 'sel':
            sel_f = selected_features[class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
            X_train_ = X_train_scal[class_mask][:, f_index]
        else:
            X_train_ = X_train_scal[class_mask]

        estimator = LogisticRegression(max_iter=max_iter, tol=1e-6, random_state=seed, n_jobs=-1)
        gs = GridSearchCV(estimator, param_grid=grid, scoring=['f1_macro', 'f1_micro'], refit='f1_macro', cv=3, verbose=3,
                          error_score='raise', return_train_score=True, n_jobs=-1)
        gs.fit(X_train_, y_train_)

        # Close the file deterministically so the model is fully flushed to disk.
        with open(f'Models/Subclass/{i}_LR_{f_sel}_feat.pkl', 'wb') as model_file:
            pickle.dump(gs, model_file)

In [None]:
# Print, for every tuned Logistic Regression, the mean cross-validated F1
# scores (held-out and training folds) of the best parameter combination.
for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        continue
    for f_sel in ['all', 'sel']:
        print(f_sel)
        with open(f'Models/Subclass/{i}_LR_{f_sel}_feat.pkl', 'rb') as model_file:
            gs = pickle.load(model_file)
        cv_results = gs.cv_results_
        best = gs.best_index_  # row of cv_results_ chosen by refit='f1_macro'
        print('f1_macro_test ->', cv_results['mean_test_f1_macro'][best])
        print('f1_macro_train ->', cv_results['mean_train_f1_macro'][best])
        print('f1_micro_test ->', cv_results['mean_test_f1_micro'][best])
        print('f1_micro_train ->', cv_results['mean_train_f1_micro'][best])

### SVM

In [None]:
# Hyper-parameter space explored for the linear SVM: penalty norm,
# regularisation strength C and class weighting.
grid = [
    dict(
        penalty=['l1', 'l2'],
        C=[0.01, 0.1, 1, 10, 100],
        class_weight=[None, 'balanced'],
    )
]

In [None]:
# Tune one linear SVM per class (skipping classes listed in c_u) on the scaled
# features, with and without feature selection, and pickle each fitted
# GridSearchCV under Models/Subclass/.

# The selected-feature table does not change inside the loop: read it once.
with open('Results/selected_features.pkl', 'rb') as features_file:
    selected_features = pickle.load(features_file)['Subclass']

for i, class_ in enumerate(classes):
    # Announce the class like the other training cells do, so the log shows
    # which class each 'all'/'sel' line belongs to.
    print(class_)
    if class_ in c_u:
        print('Only one subchild')
        continue

    # Rows whose label at the previous level equals this class.
    class_mask = y_train[:, classif_level-1] == class_
    y_train_ = y_train[class_mask][:, classif_level]

    for f_sel in ['all', 'sel']:
        print(f_sel)
        if f_sel == 'sel':
            sel_f = selected_features[class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
            X_train_ = X_train_scal[class_mask][:, f_index]
        else:
            X_train_ = X_train_scal[class_mask]

        estimator = LinearSVC(dual=False, loss='squared_hinge', multi_class='ovr', tol=1e-6, max_iter=5000,
                              random_state=seed, verbose=3)
        gs = GridSearchCV(estimator, param_grid=grid, scoring=['f1_macro', 'f1_micro'], refit='f1_macro', cv=3, verbose=3,
                          error_score='raise', return_train_score=True, n_jobs=-1)
        gs.fit(X_train_, y_train_)

        # Close the file deterministically so the model is fully flushed to disk.
        with open(f'Models/Subclass/{i}_SVM_{f_sel}_feat.pkl', 'wb') as model_file:
            pickle.dump(gs, model_file)

In [None]:
# Print, for every tuned linear SVM, the mean cross-validated F1 scores
# (held-out and training folds) of the best parameter combination.
for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        continue
    for f_sel in ['all', 'sel']:
        print(f_sel)
        with open(f'Models/Subclass/{i}_SVM_{f_sel}_feat.pkl', 'rb') as model_file:
            gs = pickle.load(model_file)
        cv_results = gs.cv_results_
        best = gs.best_index_  # row of cv_results_ chosen by refit='f1_macro'
        print('f1_macro_test ->', cv_results['mean_test_f1_macro'][best])
        print('f1_macro_train ->', cv_results['mean_train_f1_macro'][best])
        print('f1_micro_test ->', cv_results['mean_test_f1_micro'][best])
        print('f1_micro_train ->', cv_results['mean_train_f1_micro'][best])

### ComplementNB

In [None]:
# Hyper-parameter space explored for ComplementNB: smoothing strength and
# whether the weights are normalised.
grid = dict(
    alpha=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    norm=[True, False],
)

In [None]:
# Tune one ComplementNB per class (skipping classes listed in c_u) on the
# scaled features, with and without feature selection, and pickle each fitted
# GridSearchCV under Models/Subclass/.
# NOTE(review): ComplementNB needs non-negative inputs, so X_train_scal is
# presumably scaled to a non-negative range - confirm.

# The selected-feature table does not change inside the loop: read it once.
with open('Results/selected_features.pkl', 'rb') as features_file:
    selected_features = pickle.load(features_file)['Subclass']

for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        print('Only one subchild')
        continue

    # Rows whose label at the previous level equals this class.
    class_mask = y_train[:, classif_level-1] == class_
    y_train_ = y_train[class_mask][:, classif_level]

    for f_sel in ['all', 'sel']:
        print(f_sel)
        if f_sel == 'sel':
            sel_f = selected_features[class_]['selected features']
            f_index = [feature_names.index(feature) for feature in sel_f]
            X_train_ = X_train_scal[class_mask][:, f_index]
        else:
            X_train_ = X_train_scal[class_mask]

        estimator = ComplementNB()
        gs = GridSearchCV(estimator, param_grid=grid, scoring=['f1_macro', 'f1_micro'], refit='f1_macro', cv=3, verbose=3,
                          error_score='raise', return_train_score=True, n_jobs=-1)
        gs.fit(X_train_, y_train_)

        # Close the file deterministically so the model is fully flushed to disk.
        with open(f'Models/Subclass/{i}_NB_{f_sel}_feat.pkl', 'wb') as model_file:
            pickle.dump(gs, model_file)

In [None]:
# Print, for every tuned ComplementNB, the mean cross-validated F1 scores
# (held-out and training folds) of the best parameter combination.
for i, class_ in enumerate(classes):
    print(class_)
    if class_ in c_u:
        continue
    for f_sel in ['all', 'sel']:
        print(f_sel)
        with open(f'Models/Subclass/{i}_NB_{f_sel}_feat.pkl', 'rb') as model_file:
            gs = pickle.load(model_file)
        cv_results = gs.cv_results_
        best = gs.best_index_  # row of cv_results_ chosen by refit='f1_macro'
        print('f1_macro_test ->', cv_results['mean_test_f1_macro'][best])
        print('f1_macro_train ->', cv_results['mean_train_f1_macro'][best])
        print('f1_micro_test ->', cv_results['mean_test_f1_micro'][best])
        print('f1_micro_train ->', cv_results['mean_train_f1_micro'][best])

### Load GS scores

In [None]:
# Collect, for every class and algorithm, the scores and parameters of the
# better of the two searches (all features vs. selected features).
# Produces:
#   scores_df - indexed by (Classifier, F1-score average), one
#               '<alg> (val)' / '<alg> (train)' column pair per algorithm.
#   params_df - one row per class with the winning parameters of each algorithm.
score_records = []
param_records = []
for i, class_ in enumerate(classes):
    if class_ in c_u:
        continue
    macro_row = {'Classifier': class_, 'F1-score average': 'macro'}
    micro_row = {'Classifier': class_, 'F1-score average': 'micro'}
    params_row = {'Classifier': class_}
    for alg in ['RF', 'KNN', 'LR', 'SVM', 'NB']:
        f_sel_gs = {}
        for f_sel in ['all', 'sel']:
            with open(f'Models/Subclass/{i}_{alg}_{f_sel}_feat.pkl', 'rb') as model_file:
                f_sel_gs[f_sel] = pickle.load(model_file)

        # Keep whichever variant has the higher validation macro-F1;
        # ties go to the model without feature selection.
        score_all = f_sel_gs['all'].cv_results_['mean_test_f1_macro'][f_sel_gs['all'].best_index_]
        score_sel = f_sel_gs['sel'].cv_results_['mean_test_f1_macro'][f_sel_gs['sel'].best_index_]
        if score_all >= score_sel:
            f_sel = 'all'
            f = 'No'
        else:
            f_sel = 'sel'
            f = 'Yes'
        gs = f_sel_gs[f_sel]
        cv_results = gs.cv_results_
        best = gs.best_index_

        # Save scores
        macro_row[f'{alg} (val)'] = cv_results['mean_test_f1_macro'][best]
        macro_row[f'{alg} (train)'] = cv_results['mean_train_f1_macro'][best]
        micro_row[f'{alg} (val)'] = cv_results['mean_test_f1_micro'][best]
        micro_row[f'{alg} (train)'] = cv_results['mean_train_f1_micro'][best]

        # Save parameters
        params_row[f'{alg} (Feature Selection)'] = f
        for param_name, param in cv_results['params'][best].items():
            params_row[f'{alg} ({param_name})'] = param
    score_records.extend([macro_row, micro_row])
    param_records.append(params_row)

# Build each frame once from the accumulated records.
scores_df = pd.DataFrame(score_records).set_index(['Classifier', 'F1-score average'])
params_df = pd.DataFrame(param_records)

In [None]:
# Add the 'Subclass' sheet to the existing scores workbook (append mode needs
# the file to exist already). if_sheet_exists='replace' makes the cell safe to
# re-run: a 'Subclass' sheet from a previous run is overwritten instead of
# making the write fail.
with pd.ExcelWriter('Results/GS_scores.xlsx', mode='a', if_sheet_exists='replace') as writer:
    scores_df.to_excel(writer, sheet_name='Subclass')
scores_df

In [None]:
# Tag this notebook's rows and merge them into the shared best-parameter table.
params_df['Level'] = 'Subclass'
stored_params = pd.read_pickle('Results/GS_best_comb_params.pkl')
# Drop 'Subclass' rows left by an earlier run of this cell, so re-running
# replaces them instead of appending duplicates.
if 'Level' in stored_params.columns:
    stored_params = stored_params[stored_params['Level'] != 'Subclass']
pd.concat([stored_params, params_df]).to_pickle('Results/GS_best_comb_params.pkl')

In [None]:
# Tuned parameters reported for each algorithm, in display order. Every
# algorithm's group starts with its 'Feature Selection' flag.
_reported_params = {
    'RF': ['class_weight', 'criterion', 'max_features'],
    'KNN': ['n_neighbors', 'weights', 'metric'],
    'LR': ['penalty', 'C', 'l1_ratio', 'fit_intercept', 'class_weight', 'solver', 'multi_class'],
    'SVM': ['penalty', 'C', 'class_weight'],
    'NB': ['alpha', 'norm'],
}

# Flat list of '<alg> (<parameter>)' column labels used to order the final table.
column_order = [
    f'{alg_name} ({field})'
    for alg_name, fields in _reported_params.items()
    for field in ['Feature Selection', *fields]
]

In [None]:
# Reload the merged best-parameter table, index it by (Level, Classifier) and
# keep only the reporting columns, in the order given by column_order.
indexed_params = pd.read_pickle('Results/GS_best_comb_params.pkl').set_index(['Level', 'Classifier'])
df = indexed_params[column_order]

In [None]:
# In the class_weight columns, swap the Python value None for the text 'None'
# (presumably so the setting stays visible once the table is exported).
class_weight_cols = ['RF (class_weight)', 'LR (class_weight)', 'SVM (class_weight)']
df.loc[:, class_weight_cols] = df.loc[:, class_weight_cols].replace({None: 'None'})

In [None]:
# Write df (the best-parameter table indexed by Level and Classifier) to CSV.
df.to_csv('Results/GS_best_comb_params.csv')