In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
import pprint
from sklearn.metrics import accuracy_score, make_scorer, f1_score, recall_score,balanced_accuracy_score,precision_score
import seaborn as sns
import os
import time
from tqdm import tqdm
import matplotlib.pyplot as plt

In [None]:
# Load the three pre-computed result tables (amino-acid, k-mer and one-hot encodings).
# The first CSV column is the row index. keep_default_na=False stops pandas from
# turning strings such as 'N/A' into NaN -- presumably required because 'N/A' is one
# of the labels ranked in sort_by_greek_alphabet; confirm against the data.
df_result_aa, df_result_kmer, df_result_one_hot = (
    pd.read_csv(csv_path, index_col = 0, keep_default_na = False)
    for csv_path in ('df_result_aa.csv', 'df_result_kmer.csv', 'df_result_one_hot.csv')
    )

In [None]:
# Box plot after CV-fold cross-validation
# This function returns a dataframe with 'classifier', 'accuracy', 'precision', 'F1', 'recall'
# calculated within stratified k-fold cross-validation of a set of classifiers

# X = list of feature vectors
# y = list of labels
# CV = number of folds in validation
# list_of_chosen = names of the classifiers to skip (a classifier is evaluated only if its name is NOT in this list)
# plotting = if True, a box plot of the scores is saved to disk
# plot_name = suffix of the figure file name (str(CV) + plot_name + '.tif')
# number_of_processors = number of processors used for performing the cross-validation

# if number_of_processors = -1, all processors are used
# (DISCLAIMER: it may cause a warning ``timeout or by a memory leak'')

def Multi_class_Crossvalidating_(
    X,
    y,
    CV = 5,
    list_of_chosen = ['SVM'],
    plotting = True,
    plot_name = '-fold',
    number_of_processors = -1
    ):
    """Cross-validate the built-in classifiers and collect their per-fold scores.

    Logistic Regression, Decision Tree and Random Forest are each evaluated on
    the same shuffled, seeded StratifiedKFold split. Accuracy and macro-averaged
    precision, F1 and recall are recorded for every fold.

    Parameters
    ----------
    X : list of feature vectors
    y : list of labels (wrapped in a pandas Series before cross-validation)
    CV : int
        Number of folds. Also used as the prefix of the figure file name.
    list_of_chosen : list of str
        Names of the classifiers to SKIP: despite its name, a classifier is
        evaluated only when its name is NOT in this list. The default
        ``['SVM']`` matches none of the built-in classifiers, so all three are
        evaluated. The list is only read, never modified.
    plotting : bool
        If true, draw a box plot of the scores and save it to
        ``str(CV) + plot_name + '.tif'`` (300 dpi) in the working directory.
        If that file already exists, a Unix timestamp is added to the name so
        the earlier figure is not overwritten.
    plot_name : str
        Suffix of the figure file name.
    number_of_processors : int
        Forwarded to ``cross_validate`` as ``n_jobs`` (-1 = all processors).

    Returns
    -------
    pandas.DataFrame
        One row per (classifier, fold) with the columns
        'classifier', 'accuracy', 'precision', 'F1', 'recall'.
    """
    sns.set_context('paper')
    # NOTE(review): recent scikit-learn releases deprecate `multi_class` --
    # confirm against the scikit-learn version this notebook is pinned to.
    dictionary_of_classifiers = {
        'Logistic Regression': LogisticRegression(
            C = 30.0,
            class_weight = 'balanced',
            solver = 'newton-cg',
            multi_class = 'multinomial',
            n_jobs = -1,
            random_state = 42
            ),
        'Decision Tree': DecisionTreeClassifier(random_state = 42),
        'Random Forest': RandomForestClassifier(
            n_estimators = 100,
            max_leaf_nodes = 15,
            random_state = 42,
            class_weight='balanced'
            )
        }

    # The scorers are the same for every classifier, so they are built once.
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average = 'macro'),
        'recall': make_scorer(recall_score, average = 'macro'),
        'f1_macro': make_scorer(f1_score, average = 'macro')
        }

    y_series = pd.Series(y)
    # The number of splits follows CV (it used to be fixed at 5 regardless of CV).
    skf = StratifiedKFold(n_splits = CV, shuffle = True, random_state = 0)

    table_entries = []
    for name, classifier in dictionary_of_classifiers.items():
        if name in list_of_chosen:
            continue
        print(name)

        scores = cross_validate(
            classifier,
            X,
            y_series,
            cv = skf,
            n_jobs = number_of_processors,
            scoring = scoring,
            return_train_score = False
            )

        # One row per fold; zip covers however many folds were actually run.
        per_fold_scores = zip(
            scores['test_accuracy'],
            scores['test_precision'],
            scores['test_f1_macro'],
            scores['test_recall']
            )
        for fold_scores in per_fold_scores:
            entry = (name, *fold_scores)
            print(entry)
            table_entries.append(entry)

    table = pd.DataFrame(table_entries, columns = ['classifier', 'accuracy','precision', 'F1','recall'])
    if plotting:
        df_melted = pd.melt(
            table, id_vars = ['classifier'],
            value_vars = ['accuracy','precision', 'F1','recall'],
            var_name = 'scores'
            )
        sns.boxplot(
            x = 'scores',
            y = 'value',
            data = df_melted,
            hue = 'classifier',
            palette = 'Set3',
            showfliers = False
            )

        # Put the legend out of the figure
        plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad = 0.)

        output_file_name = str(CV) + plot_name + '.tif'
        if os.path.exists(output_file_name):
            # Keep the earlier figure: save this one under a timestamped name.
            output_file_name = f'{CV}{plot_name}_{int(time.time())}.tif'
        plt.savefig(output_file_name, bbox_inches = 'tight', dpi = 300)
        plt.clf()
    return table

In [None]:
def sort_by_greek_alphabet(x):
    """Return the labels of ``x`` as a list ordered by their position in ``greek_order``.

    ``x`` may be any iterable of labels. A label that is not listed in
    ``greek_order`` raises ValueError.
    """
    greek_order = ['Alpha', 'Beta', 'Gamma', 'Delta', 'Epsilon', 'Eta', 'Iota', 'Kappa', 'Lambda', 'N/A', 'Omicron', 'Zeta', 'Mu', 'other']
    # Pair every label with its rank, sort on the rank only, then drop the rank again.
    ranked_labels = [(greek_order.index(label), label) for label in x]
    ranked_labels.sort(key=lambda pair: pair[0])
    return [label for _, label in ranked_labels]

def create_species_list(df):
    """Return the distinct ``species_name`` values of ``df`` in display order, without 'VUM'.

    Parameters
    ----------
    df : DataFrame with a 'species_name' column

    Returns
    -------
    list
        The unique species names except 'VUM', ordered by sort_by_greek_alphabet.

    Raises
    ------
    ValueError
        If a remaining name is not known to sort_by_greek_alphabet.
    """
    species_names = set(df['species_name'])
    # 'VUM' has no rank in sort_by_greek_alphabet, so it must be dropped BEFORE
    # sorting. Removing it afterwards made this function fail on every input:
    # the sort raised when 'VUM' was present and .remove() raised when it was not.
    species_names.discard('VUM')
    return sort_by_greek_alphabet(species_names)

def assign_amino_acid_parameter(df):
    """Encode every sequence as its residue volumes followed by its residue hydrophilicities.

    Parameters
    ----------
    df : DataFrame with a 'pairwised_sequence' column holding one sequence
        string per row.

    Returns
    -------
    list of list
        One feature vector per row of ``df``. For a sequence of length n the
        vector has 2 * n entries: the n volumes first, then the n
        hydrophilicities, both in sequence order.

    Raises
    ------
    KeyError
        If a sequence contains a character that is not in the property table.
    """
    # Amino acid property [amino acid volume, amino acid hydrophilicity]. Both of these are relative values.
    # '-' maps to [0, 0] (presumably the alignment gap symbol -- confirm).
    amino_acid_properties = {
        'A':[-2.90, -1.03], 'R':[2.41, 1.31], 'N':[-0.68, 0.79],
        'D':[-0.92, 1.23], 'C':[-1.89, 0.15], 'Q':[0.36, 1.09],
        'E':[0.16, 1.28], 'G':[-4.04, 0.01], 'H':[0.83, 1.15],
        'I':[0.51, -1.32], 'L':[0.52, -1.40], 'K':[0.92, 1.23],
        'M':[0.92, -1.42], 'F':[2.22, -1.47], 'P':[-1.25, -0.64],
        'S':[-2.36, 0.38], 'T':[-1.19, 0.28], 'W':[4.28, -0.18],
        'Y':[2.75, -0.18], 'V':[-0.65, -1.27], '-':[0, 0]
    }

    # Single pass over the sequences (one progress bar with a known total)
    # instead of building four intermediate copies of the whole data set.
    amino_acid_volume_hydrophilicity = []
    for sequence in tqdm(df['pairwised_sequence'].tolist()):
        residue_properties = [amino_acid_properties[residue] for residue in sequence]
        volumes = [volume for volume, _ in residue_properties]
        hydrophilicities = [hydrophilicity for _, hydrophilicity in residue_properties]
        amino_acid_volume_hydrophilicity.append(volumes + hydrophilicities)
    return amino_acid_volume_hydrophilicity

In [None]:
# Features: one volume/hydrophilicity vector per sequence of the amino-acid table.
train_data_aa = assign_amino_acid_parameter(df_result_aa)
# Labels: the species name of each sequence, in the same row order as the features.
train_label_aa = df_result_aa['species_name'].tolist()

In [None]:
# Stand-alone random forest. It is seeded like every other estimator in this
# notebook so that results obtained with it are reproducible across runs.
# NOTE(review): not referenced by the cross-validation cells visible here, and
# max_leaf_nodes (16) differs from the forest inside Multi_class_Crossvalidating_
# (15) -- confirm which value is intended.
rnd_clf = RandomForestClassifier(n_estimators = 100, max_leaf_nodes = 16, random_state = 42)

In [None]:
# Per-fold scores of every classifier on the amino-acid features; no figure is saved.
cross_validation_result_aa = Multi_class_Crossvalidating_(
    X = train_data_aa,
    y = train_label_aa,
    plotting = False,
    plot_name = '-fold_aa'
    )

In [None]:
# RAW .csv data for Table 2
# Write the file INTO the directory created for it; previously the directory
# was created but the CSV was written to the current working directory.
table_2_dir = os.path.join('Results', 'Table_2')
os.makedirs(table_2_dir, exist_ok = True)
cross_validation_result_aa.to_csv(
    os.path.join(table_2_dir, 'Table_2_cross_validation_result_RAW_aa.csv'),
    index = False
    )

In [None]:
# Mean of every score over the folds: one row per classifier, one column per score.
result_mean_aa = cross_validation_result_aa.groupby(by = 'classifier').mean()
# Same numbers as a plain nested list (rows = classifiers), used to format Table 2.
result_mean_l_aa = [scores_row.tolist() for scores_row in result_mean_aa.values]

In [None]:
# Standard deviation of every score over the folds (pandas default: sample SD, ddof=1).
result_sd_aa = cross_validation_result_aa.groupby(by = 'classifier').std()
# Same numbers as a plain nested list (rows = classifiers), used to format Table 2.
result_sd_l_aa = [scores_row.tolist() for scores_row in result_sd_aa.values]

In [None]:
# Table 2
# One "mean ± sd" string per (score, classifier) pair: rows are scores, columns are classifiers.
result_l_aa = [
    [f'{result_mean_l_aa[i][j]: .4f} ± {result_sd_l_aa[i][j]: .4f}' for i in range(len(result_mean_aa))]
    for j in range(len(result_mean_aa.columns))
    ]
# The labels are taken from the grouped frame instead of being hard-coded, so they
# always match the data even if the evaluated classifiers or the scores change.
table_2_summary_dir = os.path.join('Results', 'Table_2')
os.makedirs(table_2_summary_dir, exist_ok = True)
pd.DataFrame(result_l_aa, index = list(result_mean_aa.columns), columns = list(result_mean_aa.index))\
    .to_csv(os.path.join(table_2_summary_dir, 'Table_2_cross_validation_result_summary_aa.csv'))