This notebook documents the development of classifiers.py, a script that allows a user to train or test our pipeline with 9 different classifier options. This script does not yet contain neural networks, though we are interesting in adding this feature.

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [3]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [4]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k_exploration.csv')

In [5]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [6]:
hmmer = pd.read_csv('protein_match_50k')
hmmer['protein_match'] = hmmer['protein_match'].map({'Yes': 1, 'No': 0})
hmmer

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
0,0,48641291,1,1.00
1,1,92992745,1,1.00
2,2,157628663,1,1.00
3,3,136708305,1,1.00
4,4,133672542,1,1.00
...,...,...,...,...
48845,4875,78849058,0,0.25
48846,4876,108797464,1,1.00
48847,4877,161110219,0,0.25
48848,4878,74177185,1,0.50


In [7]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = hmmer[hmmer['protein_match'] == 1]
minority_class = hmmer[hmmer['protein_match'] == 0]

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [8]:
balanced_data

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
37479,3287,171351561,1,0.500000
42508,3431,62658808,1,0.333333
16645,1991,139826652,1,0.333333
17440,2786,179301357,1,1.000000
26624,2216,16609299,1,1.000000
...,...,...,...,...
48833,4863,70996712,0,0.285714
48839,4869,14456723,0,0.250000
48845,4875,78849058,0,0.250000
48847,4877,161110219,0,0.250000


In [9]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 33)

In [10]:
#initiate random structure target while structure component is developed

df['structure_match'] =  np.random.randint(0,2,len(df))

In [11]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       ,'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0_x', 'Unnamed: 0_y', 'thermo_index', 
                       'm_protein_desc', 't_protein_desc'])

In [12]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'prot_pair_index', 'm_protein_seq', 't_protein_seq',
       'm_protein_len', 't_protein_len', 'protein_match', 'structure_match'],
      dtype='object')

In [13]:
#deal with NaN's
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(axis=1, how='any')

In [14]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
prot_pair_index                      0
m_protein_seq                        0
t_protein_seq                        0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
structure_match                      0
dtype: int64


array([0])

In [15]:
test_df = df.drop(columns=['m_protein_seq', 't_protein_seq'])

In [16]:
test_df = test_df.drop(columns = ['prot_pair_index'])
test_df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,protein_match,structure_match
0,0.333333,0.267380,0.254453,148,0.791444,163,0.791262,127,206,187,1,0
1,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,0,0
2,0.541126,0.500000,0.465549,231,0.924000,231,0.804878,546,287,250,1,1
3,0.270000,0.220408,0.222680,195,0.795918,210,0.875000,173,240,245,1,1
4,0.366120,0.340102,0.322115,178,0.903553,192,0.876712,214,219,197,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,183,224,0,1
17104,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,271,353,0,1
17105,0.417989,0.389163,0.392060,190,0.935961,187,0.935000,314,200,203,1,0
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,331,338,0,1


In [17]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(test_df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(14541, 12)
(2567, 12)
(12359, 12)
(2182, 12)


In [18]:
#ID target and features, separate into separate arrays

target = ['protein_match', 'structure_match']
input_features = [columns for columns in test_df.drop(columns=target)]
# input_features.remove(target)

In [19]:
input_features

['local_gap_compressed_percent_id',
 'scaled_local_query_percent_id',
 'scaled_local_symmetric_percent_id',
 'query_align_len',
 'query_align_cov',
 'subject_align_len',
 'subject_align_cov',
 'bit_score',
 'm_protein_len',
 't_protein_len']

In [20]:
target

['protein_match', 'structure_match']

In [21]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values
test_y = test[target].values

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(14541, 10) (2567, 10) (14541, 2) (2567, 2)


In [22]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values
val_y = val[target].values

Scale the data

In [23]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [25]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier()

model.fit(train_X, train_y)

Test the model, report relevant statistics

In [26]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.39092575618698444
[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [1 0]
 [1 1]]


In [27]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

true_labels = test_y

predicted_labels = preds

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)

# Calculate precision
precision = precision_score(true_labels, predicted_labels, average=None)

# Calculate recall
recall = recall_score(true_labels, predicted_labels, average=None)

# Calculate F1 score
f1 = f1_score(true_labels, predicted_labels, average=None)

# Print the scores
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.36190105181145305
Precision: [0.78373702 0.48171701]
Recall: [0.69907407 0.48363927]
F1 Score: [0.73898858 0.48267622]


In [28]:
df = df.drop(columns='prot_pair_index')

In [30]:
from sklearn.metrics import accuracy_score, \
    confusion_matrix, \
    roc_auc_score,\
    average_precision_score,\
    auc,\
    roc_curve, f1_score, recall_score, matthews_corrcoef, auc

In [31]:
#define evalute_model function for use in the main PairProphet pipeline

def evaluate_model(output_path, model, target: list, dataframe):
    '''
    Takes a trained model and test data and tests the model.
    Runs a single or multi-class Classifier depending on input.

    Args:
        output path: File path: str
        model: sklearn.neighbors.KNeighborsClassifier
        target: target for classifier (list)
        dataframe: pandas dataframe

    Returns:
        Vector of predictions (numpy arrray)
        precision score (numpy array)
        results (csv)
    '''
    from sklearn.metrics import accuracy_score, \
    confusion_matrix, \
    roc_auc_score,\
    average_precision_score,\
    auc,\
    roc_curve, f1_score, recall_score, matthews_corrcoef, auc
    
    if 'structure_match' not in target:
        # initialize empty eval results file
        F = open('evaluationResults.txt', 'w')

        F.write('Evaluation Scale:' + '\n')
        F.write('0.0% <=Accuracy<= 100.0%' + '\n')
        F.write('0.0 <=AUC<= 1.0' + '\n')  # area under curve
        F.write('0.0 <=auPR<= 1.0' + '\n')  # average_Precision
        F.write('0.0 <=F1_Score<= 1.0' + '\n')
        F.write('-1.0 <=MCC<= 1.0' + '\n')
        F.write('_______________________________________' + '\n')

        results_df = dataframe[['m_protein_seq', 't_protein_seq']]

        dataframe = dataframe.drop(columns=['m_protein_seq', 't_protein_seq'])

        features = [columns for columns in dataframe.drop(columns=target)]

        # split into input and output feature(s)
        test_X = dataframe[features].values
        test_y = dataframe[target].values.reshape(-1, 1)

        # scale data
        scaler = sklearn.preprocessing.StandardScaler()
        test_X = scaler.fit_transform(test_X)

        # test input arguments
        assert "sklearn" in str(type(model))
        assert "numpy.ndarray" in str(type(test_X))
        assert "numpy.ndarray" in str(type(test_y))

        # vector of predictions
        preds = model.predict(test_X)

        # Calculate ROC Curve and Area the Curve
        proba_y = model.predict_proba(test_X)[:, 1]
        FPR, TPR, _ = roc_curve(test_y, proba_y, pos_label=1)
        roc_auc = auc(FPR, TPR)

        # calculate scoring metrics
        # include option to return these scores
        accuracy = 100*(accuracy_score(y_pred=preds, y_true=test_y))
        avg_precision = average_precision_score(
                y_true=test_y,
                y_score=proba_y,
                pos_label=1)
        F1_Score = f1_score(y_true=test_y, y_pred=preds, pos_label=1)
        MCC = matthews_corrcoef(y_true=test_y, y_pred=preds)
        Recall = recall_score(y_true=test_y, y_pred=preds, pos_label=1)
        AUC = roc_auc

        # confusion_matrix = sklearn.metrics.confusion_matrix(
        #     y_pred=preds, y_true=test_y)
        # sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

        F.write('Accuracy: {0:.4f}%\n'.format(accuracy))
        F.write('AUC: {0:.4f}\n'.format(AUC))
        F.write(
            'auPR: {0:.4f}\n'.format(avg_precision)
            )  # average_Precision
        F.write('F1_Score: {0:.4f}\n'.format(F1_Score))
        F.write('MCC: {0:.4f}\n'.format(MCC))

    #         TN, FP, FN, TP = CM.ravel()
        F.write('Recall: {0:.4f}\n'.format(Recall))
        F.write('_______________________________________' + '\n')

        # merge dataframes together to report results
        results_df['prediction'] = preds

        # save to csv
        results_df.to_csv('predictions.csv')

    else:
         # initialize empty eval results file
        F = open('evaluationResults.txt', 'w')

        F.write('Evaluation Scale:' + '\n')
        F.write('0.0% <=Accuracy<= 100.0%' + '\n')
        F.write('0.0 <=AUC<= 1.0' + '\n')  # area under curve
        F.write('0.0 <=auPR<= 1.0' + '\n')  # average_Precision
        F.write('0.0 <=F1_Score<= 1.0' + '\n')
        F.write('-1.0 <=MCC<= 1.0' + '\n')
        F.write('_______________________________________' + '\n')

        results_df = dataframe[['m_protein_seq', 't_protein_seq']]

        dataframe = dataframe.drop(columns=['m_protein_seq', 't_protein_seq'])

        features = [columns for columns in dataframe.drop(columns=target)]

        # split into input and output feature(s)
        test_X = dataframe[features].values
        test_y = dataframe[target].values

        # scale data
        scaler = sklearn.preprocessing.StandardScaler()
        test_X = scaler.fit_transform(test_X)

        # test input arguments
        assert "sklearn" in str(type(model))
        assert "numpy.ndarray" in str(type(test_X))
        assert "numpy.ndarray" in str(type(test_y))

        # vector of predictions
        preds = model.predict(test_X)
        hmmer_preds = preds[:, 0]
        structure_preds = preds[:, 1]
        assert len(hmmer_preds) == len(structure_preds)

        # calculate scoring metrics
        accuracy = 100*accuracy_score(y_pred=preds, y_true=test_y)
        Precision = precision_score(
                y_pred=preds,
                y_true=test_y,
                average=None,
                pos_label=1
                )
        F1_Score = f1_score(y_true=test_y, y_pred=preds, pos_label=1, average=None)
        Recall = recall_score(y_true=test_y, y_pred=preds, pos_label=1, average=None)

        print(accuracy)
        print(Precision)
        print(F1_Score)
        print(Recall)
    
        # confusion_matrix = sklearn.metrics.confusion_matrix(
        #     y_pred=preds, y_true=test_y)
        # sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

        F.write('Accuracy: {0:.4f}%\n'.format(accuracy))
        F.write('Mean Precision: {0:.4f}\n'.format(np.mean(Precision)))
        F.write('Mean F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))

    #         TN, FP, FN, TP = CM.ravel()
        F.write('Mean Recall: {0:.4f}\n'.format(np.mean(Recall)))
        F.write('_______________________________________' + '\n')

        # merge dataframes together to report results
        results_df['hmmer_prediction'] = hmmer_preds
        results_df['structure_prediction'] = structure_preds
        results_df['hmmer_structure_match'] = results_df['hmmer_prediction'] == results_df['structure_prediction']

        # save to csv
        results_df.to_csv('predictions.csv')

    return preds, results_df

In [32]:
evaluate_model('hello', model, ['protein_match', 'structure_match'], df)

82.24807107785831
[0.94225437 0.86059545]
[0.92777019 0.86059545]
[0.91372457 0.86059545]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['hmmer_prediction'] = hmmer_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['structure_prediction'] = structure_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_df['hmmer_structure_match'] = results_df['hmmer_prediction'] == results_df['structure_prediction']


(array([[1, 0],
        [0, 0],
        [1, 1],
        ...,
        [1, 0],
        [0, 1],
        [0, 0]]),
                                            m_protein_seq  \
 0      MVTTGERQRNARGEGARLRLEIVAATQALLADGETATLRSIARRAG...   
 1      MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...   
 2      MSPSPREDTRPEGEHEVKGLAFRDVGEGEQEAPPPGPPDRKERRVT...   
 3      MIEVKQLVKSFGSLTVLKGVNLTVNEKEVVVLLGASGSGKSTLLRC...   
 4      MPLRVILAEDSALMREGLVGLLDRFGHTTVAAVGDAGEVAAAVERE...   
 ...                                                  ...   
 17103  MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...   
 17104  MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...   
 17105  MFRTGVKAEIGRSLAVVGEAEDVERAVRVVLEQRPDVVLLDVHLPG...   
 17106  MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...   
 17107  MTGQGASGSETGVSDDSPITVVVVDDQELLRTGLRDLAEHDGDIAV...   
 
                                            t_protein_seq  hmmer_prediction  \
 0      MNRPTYHHGDLRAAILTEAARLVAERGAERVSLRELAREAGVSHAA...                 1 

In [33]:
results = pd.read_csv('predictions.csv')
results

Unnamed: 0.1,Unnamed: 0,m_protein_seq,t_protein_seq,hmmer_prediction,structure_prediction,hmmer_structure_match
0,0,MVTTGERQRNARGEGARLRLEIVAATQALLADGETATLRSIARRAG...,MNRPTYHHGDLRAAILTEAARLVAERGAERVSLRELAREAGVSHAA...,1,0,False
1,1,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,0,0,True
2,2,MSPSPREDTRPEGEHEVKGLAFRDVGEGEQEAPPPGPPDRKERRVT...,MSAADAVTVEAGAPARGWVLPLIAAALVAGMLGYLQTAELDSIEAR...,1,1,True
3,3,MIEVKQLVKSFGSLTVLKGVNLTVNEKEVVVLLGASGSGKSTLLRC...,MNAIEVRHLTKHFRKPYDKTLKGYLTSFIKGEKRYQEFTALDDVSF...,1,1,True
4,4,MPLRVILAEDSALMREGLVGLLDRFGHTTVAAVGDAGEVAAAVERE...,MIRVLLADDQHLIREAIASLLGLEPDLEVVAQVGRGDEVVAAVHVH...,1,0,False
...,...,...,...,...,...,...
17103,17103,MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...,MAESHSTTRSGQIRVFVLDDHEVVRRGVRDLLDAEPDITVVGEAET...,0,1,False
17104,17104,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,0,1,False
17105,17105,MFRTGVKAEIGRSLAVVGEAEDVERAVRVVLEQRPDVVLLDVHLPG...,MILEAEPDIVVVGEAGDGEKAVEEARALQPDVVLMDIRMPRKDGVE...,1,0,False
17106,17106,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,0,1,False


In [34]:
#formulate script to run many different classifiers on same set of data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

names = ['LR', 'KNN', 'DT', 'NB', 'RF', 'Bagging', 'AB', 'GB', 'SVM']

#list of classifiers (hyperparameters optimized)
classifiers = [
    #Regression
    LogisticRegression(),
    #KNN (neighbors optimized iteratively)
    KNeighborsClassifier(n_neighbors=20),
    #Decision Tree
    DecisionTreeClassifier(max_features=None),
    #Gaussian
    GaussianNB(),
    #RF Classifier (with optuna)
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        max_samples=0.3,
        max_features=0.5,
        min_weight_fraction_leaf=0,
        min_samples_split=17),
    #RF Classifier with bagging (with optuna)
    BaggingClassifier(sklearn.ensemble.RandomForestClassifier
    (n_estimators=200, max_depth=None, 
     min_weight_fraction_leaf=0.000215),max_samples=0.5, 
    max_features=0.5),
    #AdaBoost (with optuna)
    AdaBoostClassifier(n_estimators=53, learning_rate=0.156),
    #Gradient Boosting (with optuna)
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                max_depth=1),   
    #C-support vector classification (9)
#     SVC(),
]

F = open('evaluationResults.txt', 'w')

F.write('Evaluation Scale:'+'\n')
F.write('0.0% <=Accuracy<= 100.0%'+'\n')
F.write('0.0 <=AUC<= 1.0'+'\n') #area under curve
F.write('0.0 <=auPR<= 1.0'+'\n')  # average_Precision
F.write('0.0 <=F1_Score<= 1.0'+'\n')
F.write('-1.0 <=MCC<= 1.0'+'\n')
F.write('_______________________________________'+'\n')

def runClassifiers(filename:str, dataframe, columns=[], target=[], model=RandomForestClassifier()):

    """
    Takes dataframe and splits it into a training and testing set. 
    Trains a RF Classifier with data.

    Params
    ----------
    dataframe: Pandas dataframe
    columns: list of strings, representing input features
    target: list of strings, representing target feature(s)

    Returns
    -------
    -Accuracy score
    -area under ROC curve
    -train data (target)
    -validation data (features)
    -validation data (target)
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    dev_X = dev[columns].values
    test_X = test[columns].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    # test input arguments
    assert "pandas.core.frame.DataFrame" in str(type(train))
    assert "pandas.core.frame.DataFrame" in str(type(val))
    assert "str" in str(type(columns[0]))
    assert "str" in str(type(target[0]))

    # split into input and output feature(s)
    train_X = train[columns].values
    val_X = val[columns].values

    train_y = train[target].values.reshape(-1, 1)
    val_y = val[target].values.reshape(-1, 1)

    # scale data
    scaler = sklearn.preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)
    val_X = scaler.fit_transform(val_X)

    Results = {}  # compare algorithms

    from sklearn.metrics import accuracy_score, \
        confusion_matrix, \
        roc_auc_score,\
        average_precision_score,\
        auc,\
        roc_curve, f1_score, recall_score, matthews_corrcoef, auc

    for classifier, name in zip(classifiers, names):
        accuracy = []
        avg_precision = []
        F1_Score = []
        AUC = []
        MCC = []
        Recall = []
        
        mean_TPR = 0.0
        mean_FPR = np.linspace(0, 1, 100)

        print('{} is done.'.format(classifier.__class__.__name__))

        model = classifier

        # model
        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        # Calculate ROC Curve and Area the Curve
        proba_y = model.predict_proba(val_X)[:,1]
        FPR, TPR, _ = roc_curve(val_y, proba_y, pos_label=1)
        roc_auc = auc(FPR, TPR)
        
        #calculate scoring metrics
        #include option to return these scores
        accuracy.append(accuracy_score(y_pred=preds, y_true=val_y))
        avg_precision.append(average_precision_score(y_true=val_y, y_score=proba_y, pos_label=1))
        F1_Score.append(f1_score(y_true=val_y, y_pred=preds, pos_label=1))
        MCC.append(matthews_corrcoef(y_true=val_y, y_pred=preds))
        Recall.append(recall_score(y_true=val_y, y_pred=preds, pos_label=1))
        AUC.append(roc_auc)

        confusion_matrix = sklearn.metrics.confusion_matrix(y_pred=preds, y_true=val_y)
        sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

        accuracy = [_*100.0 for _ in accuracy]
        Results[name + ' Accuracy, F1 Score'] = [accuracy, F1_Score]
        
        F.write('Classifier: {}\n'.format(name))
        F.write('Accuracy: {0:.4f}%\n'.format(np.mean(accuracy)))
        F.write('AUC: {0:.4f}\n'.format( np.mean(AUC)))
        F.write('auPR: {0:.4f}\n'.format(np.mean(avg_precision))) # average_Precision
        F.write('F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))
        F.write('MCC: {0:.4f}\n'.format(np.mean(MCC)))

#         TN, FP, FN, TP = CM.ravel()
        F.write('Recall: {0:.4f}\n'.format( np.mean(Recall)) )
        F.write('_______________________________________'+'\n')
    
    F.close()
    
    return Results, model


# if __name__ == '__main__':
#     # print('Please, enter number of cross validation:')
#     import argparse
#     p = argparse.ArgumentParser(description='Run Machine Learning Classifiers.')

#     p.add_argument('-cv', '--nFCV', type=int, help='Number of crossValidation', default=10)
#     p.add_argument('-data', '--dataset', type=str, help='~/dataset.csv', default='optimumDataset.csv')
#     p.add_argument('-roc', '--auROC', type=int, help='Print ROC Curve', default=1, choices=[0, 1])
#     p.add_argument('-box', '--boxPlot', type=int, help='Print Accuracy Box Plaot', default=1, choices=[0, 1])

#     args = p.parse_args()

#     runClassifiers(args)

In [35]:
#implement optional k-fold cross validation

def k_fold_cross_val(dataframe, n_splits=10):  
    """
    Runs k-fold cross validation on dataset.
    Default = 10-fold.

    Params
    ----------
    -dataframe: Pandas dataframe
    -n_splits: Number of cross validations (int)

    Returns
    -------
    -vector of predictions
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    target = 'protein_match'
    input_features = [columns for columns in df]
    input_features.remove(target)
    
    dev_X = dev[input_features].values
    test_X = test[input_features].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    from sklearn.model_selection import StratifiedKFold

    cv = StratifiedKFold(n_splits, shuffle=True)

    for (train_index, test_index) in cv.split(dev_X, dev_y):

        train_X = dev_X[train_index]
        val_X = dev_X[test_index]

        train_y = dev_y[train_index]
        val_y = dev_y[test_index]

        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        return preds