This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [2]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [3]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [4]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [5]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [6]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    
    feature_dict_a = get_protein_descriptors(fasta_files[0], descriptors)
    
    feature_dict_b = get_protein_descriptors(fasta_files[1], descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict_a[desc].index = feature_dict_a[desc].index.astype(int)
        features_a = feature_dict_a[desc].reset_index()
        
        feature_dict_b[desc].index = feature_dict_b[desc].index.astype(int)
        features_b = feature_dict_b[desc].reset_index()
    
    feature_df = pd.merge(
            df,
            features_a,
            how='outer',
            left_index=True,
            right_index=True)
    
    new_df = pd.merge(
            feature_df,
            features_b,
            how='outer',
            left_index=True,
            right_index=True)

    return new_df

In [7]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [8]:
target = pd.read_csv('protein_match_50k')
target['protein_match'] = target['protein_match'].map({'Yes': 1, 'No': 0})
target

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
0,0,48641291,1,1.00
1,1,92992745,1,1.00
2,2,157628663,1,1.00
3,3,136708305,1,1.00
4,4,133672542,1,1.00
...,...,...,...,...
48845,4875,78849058,0,0.25
48846,4876,108797464,1,1.00
48847,4877,161110219,0,0.25
48848,4878,74177185,1,0.50


In [9]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 1]
minority_class = target[target['protein_match'] == 0]

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [10]:
balanced_data

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
15519,865,45982583,1,1.000000
25031,623,65376717,1,0.600000
20705,1168,96763833,1,0.400000
48450,4480,16236144,1,0.500000
47129,3159,151972561,1,0.500000
...,...,...,...,...
48833,4863,70996712,0,0.285714
48839,4869,14456723,0,0.250000
48845,4875,78849058,0,0.250000
48847,4877,161110219,0,0.250000


In [11]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 33)

In [12]:
df

Unnamed: 0,Unnamed: 0_x,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len,Unnamed: 0_y,protein_match,Jaccard_Score
0,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,875,...,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269,3,1,1.00
1,6,0.283582,0.231707,0.240506,133,0.810976,134,0.881579,99,7134,...,24.5,MGSTDRPDLAAMLAPLTRTLIAMERPVLETYGLTMWAYSVLVALSR...,MGHKDELIARIQRAEREFRRSVVSQAATDFFSVDLTMPQLRVVFFL...,MarR family transcriptional regulator,MarR family transcriptional regulator,152,164,6,1,1.00
2,9,0.333333,0.295082,0.291498,218,0.893443,221,0.884000,158,875,...,24.0,MQIKDSVAVVTGGASGLGLATTKRLLDAGGSVVVIDLKGEDVVAEL...,MTGTVVITGGSRGIGAATAVLAAERGWQVAVSFRERRDAAEQVVRR...,3-hydroxyacyl-CoA dehydrogenase,SDR family oxidoreductase,250,244,9,1,0.75
3,11,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,14963,...,31.5,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,glycosyltransferase family 9 protein,S-methyl-5'-thioadenosine phosphorylase,375,269,11,0,0.00
4,18,0.394495,0.346774,0.369099,108,0.870968,106,0.972477,130,14963,...,24.5,MHELSIAQSVVEAVCERADGRRVRSVRMVVGALCAVVPDSMLFCFE...,MGLCEAIVDAVLQRAEGRRVRSVRVRVAGHPVVREVVDQGFAMAAA...,hydrogenase maturation nickel metallochaperone...,hydrogenase maturation nickel metallochaperone...,109,124,18,1,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,49992,0.331707,0.294372,0.282158,202,0.874459,218,0.868526,232,11324,...,24.0,MSTLVADTVSKRFGGVTALSNVSLELREGEIHGLIGPNGSGKTTLL...,MLEVKNIDLFYGASRALRSVSLTAQAGQVTAVMGRNGVGKTSLIRA...,ABC transporter ATP-binding protein,urea ABC transporter ATP-binding subunit UrtE,251,231,4872,1,0.50
17104,49993,0.304878,0.188679,0.163666,166,0.626415,170,0.491329,125,875,...,22.0,MQVGDATVVITGASSGIGRATALAFADAGANLVLTARAKNPLRAAE...,MGRLSGKRAVVTGGATLIGQAVVRTLHREGARVAIADIDEAGAKAV...,SDR family oxidoreductase,SDR family oxidoreductase,346,265,4873,1,0.60
17105,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,14963,...,24.5,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,aminoglycoside phosphotransferase family protein,serine/threonine protein kinase,271,353,4875,0,0.25
17106,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,7134,...,24.5,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,331,338,4877,0,0.25


In [13]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0_x', 'Unnamed: 0_y', 'thermo_index','m_protein_seq', 't_protein_seq', 
                       'm_protein_desc', 't_protein_desc'])

In [14]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len', 'protein_match'],
      dtype='object')

In [15]:
df = df.replace([np.inf, -np.inf], np.nan)

In [18]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
dtype: int64


array([0])

In [17]:
df = df.dropna(axis=1, how='any')

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [19]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


ValueError: number sections must be larger than 0.

In [66]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,AAC_K,AAC_Q,AAC_I,QSOrder_Grantham.Xd.1,AAC_S,AAC_N,AAC_F,AAC_D,GAAC_uncharge,QSOrder_Schneider.Xd.2
0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,206,202,...,-0.029991,-0.020571,-0.029991,-0.064588,-0.015524,-0.019898,-0.015140,0.013410,-0.033452,0.022842
1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,233,237,...,-0.007497,0.021821,-0.016008,-0.093397,0.004944,-0.016443,0.008946,0.035711,-0.017982,0.055143
2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,287,252,...,-0.017325,0.002033,-0.035134,0.034032,-0.067847,-0.018777,-0.004355,0.027003,-0.114015,0.005861
3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,234,269,...,0.008722,-0.000381,-0.018238,-0.082094,0.003511,-0.006323,0.010946,-0.016919,-0.020144,0.055484
4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,80,66,...,-0.032955,0.022348,0.001894,-0.024133,0.037500,0.007197,-0.066667,-0.005303,0.051136,0.049946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,370,356,...,0.002855,0.030626,-0.015958,0.004543,-0.001488,-0.007319,-0.015639,0.028135,0.023504,-0.002964
6344,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,298,260,...,-0.001962,0.004750,-0.005885,0.052205,0.010403,-0.000077,0.011461,0.001652,0.007073,-0.015008
6345,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,306,303,...,0.009707,-0.003494,0.035462,-0.063804,0.005986,0.022811,-0.000356,-0.000453,0.040866,-0.007821
6346,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,78,78,...,0.025641,0.000000,0.000000,0.023367,0.051282,0.000000,-0.012821,-0.012821,-0.038462,0.012262


In [20]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(14541, 11)
(2567, 11)
(12359, 11)
(2182, 11)


In [21]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [22]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(14541, 10) (2567, 10) (14541, 1) (2567, 1)


In [23]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [24]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [43]:
def k_fold_cross_val(dataframe, n_splits=10):  
    """
    Runs k-fold cross validation on dataset.
    Default = 10-fold.

    Params
    ----------
    -dataframe: Pandas dataframe
    -n_splits: Number of cross validations (int)

    Returns
    -------
    -vector of predictions
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    target = 'protein_match'
    input_features = [columns for columns in df]
    input_features.remove(target)
    
    dev_X = dev[input_features].values
    test_X = test[input_features].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    from sklearn.model_selection import StratifiedKFold

    cv = StratifiedKFold(n_splits, shuffle=True)

    for (train_index, test_index) in cv.split(dev_X, dev_y):

        train_X = dev_X[train_index]
        val_X = dev_X[test_index]

        train_y = dev_y[train_index]
        val_y = dev_y[test_index]

        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        return preds

In [25]:
preds

NameError: name 'preds' is not defined

In [27]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier()

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [28]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.7653528872593951
[0 0 1 ... 0 1 0]


In [29]:
proba_y = model.predict_proba(val_X)[:,1]

In [30]:
proba_y

array([0.36, 0.92, 0.4 , ..., 1.  , 0.28, 0.37])

In [31]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fab12e5e790>

In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

names = ['LR', 'KNN', 'DT', 'NB', 'RF', 'Bagging', 'AB', 'GB', 'SVM']

#list of classifiers (hyperparameters optimized)
classifiers = [
    #Regression
    LogisticRegression(),
    #KNN (neighbors optimized iteratively)
    KNeighborsClassifier(n_neighbors=20),
    #Decision Tree
    DecisionTreeClassifier(max_features=None),
    #Gaussian
    GaussianNB(),
    #RF Classifier (with optuna)
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        max_samples=0.3,
        max_features=0.5,
        min_weight_fraction_leaf=0,
        min_samples_split=17),
    #RF Classifier with bagging (with optuna)
    BaggingClassifier(sklearn.ensemble.RandomForestClassifier
    (n_estimators=200, max_depth=None, 
     min_weight_fraction_leaf=0.000215),max_samples=0.5, 
    max_features=0.5),
    #AdaBoost (with optuna)
    AdaBoostClassifier(n_estimators=53, learning_rate=0.156),
    #Gradient Boosting (with optuna)
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                max_depth=1),   
    #C-support vector classification (9)
#     SVC(),
]

F = open('evaluationResults.txt', 'w')

F.write('Evaluation Scale:'+'\n')
F.write('0.0% <=Accuracy<= 100.0%'+'\n')
F.write('0.0 <=AUC<= 1.0'+'\n') #area under curve
F.write('0.0 <=auPR<= 1.0'+'\n')  # average_Precision
F.write('0.0 <=F1_Score<= 1.0'+'\n')
F.write('-1.0 <=MCC<= 1.0'+'\n')
F.write('_______________________________________'+'\n')

def runClassifiers(filename:str, dataframe, columns=[], target=[], model=RandomForestClassifier()):

    """
    Takes dataframe and splits it into a training and testing set. 
    Trains a RF Classifier with data.

    Params
    ----------
    dataframe: Pandas dataframe
    columns: list of strings, representing input features
    target: list of strings, representing target feature(s)

    Returns
    -------
    -Accuracy score
    -area under ROC curve
    -train data (target)
    -validation data (features)
    -validation data (target)
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    dev_X = dev[columns].values
    test_X = test[columns].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    # test input arguments
    assert "pandas.core.frame.DataFrame" in str(type(train))
    assert "pandas.core.frame.DataFrame" in str(type(val))
    assert "str" in str(type(columns[0]))
    assert "str" in str(type(target[0]))

    # split into input and output feature(s)
    train_X = train[columns].values
    val_X = val[columns].values

    train_y = train[target].values.reshape(-1, 1)
    val_y = val[target].values.reshape(-1, 1)

    # scale data
    scaler = sklearn.preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)
    val_X = scaler.fit_transform(val_X)

    Results = {}  # compare algorithms

    from sklearn.metrics import accuracy_score, \
        confusion_matrix, \
        roc_auc_score,\
        average_precision_score,\
        auc,\
        roc_curve, f1_score, recall_score, matthews_corrcoef, auc

    for classifier, name in zip(classifiers, names):
        accuracy = []
        avg_precision = []
        F1_Score = []
        AUC = []
        MCC = []
        Recall = []
        
        mean_TPR = 0.0
        mean_FPR = np.linspace(0, 1, 100)

        print('{} is done.'.format(classifier.__class__.__name__))

        model = classifier

        # model
        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        # Calculate ROC Curve and Area the Curve
        proba_y = model.predict_proba(val_X)[:,1]
        FPR, TPR, _ = roc_curve(val_y, proba_y, pos_label=1)
        roc_auc = auc(FPR, TPR)
        
        #calculate scoring metrics
        #include option to return these scores
        accuracy.append(accuracy_score(y_pred=preds, y_true=val_y))
        avg_precision.append(average_precision_score(y_true=val_y, y_score=proba_y, pos_label=1))
        F1_Score.append(f1_score(y_true=val_y, y_pred=preds, pos_label=1))
        MCC.append(matthews_corrcoef(y_true=val_y, y_pred=preds))
        Recall.append(recall_score(y_true=val_y, y_pred=preds, pos_label=1))
        AUC.append(roc_auc)

        confusion_matrix = sklearn.metrics.confusion_matrix(y_pred=preds, y_true=val_y)
        sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

        accuracy = [_*100.0 for _ in accuracy]
        Results[name + ' Accuracy, F1 Score'] = [accuracy, F1_Score]
        
        F.write('Classifier: {}\n'.format(name))
        F.write('Accuracy: {0:.4f}%\n'.format(np.mean(accuracy)))
        F.write('AUC: {0:.4f}\n'.format( np.mean(AUC)))
        F.write('auPR: {0:.4f}\n'.format(np.mean(avg_precision))) # average_Precision
        F.write('F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))
        F.write('MCC: {0:.4f}\n'.format(np.mean(MCC)))

#         TN, FP, FN, TP = CM.ravel()
        F.write('Recall: {0:.4f}\n'.format( np.mean(Recall)) )
        F.write('_______________________________________'+'\n')
    
    F.close()
    
    return Results


# if __name__ == '__main__':
#     # print('Please, enter number of cross validation:')
#     import argparse
#     p = argparse.ArgumentParser(description='Run Machine Learning Classifiers.')

#     p.add_argument('-cv', '--nFCV', type=int, help='Number of crossValidation', default=10)
#     p.add_argument('-data', '--dataset', type=str, help='~/dataset.csv', default='optimumDataset.csv')
#     p.add_argument('-roc', '--auROC', type=int, help='Print ROC Curve', default=1, choices=[0, 1])
#     p.add_argument('-box', '--boxPlot', type=int, help='Print Accuracy Box Plaot', default=1, choices=[0, 1])

#     args = p.parse_args()

#     runClassifiers(args)

In [None]:
result = runClassifiers('sample.csv', df, columns=input_features, target=target)

LogisticRegression is done.
KNeighborsClassifier is done.
DecisionTreeClassifier is done.
GaussianNB is done.
RandomForestClassifier is done.
BaggingClassifier is done.


In [None]:
result