This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [2]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [3]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k_exploration.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [4]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [5]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [6]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    
    feature_dict_a = get_protein_descriptors(fasta_files[0], descriptors)
    
    feature_dict_b = get_protein_descriptors(fasta_files[1], descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict_a[desc].index = feature_dict_a[desc].index.astype(int)
        features_a = feature_dict_a[desc].reset_index()
        
        feature_dict_b[desc].index = feature_dict_b[desc].index.astype(int)
        features_b = feature_dict_b[desc].reset_index()
    
    feature_df = pd.merge(
            df,
            features_a,
            how='outer',
            left_index=True,
            right_index=True)
    
    new_df = pd.merge(
            feature_df,
            features_b,
            how='outer',
            left_index=True,
            right_index=True)

    return new_df

In [7]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [8]:
target = pd.read_csv('protein_match_50k')
target['protein_match'] = target['protein_match'].map({'Yes': 1, 'No': 0})
target

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
0,0,48641291,1,1.00
1,1,92992745,1,1.00
2,2,157628663,1,1.00
3,3,136708305,1,1.00
4,4,133672542,1,1.00
...,...,...,...,...
48845,4875,78849058,0,0.25
48846,4876,108797464,1,1.00
48847,4877,161110219,0,0.25
48848,4878,74177185,1,0.50


In [9]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 1]
minority_class = target[target['protein_match'] == 0]

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [10]:
balanced_data

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
31723,2431,60539488,1,0.333333
26282,1874,87049990,1,0.500000
15250,596,148005611,1,1.000000
29439,147,163910669,1,0.666667
24293,4756,140356755,1,0.454545
...,...,...,...,...
48833,4863,70996712,0,0.285714
48839,4869,14456723,0,0.250000
48845,4875,78849058,0,0.250000
48847,4877,161110219,0,0.250000


In [11]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 33)

In [12]:
df

Unnamed: 0,Unnamed: 0_x,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len,Unnamed: 0_y,protein_match,Jaccard_Score
0,11,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,14963,...,31.5,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,glycosyltransferase family 9 protein,S-methyl-5'-thioadenosine phosphorylase,375,269,11,0,0.000000
1,12,0.354067,0.245847,0.225954,208,0.691030,216,0.610170,264,7134,...,24.5,MDSLITVRNLHKLFRVDQREDSGLWPAIKSLIRRERAEVVAVDGVS...,MSEHILETDRLTRRYGDRLAVDAVSMAVRPREVYGFLGPNGAGKTT...,ATP-binding cassette domain-containing protein,ABC transporter ATP-binding protein,354,301,12,1,0.400000
2,32,0.314554,0.241007,0.228669,218,0.784173,212,0.688312,228,875,...,22.0,MRKQYGDFHAVKGIDLDVRPGECFGLLGPNGAGKSTTMRMLAGTSQ...,MTNGHTDLSGSGLTLGYDRRIVSRDLDVSIPPGSFTVIIGPNACGK...,ATP-binding cassette domain-containing protein,ABC transporter ATP-binding protein,308,278,30,0,0.222222
3,33,0.487685,0.452055,0.458333,202,0.922374,203,0.953052,480,4508,...,22.5,MAWTREQMAARAAKELQDGFYVNLGIGIPTLVANYIPDGVHVTLQS...,MDMLVEAAPLDRHGMARRLAADIPEGWVVNLGIGIPTLVSDHVPPE...,3-oxoacid CoA-transferase subunit B,3-oxoacid CoA-transferase subunit B,213,219,31,1,1.000000
4,36,0.298701,0.205357,0.206897,254,0.755952,238,0.719033,128,11324,...,21.5,MSATDTPSRPLASLDIKKEVERYALVLGWIFLIAIFGLLLPDTFLT...,MDLLNAIVALLNFVIIPATAYGAQLALGALGVTMIYGILRFSNFAH...,ABC transporter permease,branched-chain amino acid ABC transporter perm...,331,336,34,1,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,49989,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,7134,...,22.5,MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...,MAESHSTTRSGQIRVFVLDDHEVVRRGVRDLLDAEPDITVVGEAET...,response regulator transcription factor,response regulator transcription factor,183,224,4869,0,0.250000
17104,49994,0.322368,0.303406,0.294737,319,0.987616,335,0.979532,122,875,...,22.0,MLTVSYVGAHKIEIREQDPEPVHAGQVQVEVAYAGICGTDLHILHG...,MRAVTISEPGGPEKLQWTEVPDPRPGAGEVLLEVVASAVNRADVLQ...,alcohol dehydrogenase catalytic domain-contain...,NAD(P)H-quinone oxidoreductase,342,323,4874,1,0.600000
17105,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,14963,...,24.5,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,aminoglycoside phosphotransferase family protein,serine/threonine protein kinase,271,353,4875,0,0.250000
17106,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,7134,...,24.5,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,331,338,4877,0,0.250000


In [13]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       ,'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0_x', 'Unnamed: 0_y', 'thermo_index', 
                       'm_protein_desc', 't_protein_desc'])

In [14]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'prot_pair_index', 'm_protein_seq', 't_protein_seq',
       'm_protein_len', 't_protein_len', 'protein_match'],
      dtype='object')

In [15]:
df = df.replace([np.inf, -np.inf], np.nan)

In [16]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
prot_pair_index                      0
m_protein_seq                        0
t_protein_seq                        0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
dtype: int64


array([0])

In [17]:
df = df.dropna(axis=1, how='any')

In [18]:
df = df.drop(columns=['m_protein_seq', 't_protein_seq'])

In [19]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'prot_pair_index', 'm_protein_len', 't_protein_len',
       'protein_match'],
      dtype='object')

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [87]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 1/1 [00:00<00:00, 669.80it/s]


['t_protein_len']

In [88]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_len,t_protein_len,t_protein_len.1
0,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,133672542,80,66,66
1,0.333333,0.295082,0.291498,218,0.893443,221,0.884000,158,85102366,250,244,244
2,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,37128170,375,269,269
3,0.270000,0.220408,0.222680,195,0.795918,210,0.875000,173,114387951,240,245,245
4,0.394495,0.346774,0.369099,108,0.870968,106,0.972477,130,166997534,109,124,124
...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.251397,0.140187,0.134529,186,0.579439,181,0.520115,117,135090204,348,321,321
17104,0.304878,0.188679,0.163666,166,0.626415,170,0.491329,125,133762466,346,265,265
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,78849058,271,353,353
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,161110219,331,338,338


In [20]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(14541, 12)
(2567, 12)
(12359, 12)
(2182, 12)


In [21]:
val

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_len,t_protein_len,protein_match
1968,0.314516,0.157258,0.174107,128,0.516129,122,0.610000,127,157036297,200,248,0
6217,0.371429,0.237443,0.232662,157,0.716895,137,0.600877,119,144857841,228,219,0
4435,0.331897,0.292776,0.308000,239,0.908745,229,0.966245,319,103555510,237,263,1
10376,0.597156,0.591549,0.565022,210,0.985915,210,0.901288,571,72720308,233,213,1
7375,0.309179,0.252964,0.259109,213,0.841897,205,0.850622,267,124384823,241,253,1
...,...,...,...,...,...,...,...,...,...,...,...,...
13199,0.325444,0.255814,0.248307,169,0.786046,179,0.785088,175,42719131,228,215,1
16196,0.324742,0.272727,0.233333,195,0.844156,206,0.666667,206,79907519,309,231,0
10844,0.309417,0.274900,0.281059,228,0.908367,218,0.908333,180,181100188,240,251,0
6910,0.241525,0.200000,0.212687,246,0.863158,233,0.928287,173,51999008,251,285,0


In [15]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [19]:
input_features

['local_gap_compressed_percent_id',
 'scaled_local_query_percent_id',
 'scaled_local_symmetric_percent_id',
 'query_align_len',
 'query_align_cov',
 'subject_align_len',
 'subject_align_cov',
 'bit_score',
 'prot_pair_index',
 'm_protein_seq',
 't_protein_seq',
 'm_protein_len',
 't_protein_len']

In [24]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(14541, 11) (2567, 11) (14541, 1) (2567, 1)


In [25]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [26]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [27]:
def k_fold_cross_val(dataframe, n_splits=10):  
    """
    Runs k-fold cross validation on dataset.
    Default = 10-fold.

    Params
    ----------
    -dataframe: Pandas dataframe
    -n_splits: Number of cross validations (int)

    Returns
    -------
    -vector of predictions
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    target = 'protein_match'
    input_features = [columns for columns in df]
    input_features.remove(target)
    
    dev_X = dev[input_features].values
    test_X = test[input_features].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    from sklearn.model_selection import StratifiedKFold

    cv = StratifiedKFold(n_splits, shuffle=True)

    for (train_index, test_index) in cv.split(dev_X, dev_y):

        train_X = dev_X[train_index]
        val_X = dev_X[test_index]

        train_y = dev_y[train_index]
        val_y = dev_y[test_index]

        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        return preds

In [28]:
preds

NameError: name 'preds' is not defined

In [29]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier()

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [57]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

NameError: name 'val_X' is not defined

In [58]:
proba_y = model.predict_proba(val_X)[:,1]

NameError: name 'val_X' is not defined

In [59]:
proba_y

NameError: name 'proba_y' is not defined

In [None]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

names = ['LR', 'KNN', 'DT', 'NB', 'RF', 'Bagging', 'AB', 'GB', 'SVM']

#list of classifiers (hyperparameters optimized)
classifiers = [
    #Regression
    LogisticRegression(),
    #KNN (neighbors optimized iteratively)
    KNeighborsClassifier(n_neighbors=20),
    #Decision Tree
    DecisionTreeClassifier(max_features=None),
    #Gaussian
    GaussianNB(),
    #RF Classifier (with optuna)
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        max_samples=0.3,
        max_features=0.5,
        min_weight_fraction_leaf=0,
        min_samples_split=17),
    #RF Classifier with bagging (with optuna)
    BaggingClassifier(sklearn.ensemble.RandomForestClassifier
    (n_estimators=200, max_depth=None, 
     min_weight_fraction_leaf=0.000215),max_samples=0.5, 
    max_features=0.5),
    #AdaBoost (with optuna)
    AdaBoostClassifier(n_estimators=53, learning_rate=0.156),
    #Gradient Boosting (with optuna)
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                max_depth=1),   
    #C-support vector classification (9)
#     SVC(),
]

F = open('evaluationResults.txt', 'w')

F.write('Evaluation Scale:'+'\n')
F.write('0.0% <=Accuracy<= 100.0%'+'\n')
F.write('0.0 <=AUC<= 1.0'+'\n') #area under curve
F.write('0.0 <=auPR<= 1.0'+'\n')  # average_Precision
F.write('0.0 <=F1_Score<= 1.0'+'\n')
F.write('-1.0 <=MCC<= 1.0'+'\n')
F.write('_______________________________________'+'\n')

def runClassifiers(filename:str, dataframe, columns=[], target=[], model=RandomForestClassifier()):

    """
    Takes dataframe and splits it into a training and testing set. 
    Trains a RF Classifier with data.

    Params
    ----------
    dataframe: Pandas dataframe
    columns: list of strings, representing input features
    target: list of strings, representing target feature(s)

    Returns
    -------
    -Accuracy score
    -area under ROC curve
    -train data (target)
    -validation data (features)
    -validation data (target)
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    dev_X = dev[columns].values
    test_X = test[columns].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    # test input arguments
    assert "pandas.core.frame.DataFrame" in str(type(train))
    assert "pandas.core.frame.DataFrame" in str(type(val))
    assert "str" in str(type(columns[0]))
    assert "str" in str(type(target[0]))

    # split into input and output feature(s)
    train_X = train[columns].values
    val_X = val[columns].values

    train_y = train[target].values.reshape(-1, 1)
    val_y = val[target].values.reshape(-1, 1)

    # scale data
    scaler = sklearn.preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)
    val_X = scaler.fit_transform(val_X)

    Results = {}  # compare algorithms

    from sklearn.metrics import accuracy_score, \
        confusion_matrix, \
        roc_auc_score,\
        average_precision_score,\
        auc,\
        roc_curve, f1_score, recall_score, matthews_corrcoef, auc

    for classifier, name in zip(classifiers, names):
        accuracy = []
        avg_precision = []
        F1_Score = []
        AUC = []
        MCC = []
        Recall = []
        
        mean_TPR = 0.0
        mean_FPR = np.linspace(0, 1, 100)

        print('{} is done.'.format(classifier.__class__.__name__))

        model = classifier

        # model
        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        # Calculate ROC Curve and Area the Curve
        proba_y = model.predict_proba(val_X)[:,1]
        FPR, TPR, _ = roc_curve(val_y, proba_y, pos_label=1)
        roc_auc = auc(FPR, TPR)
        
        #calculate scoring metrics
        #include option to return these scores
        accuracy.append(accuracy_score(y_pred=preds, y_true=val_y))
        avg_precision.append(average_precision_score(y_true=val_y, y_score=proba_y, pos_label=1))
        F1_Score.append(f1_score(y_true=val_y, y_pred=preds, pos_label=1))
        MCC.append(matthews_corrcoef(y_true=val_y, y_pred=preds))
        Recall.append(recall_score(y_true=val_y, y_pred=preds, pos_label=1))
        AUC.append(roc_auc)

        confusion_matrix = sklearn.metrics.confusion_matrix(y_pred=preds, y_true=val_y)
        sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

        accuracy = [_*100.0 for _ in accuracy]
        Results[name + ' Accuracy, F1 Score'] = [accuracy, F1_Score]
        
        F.write('Classifier: {}\n'.format(name))
        F.write('Accuracy: {0:.4f}%\n'.format(np.mean(accuracy)))
        F.write('AUC: {0:.4f}\n'.format( np.mean(AUC)))
        F.write('auPR: {0:.4f}\n'.format(np.mean(avg_precision))) # average_Precision
        F.write('F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))
        F.write('MCC: {0:.4f}\n'.format(np.mean(MCC)))

#         TN, FP, FN, TP = CM.ravel()
        F.write('Recall: {0:.4f}\n'.format( np.mean(Recall)) )
        F.write('_______________________________________'+'\n')
    
    F.close()
    
    return Results, model


# if __name__ == '__main__':
#     # print('Please, enter number of cross validation:')
#     import argparse
#     p = argparse.ArgumentParser(description='Run Machine Learning Classifiers.')

#     p.add_argument('-cv', '--nFCV', type=int, help='Number of crossValidation', default=10)
#     p.add_argument('-data', '--dataset', type=str, help='~/dataset.csv', default='optimumDataset.csv')
#     p.add_argument('-roc', '--auROC', type=int, help='Print ROC Curve', default=1, choices=[0, 1])
#     p.add_argument('-box', '--boxPlot', type=int, help='Print Accuracy Box Plaot', default=1, choices=[0, 1])

#     args = p.parse_args()

#     runClassifiers(args)

In [17]:
input_features.remove('m_protein_seq')

In [18]:
input_features.remove('t_protein_seq')

In [19]:
input_features.remove('prot_pair_index')

In [20]:
input_features

['local_gap_compressed_percent_id',
 'scaled_local_query_percent_id',
 'scaled_local_symmetric_percent_id',
 'query_align_len',
 'query_align_cov',
 'subject_align_len',
 'subject_align_cov',
 'bit_score',
 'm_protein_len',
 't_protein_len']

In [21]:
new_df = df.drop(columns=['m_protein_seq', 't_protein_seq'])

In [22]:
new_df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_len,t_protein_len,protein_match
0,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,37128170,375,269,0
1,0.354067,0.245847,0.225954,208,0.691030,216,0.610170,264,55279368,354,301,1
2,0.314554,0.241007,0.228669,218,0.784173,212,0.688312,228,107256987,308,278,0
3,0.487685,0.452055,0.458333,202,0.922374,203,0.953052,480,121070359,213,219,1
4,0.298701,0.205357,0.206897,254,0.755952,238,0.719033,128,131454663,331,336,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,14456723,183,224,0
17104,0.322368,0.303406,0.294737,319,0.987616,335,0.979532,122,46032974,342,323,1
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,78849058,271,353,0
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,161110219,331,338,0


In [23]:
target = 'protein_match'

In [24]:
result = runClassifiers('sample.csv', df, columns=input_features, target=target)

LogisticRegression is done.
KNeighborsClassifier is done.
DecisionTreeClassifier is done.
GaussianNB is done.
RandomForestClassifier is done.
BaggingClassifier is done.
AdaBoostClassifier is done.
GradientBoostingClassifier is done.


In [25]:
model = result[1]
type(model)

sklearn.ensemble._gb.GradientBoostingClassifier

In [38]:
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_len,t_protein_len,protein_match
0,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,37128170,375,269,0
1,0.322430,0.264368,0.245552,232,0.888889,208,0.691030,227,172639070,301,261,1
2,0.270000,0.220408,0.222680,195,0.795918,210,0.875000,173,114387951,240,245,1
3,0.268908,0.232727,0.247582,239,0.869091,236,0.975207,191,157332201,242,275,1
4,0.497143,0.426471,0.429630,175,0.857843,174,0.865672,422,101047443,201,204,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.328000,0.278912,0.299817,278,0.945578,244,0.964427,166,136836774,253,294,1
17104,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,14456723,183,224,0
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,78849058,271,353,0
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,161110219,331,338,0


In [28]:
from sklearn.metrics import accuracy_score, \
    confusion_matrix, \
    roc_auc_score,\
    average_precision_score,\
    auc,\
    roc_curve, f1_score, recall_score, matthews_corrcoef, auc

In [34]:
def evaluate_model(output_path, model, dataframe):
    '''
    Takes a trained model and test data and tests the model.

    Args:
        output path: File path: str
        model: sklearn.neighbors.KNeighborsClassifier
        test_X: numpy array
        test_y: numpy array

    Returns:
        Vector of predictions (numpy arrray)
    '''
    # initialize empty eval results file
    F = open('evaluationResults.txt', 'w')

    F.write('Evaluation Scale:' + '\n')
    F.write('0.0% <=Accuracy<= 100.0%' + '\n')
    F.write('0.0 <=AUC<= 1.0' + '\n')  # area under curve
    F.write('0.0 <=auPR<= 1.0' + '\n')  # average_Precision
    F.write('0.0 <=F1_Score<= 1.0' + '\n')
    F.write('-1.0 <=MCC<= 1.0' + '\n')
    F.write('_______________________________________' + '\n')

    model = model

    df_seqs = dataframe[['m_protein_seq', 't_protein_seq', 'prot_pair_index']]

    dataframe = dataframe.drop(columns=['m_protein_seq', 't_protein_seq', 'prot_pair_index'])

    target = 'protein_match'
    features = [columns for columns in dataframe]
    features.remove(target)
    print(features)

    # split into input and output feature(s)
    test_X = dataframe[features].values
    test_y = dataframe[target].values.reshape(-1, 1)

    # scale data
    scaler = sklearn.preprocessing.StandardScaler()
    test_X = scaler.fit_transform(test_X)

    accuracy = []
    avg_precision = []
    F1_Score = []
    AUC = []
    MCC = []
    Recall = []

    # test input arguments
    assert "sklearn" in str(type(model))
    assert "numpy.ndarray" in str(type(test_X))
    assert "numpy.ndarray" in str(type(test_y))

    # vector of predictions
    preds = model.predict(test_X)

    # calculate precision score
    precision_score = sklearn.metrics.precision_score(test_y, preds)

    # Calculate ROC Curve and Area the Curve
    proba_y = model.predict_proba(test_X)[:, 1]
    FPR, TPR, _ = roc_curve(test_y, proba_y, pos_label=1)
    roc_auc = auc(FPR, TPR)

    # calculate scoring metrics
    # include option to return these scores
    accuracy.append(accuracy_score(y_pred=preds, y_true=test_y))
    avg_precision.append(
        average_precision_score(
            y_true=test_y,
            y_score=proba_y,
            pos_label=1))
    F1_Score.append(f1_score(y_true=test_y, y_pred=preds, pos_label=1))
    MCC.append(matthews_corrcoef(y_true=test_y, y_pred=preds))
    Recall.append(recall_score(y_true=test_y, y_pred=preds, pos_label=1))
    AUC.append(roc_auc)

    confusion_matrix = sklearn.metrics.confusion_matrix(
        y_pred=preds, y_true=test_y)
    sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

    accuracy = [_*100.0 for _ in accuracy]

    F.write('Accuracy: {0:.4f}%\n'.format(np.mean(accuracy)))
    F.write('AUC: {0:.4f}\n'.format(np.mean(AUC)))
    F.write(
        'auPR: {0:.4f}\n'.format(
            np.mean(avg_precision)))  # average_Precision
    F.write('F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))
    F.write('MCC: {0:.4f}\n'.format(np.mean(MCC)))

#         TN, FP, FN, TP = CM.ravel()
    F.write('Recall: {0:.4f}\n'.format(np.mean(Recall)))
    F.write('_______________________________________' + '\n')

    # merge dataframes together to report results
    df_seqs['prediction'] = preds

    # save to csv
    df_seqs.to_csv('predictions.csv')

    return preds, precision_score, df_seqs

In [35]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'prot_pair_index', 'm_protein_seq', 't_protein_seq',
       'm_protein_len', 't_protein_len', 'protein_match'],
      dtype='object')

In [37]:
evaluate_model('notebooks', model, df)

['local_gap_compressed_percent_id', 'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id', 'query_align_len', 'query_align_cov', 'subject_align_len', 'subject_align_cov', 'bit_score', 'm_protein_len', 't_protein_len']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_seqs['prediction'] = preds


(array([0, 0, 0, ..., 0, 0, 0]),
 0.7725343320848939,
                                            m_protein_seq  \
 0      MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...   
 1      MDSLITVRNLHKLFRVDQREDSGLWPAIKSLIRRERAEVVAVDGVS...   
 2      MRKQYGDFHAVKGIDLDVRPGECFGLLGPNGAGKSTTMRMLAGTSQ...   
 3      MAWTREQMAARAAKELQDGFYVNLGIGIPTLVANYIPDGVHVTLQS...   
 4      MSATDTPSRPLASLDIKKEVERYALVLGWIFLIAIFGLLLPDTFLT...   
 ...                                                  ...   
 17103  MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...   
 17104  MLTVSYVGAHKIEIREQDPEPVHAGQVQVEVAYAGICGTDLHILHG...   
 17105  MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...   
 17106  MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...   
 17107  MTGQGASGSETGVSDDSPITVVVVDDQELLRTGLRDLAEHDGDIAV...   
 
                                            t_protein_seq  prot_pair_index  \
 0      MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...         37128170   
 1      MSEHILETDRLTRRYGDRLAVDAVSMAVRPREVYGFLGPNGAGKTT..

In [37]:
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_len,t_protein_len,protein_match
0,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,37128170,375,269,0
1,0.322430,0.264368,0.245552,232,0.888889,208,0.691030,227,172639070,301,261,1
2,0.270000,0.220408,0.222680,195,0.795918,210,0.875000,173,114387951,240,245,1
3,0.268908,0.232727,0.247582,239,0.869091,236,0.975207,191,157332201,242,275,1
4,0.497143,0.426471,0.429630,175,0.857843,174,0.865672,422,101047443,201,204,1
...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.328000,0.278912,0.299817,278,0.945578,244,0.964427,166,136836774,253,294,1
17104,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,14456723,183,224,0
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,78849058,271,353,0
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,161110219,331,338,0
