This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

In [183]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [184]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [185]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [186]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [187]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [188]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    
    feature_dict_a = get_protein_descriptors(fasta_files[0], descriptors)
    
    feature_dict_b = get_protein_descriptors(fasta_files[1], descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict_a[desc].index = feature_dict_a[desc].index.astype(int)
        features_a = feature_dict_a[desc].reset_index()
        
        feature_dict_b[desc].index = feature_dict_b[desc].index.astype(int)
        features_b = feature_dict_b[desc].reset_index()
    
    feature_df = pd.merge(
            df,
            features_a,
            how='outer',
            left_index=True,
            right_index=True)
    
    new_df = pd.merge(
            feature_df,
            features_b,
            how='outer',
            left_index=True,
            right_index=True)

    return new_df

In [189]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [190]:
target = pd.read_csv('protein_match_50k')
target['protein_match'] = target['protein_match'].map({'Yes': 1, 'No': 0})
target

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
0,0,48641291,1,1.00
1,1,92992745,1,1.00
2,2,157628663,1,1.00
3,3,136708305,1,1.00
4,4,133672542,1,1.00
...,...,...,...,...
48845,4875,78849058,0,0.25
48846,4876,108797464,1,1.00
48847,4877,161110219,0,0.25
48848,4878,74177185,1,0.50


In [191]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 1]
minority_class = target[target['protein_match'] == 0]

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [192]:
balanced_data

Unnamed: 0.1,Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
18639,3985,29877661,1,0.600000
27326,2918,160971218,1,1.000000
9604,4729,48732214,1,1.000000
11862,2092,56048646,1,0.875000
34183,4891,159585956,1,1.000000
...,...,...,...,...
48833,4863,70996712,0,0.285714
48839,4869,14456723,0,0.250000
48845,4875,78849058,0,0.250000
48847,4877,161110219,0,0.250000


In [193]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 33)

In [194]:
df

Unnamed: 0,Unnamed: 0_x,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len,Unnamed: 0_y,protein_match,Jaccard_Score
0,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237,1,1,1.000
1,8,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,1674,...,27.5,MSITTKEKRFGIIERYREFLPVTENTPKLTLHEGDTPLIHAASLSA...,MSGVEIEKGYFGQFGGSFVFPELQEVLDYLAEQFERYKDDPEFKQE...,threonine synthase,tryptophan synthase subunit beta,355,387,8,1,1.000
2,11,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,14963,...,31.5,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,glycosyltransferase family 9 protein,S-methyl-5'-thioadenosine phosphorylase,375,269,11,0,0.000
3,21,0.366120,0.340102,0.322115,178,0.903553,192,0.876712,214,7134,...,35.0,MPLRVILAEDSALMREGLVGLLDRFGHTTVAAVGDAGEVAAAVERE...,MIRVLLADDQHLIREAIASLLGLEPDLEVVAQVGRGDEVVAAVHVH...,response regulator transcription factor,response regulator transcription factor,219,197,21,1,0.500
4,22,0.253275,0.232000,0.230616,229,0.916000,227,0.897233,227,1462,...,20.0,MTDAPDLSVQDGSSGLKVLGLRKNYKRRPVIRDVSMELARGEVVAL...,MQVSDTEVAIDIIGMNKWFGDFHVLRDINLRVMKGERIVVCGPSGS...,LPS export ABC transporter ATP-binding protein,amino acid ABC transporter ATP-binding protein,253,250,22,1,0.375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,49989,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,7134,...,22.5,MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...,MAESHSTTRSGQIRVFVLDDHEVVRRGVRDLLDAEPDITVVGEAET...,response regulator transcription factor,response regulator transcription factor,183,224,4869,0,0.250
17104,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,14963,...,24.5,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,aminoglycoside phosphotransferase family protein,serine/threonine protein kinase,271,353,4875,0,0.250
17105,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,7134,...,24.5,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,331,338,4877,0,0.250
17106,49998,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,11324,...,25.0,MSESHAGALLSVRGLTAGYGGATALDGVSLTVAAGETVALLGANGA...,MSLLTTSGLTRHFSGIHAVEGVDFTLEAGEIRALIGSNGAGKTTLV...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,238,235,4878,1,0.500


In [195]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       ,'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0_x', 'Unnamed: 0_y', 'thermo_index', 
                       'm_protein_desc', 't_protein_desc'])

In [196]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'prot_pair_index', 'm_protein_seq', 't_protein_seq',
       'm_protein_len', 't_protein_len', 'protein_match'],
      dtype='object')

In [197]:
df = df.replace([np.inf, -np.inf], np.nan)

In [198]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
prot_pair_index                      0
m_protein_seq                        0
t_protein_seq                        0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
dtype: int64


array([0])

In [199]:
df = df.dropna(axis=1, how='any')

In [200]:
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_seq,t_protein_seq,m_protein_len,t_protein_len,protein_match
0,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,92992745,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,233,237,1
1,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,180139301,MSITTKEKRFGIIERYREFLPVTENTPKLTLHEGDTPLIHAASLSA...,MSGVEIEKGYFGQFGGSFVFPELQEVLDYLAEQFERYKDDPEFKQE...,355,387,1
2,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,37128170,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,375,269,0
3,0.366120,0.340102,0.322115,178,0.903553,192,0.876712,214,87866267,MPLRVILAEDSALMREGLVGLLDRFGHTTVAAVGDAGEVAAAVERE...,MIRVLLADDQHLIREAIASLLGLEPDLEVVAQVGRGDEVVAAVHVH...,219,197,1
4,0.253275,0.232000,0.230616,229,0.916000,227,0.897233,227,96768077,MTDAPDLSVQDGSSGLKVLGLRKNYKRRPVIRDVSMELARGEVVAL...,MQVSDTEVAIDIIGMNKWFGDFHVLRDINLRVMKGERIVVCGPSGS...,253,250,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,14456723,MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...,MAESHSTTRSGQIRVFVLDDHEVVRRGVRDLLDAEPDITVVGEAET...,183,224,0
17104,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,78849058,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,271,353,0
17105,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,161110219,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,331,338,0
17106,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,74177185,MSESHAGALLSVRGLTAGYGGATALDGVSLTVAAGETVALLGANGA...,MSLLTTSGLTRHFSGIHAVEGVDFTLEAGEIRALIGSNGAGKTTLV...,238,235,1


Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [90]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 2/2 [00:00<00:00, 156.75it/s]


['m_protein_len', 't_protein_len']

In [91]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_seq,t_protein_seq,m_protein_len,m_protein_len.1,t_protein_len
0,0.333333,0.267380,0.254453,148,0.791444,163,0.791262,127,MVTTGERQRNARGEGARLRLEIVAATQALLADGETATLRSIARRAG...,MNRPTYHHGDLRAAILTEAARLVAERGAERVSLRELAREAGVSHAA...,206,206,187
1,0.333333,0.295082,0.291498,218,0.893443,221,0.884000,158,MQIKDSVAVVTGGASGLGLATTKRLLDAGGSVVVIDLKGEDVVAEL...,MTGTVVITGGSRGIGAATAVLAAERGWQVAVSFRERRDAAEQVVRR...,250,250,244
2,0.334630,0.299652,0.299130,253,0.881533,256,0.888889,206,MTLFVQQLANGLALGGIYCLAAIGLTLVFGVLGFPNLAHGALYMLG...,MSDFLQYLISGLAVGCGFALLASGLVTIHRVTHVVNFAQGMFAVVA...,288,288,287
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,375,375,269
4,0.331915,0.245283,0.242613,239,0.751572,238,0.732308,202,MSQNVALGVPSSGSPAPAAGPAAQVPVPSRRTFKASPYLLLIPAVV...,MAESTSLRRATATARRGAAARRARRRTLLGLAFASPWIVGALVFTL...,325,325,318
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.331707,0.294372,0.282158,202,0.874459,218,0.868526,232,MSTLVADTVSKRFGGVTALSNVSLELREGEIHGLIGPNGSGKTTLL...,MLEVKNIDLFYGASRALRSVSLTAQAGQVTAVMGRNGVGKTSLIRA...,251,251,231
17104,0.322368,0.303406,0.294737,319,0.987616,335,0.979532,122,MLTVSYVGAHKIEIREQDPEPVHAGQVQVEVAYAGICGTDLHILHG...,MRAVTISEPGGPEKLQWTEVPDPRPGAGEVLLEVVASAVNRADVLQ...,342,342,323
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,271,271,353
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,331,331,338


In [114]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(14541, 13)
(2567, 13)
(12359, 13)
(2182, 13)


In [115]:
val

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_seq,t_protein_seq,m_protein_len,t_protein_len,protein_match
1968,0.270073,0.174528,0.169336,148,0.698113,139,0.617778,52,MTQTQIPSAVPSVPAKSPHGPLRVGIGGPVGAGKTTLTAALCRALG...,MAFTVAVDGPAAAGKGTVARAVAEHFGLAHLDTGALYRAVGLKVLE...,225,212,0
6217,0.268571,0.199153,0.209354,178,0.754237,171,0.802817,116,MSTSSREHAIDRLEDRLARLPLRERKKLRTRRTIQDHALRLFGEQG...,MPRKSASREPEEVSGPKSARTRQRILDAAAHVLSRKGFAGTRLTDV...,213,236,0
4435,0.241096,0.221106,0.225064,369,0.927136,369,0.960938,184,MDFDFTDEQRMLKDSVERLVKDEYGFEQRAKYLAEPDGFSRELWAR...,MANAPFHWDDPFLLEQQLSADERMVRDAAASYCRDKLAPRVLEAFR...,384,398,1
10376,0.596330,0.590909,0.356164,108,0.981818,109,0.427451,305,MTGTSTDVVVRIEGLHKSFGHLEVVKGVDLDVHRGEVVVVFGRSGS...,YPAQLSGGQQQRAAIARALAMKPKVMLFDEPTSALDPEMVNEVLEV...,255,110,1
7375,0.261780,0.253807,0.253485,380,0.964467,385,0.974684,312,MNLANRVQTLTPSTTLAITAKANELKAQGVDVIGLGAGEPDYNTPE...,MKKAKRMNAFSASIFAELTAYRQKRRHLHDEWIDLSVGSPDLPPAP...,395,394,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13199,0.327138,0.281150,0.275862,273,0.872205,273,0.840000,222,MSTPSTGPSTSPSTSPSTMASPVAAIDCGTNSIKVLIGRRRDDGTL...,MRLGVLDVGSNTVHLLIVDAQPGAPPLPAFSHKTELRLAEHLDEVG...,325,313,1
16196,0.321951,0.284483,0.234043,200,0.862069,209,0.629518,234,MKTTEIEKQPILQVSGLKQHFVLKKEVIFQSKQVVKAVDGISFDLF...,MIKIQHISRIYQNGTTKTSALKDIDFKIKPNEVTIILGPSGSGKST...,332,232,0
10844,0.356383,0.283898,0.303855,201,0.851695,186,0.907317,168,MTTVVIVDDHPVVRAGLTALLATRPGIEVVGEAGDGSAALAVVGRE...,MATVLVVEDDPNLRAALIRELGARSHTVRSTSTAMGMLRDVAQNPP...,205,236,0
6910,0.451128,0.439560,0.424779,265,0.970696,274,0.938356,582,MRLNDRNRGATAAVLLTLCGFGLGGCDALDSINPFAEKYKPEVIPD...,MPRRHPLVALLLVLLLAVACGSRDAADDPLITRDRPVESIYNEAAD...,292,273,1


In [116]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [117]:
input_features

['local_gap_compressed_percent_id',
 'scaled_local_query_percent_id',
 'scaled_local_symmetric_percent_id',
 'query_align_len',
 'query_align_cov',
 'subject_align_len',
 'subject_align_cov',
 'bit_score',
 'm_protein_seq',
 't_protein_seq',
 'm_protein_len',
 't_protein_len']

In [118]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(14541, 12) (2567, 12) (14541, 1) (2567, 1)


In [119]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [120]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

ValueError: could not convert string to float: 'MAALVLDSLVKRFESTPVVDELSLEIEPGEFVALLGPSGCGKTTTLRLLAGFETLSSGQVSLNGRSLAHASLHLPPEQRRMGMVFQSYALWPHMSVADNVGYTLKMRHIKGDEYRRRVGQALETVQLASLAERMPQALSGGQRQRVALARCLVTEPEVVLLDEPLANLDRHLRASMEETFREFHRRTGATMVYVTHDQSEAMSLADRIAVMKSGRLAQWATPETLYRRPRNEWVARFIGQGSILRLPDVTPGRAIPGAELMSLAMHHECEWDTPVLVRPEQVRLGSTGLPARVENCIFRGERYELWLCLNSGQRLFAYHHEALPIGREVRLVLQQGWSLEPEA'

Train the model

In [30]:
def k_fold_cross_val(dataframe, n_splits=10):  
    """
    Runs k-fold cross validation on dataset.
    Default = 10-fold.

    Params
    ----------
    -dataframe: Pandas dataframe
    -n_splits: Number of cross validations (int)

    Returns
    -------
    -vector of predictions
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    target = 'protein_match'
    input_features = [columns for columns in df]
    input_features.remove(target)
    
    dev_X = dev[input_features].values
    test_X = test[input_features].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    from sklearn.model_selection import StratifiedKFold

    cv = StratifiedKFold(n_splits, shuffle=True)

    for (train_index, test_index) in cv.split(dev_X, dev_y):

        train_X = dev_X[train_index]
        val_X = dev_X[test_index]

        train_y = dev_y[train_index]
        val_y = dev_y[test_index]

        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        return preds

In [31]:
preds

NameError: name 'preds' is not defined

In [32]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier()

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [33]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.7548120989917507
[1 0 1 ... 1 0 0]


In [34]:
proba_y = model.predict_proba(val_X)[:,1]

In [35]:
proba_y

array([0.35, 0.57, 0.93, ..., 0.55, 0.97, 1.  ])

In [36]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7f9dd8d3c610>

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

names = ['LR', 'KNN', 'DT', 'NB', 'RF', 'Bagging', 'AB', 'GB', 'SVM']

#list of classifiers (hyperparameters optimized)
classifiers = [
    #Regression
    LogisticRegression(),
    #KNN (neighbors optimized iteratively)
    KNeighborsClassifier(n_neighbors=20),
    #Decision Tree
    DecisionTreeClassifier(max_features=None),
    #Gaussian
    GaussianNB(),
    #RF Classifier (with optuna)
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        max_samples=0.3,
        max_features=0.5,
        min_weight_fraction_leaf=0,
        min_samples_split=17),
    #RF Classifier with bagging (with optuna)
    BaggingClassifier(sklearn.ensemble.RandomForestClassifier
    (n_estimators=200, max_depth=None, 
     min_weight_fraction_leaf=0.000215),max_samples=0.5, 
    max_features=0.5),
    #AdaBoost (with optuna)
    AdaBoostClassifier(n_estimators=53, learning_rate=0.156),
    #Gradient Boosting (with optuna)
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, 
                                max_depth=1),   
    #C-support vector classification (9)
#     SVC(),
]

F = open('evaluationResults.txt', 'w')

F.write('Evaluation Scale:'+'\n')
F.write('0.0% <=Accuracy<= 100.0%'+'\n')
F.write('0.0 <=AUC<= 1.0'+'\n') #area under curve
F.write('0.0 <=auPR<= 1.0'+'\n')  # average_Precision
F.write('0.0 <=F1_Score<= 1.0'+'\n')
F.write('-1.0 <=MCC<= 1.0'+'\n')
F.write('_______________________________________'+'\n')

def runClassifiers(filename:str, dataframe, columns=[], target=[], model=RandomForestClassifier()):

    """
    Takes dataframe and splits it into a training and testing set. 
    Trains a RF Classifier with data.

    Params
    ----------
    dataframe: Pandas dataframe
    columns: list of strings, representing input features
    target: list of strings, representing target feature(s)

    Returns
    -------
    -Accuracy score
    -area under ROC curve
    -train data (target)
    -validation data (features)
    -validation data (target)
    """
    
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)

    train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)
    
    dev_X = dev[columns].values
    test_X = test[columns].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1) 

    # test input arguments
    assert "pandas.core.frame.DataFrame" in str(type(train))
    assert "pandas.core.frame.DataFrame" in str(type(val))
    assert "str" in str(type(columns[0]))
    assert "str" in str(type(target[0]))

    # split into input and output feature(s)
    train_X = train[columns].values
    val_X = val[columns].values

    train_y = train[target].values.reshape(-1, 1)
    val_y = val[target].values.reshape(-1, 1)

    # scale data
    scaler = sklearn.preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)
    val_X = scaler.fit_transform(val_X)

    Results = {}  # compare algorithms

    from sklearn.metrics import accuracy_score, \
        confusion_matrix, \
        roc_auc_score,\
        average_precision_score,\
        auc,\
        roc_curve, f1_score, recall_score, matthews_corrcoef, auc

    for classifier, name in zip(classifiers, names):
        accuracy = []
        avg_precision = []
        F1_Score = []
        AUC = []
        MCC = []
        Recall = []
        
        mean_TPR = 0.0
        mean_FPR = np.linspace(0, 1, 100)

        print('{} is done.'.format(classifier.__class__.__name__))

        model = classifier

        # model
        model.fit(train_X, train_y)

        preds = model.predict(val_X)

        # Calculate ROC Curve and Area the Curve
        proba_y = model.predict_proba(val_X)[:,1]
        FPR, TPR, _ = roc_curve(val_y, proba_y, pos_label=1)
        roc_auc = auc(FPR, TPR)
        
        #calculate scoring metrics
        #include option to return these scores
        accuracy.append(accuracy_score(y_pred=preds, y_true=val_y))
        avg_precision.append(average_precision_score(y_true=val_y, y_score=proba_y, pos_label=1))
        F1_Score.append(f1_score(y_true=val_y, y_pred=preds, pos_label=1))
        MCC.append(matthews_corrcoef(y_true=val_y, y_pred=preds))
        Recall.append(recall_score(y_true=val_y, y_pred=preds, pos_label=1))
        AUC.append(roc_auc)

        confusion_matrix = sklearn.metrics.confusion_matrix(y_pred=preds, y_true=val_y)
        sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

        accuracy = [_*100.0 for _ in accuracy]
        Results[name + ' Accuracy, F1 Score'] = [accuracy, F1_Score]
        
        F.write('Classifier: {}\n'.format(name))
        F.write('Accuracy: {0:.4f}%\n'.format(np.mean(accuracy)))
        F.write('AUC: {0:.4f}\n'.format( np.mean(AUC)))
        F.write('auPR: {0:.4f}\n'.format(np.mean(avg_precision))) # average_Precision
        F.write('F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))
        F.write('MCC: {0:.4f}\n'.format(np.mean(MCC)))

#         TN, FP, FN, TP = CM.ravel()
        F.write('Recall: {0:.4f}\n'.format( np.mean(Recall)) )
        F.write('_______________________________________'+'\n')
    
    F.close()
    
    return Results, model


# if __name__ == '__main__':
#     # print('Please, enter number of cross validation:')
#     import argparse
#     p = argparse.ArgumentParser(description='Run Machine Learning Classifiers.')

#     p.add_argument('-cv', '--nFCV', type=int, help='Number of crossValidation', default=10)
#     p.add_argument('-data', '--dataset', type=str, help='~/dataset.csv', default='optimumDataset.csv')
#     p.add_argument('-roc', '--auROC', type=int, help='Print ROC Curve', default=1, choices=[0, 1])
#     p.add_argument('-box', '--boxPlot', type=int, help='Print Accuracy Box Plaot', default=1, choices=[0, 1])

#     args = p.parse_args()

#     runClassifiers(args)

In [38]:
result = runClassifiers('sample.csv', df, columns=input_features, target=target)

LogisticRegression is done.
KNeighborsClassifier is done.
DecisionTreeClassifier is done.
GaussianNB is done.
RandomForestClassifier is done.
BaggingClassifier is done.
AdaBoostClassifier is done.
GradientBoostingClassifier is done.


In [169]:
model = result[1]
type(model)

sklearn.ensemble._gb.GradientBoostingClassifier

In [201]:
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,prot_pair_index,m_protein_seq,t_protein_seq,m_protein_len,t_protein_len,protein_match
0,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,92992745,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,233,237,1
1,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,180139301,MSITTKEKRFGIIERYREFLPVTENTPKLTLHEGDTPLIHAASLSA...,MSGVEIEKGYFGQFGGSFVFPELQEVLDYLAEQFERYKDDPEFKQE...,355,387,1
2,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,37128170,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,375,269,0
3,0.366120,0.340102,0.322115,178,0.903553,192,0.876712,214,87866267,MPLRVILAEDSALMREGLVGLLDRFGHTTVAAVGDAGEVAAAVERE...,MIRVLLADDQHLIREAIASLLGLEPDLEVVAQVGRGDEVVAAVHVH...,219,197,1
4,0.253275,0.232000,0.230616,229,0.916000,227,0.897233,227,96768077,MTDAPDLSVQDGSSGLKVLGLRKNYKRRPVIRDVSMELARGEVVAL...,MQVSDTEVAIDIIGMNKWFGDFHVLRDINLRVMKGERIVVCGPSGS...,253,250,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.284091,0.223214,0.245700,190,0.848214,175,0.956284,112,14456723,MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...,MAESHSTTRSGQIRVFVLDDHEVVRRGVRDLLDAEPDITVVGEAET...,183,224,0
17104,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,78849058,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,271,353,0
17105,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,161110219,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,331,338,0
17106,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,74177185,MSESHAGALLSVRGLTAGYGGATALDGVSLTVAAGETVALLGANGA...,MSLLTTSGLTRHFSGIHAVEGVDFTLEAGEIRALIGSNGAGKTTLV...,238,235,1


In [209]:
#testing function

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier, \
    RandomForestClassifier, \
    AdaBoostClassifier, \
    GradientBoostingClassifier, \
    ExtraTreesClassifier

from sklearn.metrics import accuracy_score, \
    confusion_matrix, \
    roc_auc_score,\
    average_precision_score,\
    auc,\
    roc_curve, f1_score, recall_score, matthews_corrcoef, auc

# create dictionary of models
names = ['LR', 'KNN', 'DT', 'NB', 'RF', 'Bagging', 'AB', 'GB', 'SVM']

# list of classifiers (hyperparameters optimized)
classifiers = [
    # Regression
    LogisticRegression(),
    # KNN (neighbors optimized iteratively)
    KNeighborsClassifier(n_neighbors=20),
    # Decision Tree
    DecisionTreeClassifier(max_features=None),
    # Gaussian
    GaussianNB(),
    # RF Classifier (with optuna)
    RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        max_samples=0.3,
        max_features=0.5,
        min_weight_fraction_leaf=0,
        min_samples_split=17),
    # RF Classifier with bagging (with optuna)
    BaggingClassifier(RandomForestClassifier
                      (n_estimators=200, max_depth=None,
                       min_weight_fraction_leaf=0.000215), max_samples=0.5,
                      max_features=0.5),
    # AdaBoost (with optuna)
    AdaBoostClassifier(n_estimators=53, learning_rate=0.156),
    # Gradient Boosting (with optuna)
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                               max_depth=1),
    # C-support vector classification (9)
    #     SVC(),
]

F = open('evaluationResults.txt', 'w')

F.write('Evaluation Scale:'+'\n')
F.write('0.0% <=Accuracy<= 100.0%'+'\n')
F.write('0.0 <=AUC<= 1.0'+'\n') #area under curve
F.write('0.0 <=auPR<= 1.0'+'\n')  # average_Precision
F.write('0.0 <=F1_Score<= 1.0'+'\n')
F.write('-1.0 <=MCC<= 1.0'+'\n')
F.write('_______________________________________'+'\n')


def test_model(model, dataframe, target=[]):
    """
    Takes a trained model and test data and tests the model.

    Params
    ----------
    model: sklearn.neighbors.KNeighborsClassifier
    test_X: numpy array
    test_y: numpy array

    Returns
    -------
    Vector of predictions based on the model (numpy array)
    """

    model = model
    
    df_seqs = dataframe[['m_protein_seq', 't_protein_seq', 'prot_pair_index']]
    
    dataframe = dataframe.drop(columns=['m_protein_seq', 't_protein_seq', 'prot_pair_index'])
    
    target = 'protein_match'
    features = [columns for columns in dataframe]
    features.remove(target)
    print(features)
    
    # split into input and output feature(s)
    test_X = dataframe[features].values
    test_y = dataframe[target].values.reshape(-1, 1)

    # scale data
    scaler = sklearn.preprocessing.StandardScaler()
    test_X = scaler.fit_transform(test_X)

    accuracy = []
    avg_precision = []
    F1_Score = []
    AUC = []
    MCC = []
    Recall = []

    # test input arguments
    assert "sklearn" in str(type(model))
    assert "numpy.ndarray" in str(type(test_X))
    assert "numpy.ndarray" in str(type(test_y))

    #vector of predictions
    preds = model.predict(test_X)

    # calculate precision score
    precision_score = sklearn.metrics.precision_score(test_y, preds)

    # Calculate ROC Curve and Area the Curve
    proba_y = model.predict_proba(test_X)[:,1]
    FPR, TPR, _ = roc_curve(test_y, proba_y, pos_label=1)
    roc_auc = auc(FPR, TPR)
    
    #calculate scoring metrics
    #include option to return these scores
    accuracy.append(accuracy_score(y_pred=preds, y_true=test_y))
    avg_precision.append(average_precision_score(y_true=test_y, y_score=proba_y, pos_label=1))
    F1_Score.append(f1_score(y_true=test_y, y_pred=preds, pos_label=1))
    MCC.append(matthews_corrcoef(y_true=test_y, y_pred=preds))
    Recall.append(recall_score(y_true=test_y, y_pred=preds, pos_label=1))
    AUC.append(roc_auc)

    confusion_matrix = sklearn.metrics.confusion_matrix(y_pred=preds, y_true=test_y)
    sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

    accuracy = [_*100.0 for _ in accuracy]
    
    F.write('Accuracy: {0:.4f}%\n'.format(np.mean(accuracy)))
    F.write('AUC: {0:.4f}\n'.format( np.mean(AUC)))
    F.write('auPR: {0:.4f}\n'.format(np.mean(avg_precision))) # average_Precision
    F.write('F1_Score: {0:.4f}\n'.format(np.mean(F1_Score)))
    F.write('MCC: {0:.4f}\n'.format(np.mean(MCC)))

#         TN, FP, FN, TP = CM.ravel()
    F.write('Recall: {0:.4f}\n'.format( np.mean(Recall)) )
    F.write('_______________________________________'+'\n')
    
    #merge dataframes together to report results
    df_seqs['prediction'] = preds
    
    #save to csv
    df_seqs.to_csv('predictions.csv')
  
    return preds, precision_score, df_seqs

In [210]:
test_model(model, df, target=['protein_match'])[2]

['local_gap_compressed_percent_id', 'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id', 'query_align_len', 'query_align_cov', 'subject_align_len', 'subject_align_cov', 'bit_score', 'm_protein_len', 't_protein_len']


Unnamed: 0,m_protein_seq,t_protein_seq,prot_pair_index,prediction
0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,92992745,0
1,MSITTKEKRFGIIERYREFLPVTENTPKLTLHEGDTPLIHAASLSA...,MSGVEIEKGYFGQFGGSFVFPELQEVLDYLAEQFERYKDDPEFKQE...,180139301,1
2,MRAIGELWPTFDDVHEIAVLRGGGLGDLMFALPAIDALAAAYPEAR...,MATTTAEIGVIGGSGFYSFLDDPHEVTVQTPYGPPSDPIAVGTVAG...,37128170,0
3,MPLRVILAEDSALMREGLVGLLDRFGHTTVAAVGDAGEVAAAVERE...,MIRVLLADDQHLIREAIASLLGLEPDLEVVAQVGRGDEVVAAVHVH...,87866267,1
4,MTDAPDLSVQDGSSGLKVLGLRKNYKRRPVIRDVSMELARGEVVAL...,MQVSDTEVAIDIIGMNKWFGDFHVLRDINLRVMKGERIVVCGPSGS...,96768077,0
...,...,...,...,...
17103,MRRQVGSVPGVTRTAGAAGGGAAARTAPGGPPDVVLMSCSHPRAAE...,MAESHSTTRSGQIRVFVLDDHEVVRRGVRDLLDAEPDITVVGEAET...,14456723,1
17104,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,78849058,0
17105,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,161110219,0
17106,MSESHAGALLSVRGLTAGYGGATALDGVSLTVAAGETVALLGANGA...,MSLLTTSGLTRHFSGIHAVEGVDFTLEAGEIRALIGSNGAGKTTLV...,74177185,1


In [42]:
preds = test_model(model, val_X, val_y)[0]
precision_score = test_model(model, val_X, val_y)[1]

In [43]:
classifier_results = pd.DataFrame({'Sequence_1': seq1, 'Sequence_2': seq2,
                                   'Functional_Pair': preds, 'Precision_Score': precision_score})

classifier_results.to_csv('classifier_results.csv', index=False)

NameError: name 'seq1' is not defined

In [44]:
val

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,protein_match
1968,0.493976,0.460674,0.201474,83,0.932584,83,0.261006,224,318,89,1
6217,0.299065,0.274678,0.263918,210,0.901288,221,0.876984,253,252,233,0
4435,0.320896,0.272152,0.293515,278,0.879747,267,0.988889,276,270,316,1
10376,0.345588,0.300320,0.296063,268,0.856230,277,0.860248,281,322,313,1
7375,0.268182,0.232283,0.194719,223,0.877953,221,0.627841,138,352,254,0
...,...,...,...,...,...,...,...,...,...,...,...
13199,0.264151,0.140000,0.074468,52,0.520000,58,0.210145,49,276,100,0
16196,0.331606,0.193939,0.218430,192,0.581818,193,0.753906,207,256,330,1
10844,0.257812,0.165414,0.165621,269,0.674185,268,0.673367,92,398,399,1
6910,0.356021,0.352332,0.347826,383,0.992228,390,0.984849,405,396,386,1
