This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

To do:

1) Write unit tests for iFeatureOmega
2) time trial for different descriptors - DONE
3) Figure out how to append meso and thermo descriptors
4) Make sure protein length is in training
5) try changing subject_align_len to subject_align_len/m_protein_len
6) log(ratio) for proteins approacing zero or infinity
    take distributions to assess for this.
    Update -- taking log and then droping infinites and NaN improves score to 0.86 <-- cannot repeat this

 protein_pair_cmd = """CREATE OR REPLACE TABLE fafsa_protein_pairs AS
                          SELECT 
                          meso_pid,
                          thermo_pid,
                          bit_score,
                          local_gap_compressed_percent_id,
                          scaled_local_query_percent_id,
                          scaled_local_symmetric_percent_id,
                          query_align_len,
                          query_align_cov,
                          subject_align_len,
                          subject_align_cov,
                          FROM protein_pairs USING SAMPLE 100
"""

In [147]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [148]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [149]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [150]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    def compute_descriptor_ratio(fasta_files, descriptors=[]):
        """
        Generates dictionary of descriptors for each of the two input sequences.
        Computes the difference between each instance of a descriptor.
        
        Parameters
        ----------
        List of two fasta files (str) and list of descriptors (str).

        Returns
        -------
        Dictionary with difference between descriptors for each of the 
        input sequences.
        """
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}

        for key in desc_a:
            
            if 'AAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            elif 'GAAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            else:
                feature_dict[key] = desc_a[key] / desc_b[key]

        return feature_dict
    
    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()

        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    return df

In [151]:
#iFeature properties

# protein = iFeatureOmegaCLI.iProtein('meso_50k.fasta')
# protein.display_feature_types()

In [152]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [153]:
#convert to pandas df
df = pd.read_csv('/Users/loganroberts/Learn2Therm/ValidProt/notebooks/learn2therm_sample_50k_exploration.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [154]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [155]:
target = pd.read_csv('protein_match_50k')

In [156]:
target=target.drop(columns=['Unnamed: 0'])

In [157]:
target['protein_match'] = target['protein_match'].map({'Yes': 1, 'No': 0})
target

Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score
0,48641291,1,1.00
1,92992745,1,1.00
2,157628663,1,1.00
3,136708305,1,1.00
4,133672542,1,1.00
...,...,...,...
48845,78849058,0,0.25
48846,108797464,1,1.00
48847,161110219,0,0.25
48848,74177185,1,0.50


In [158]:
target['structure_match'] = np.random.randint(2, size=len(target))

In [159]:
target

Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score,structure_match
0,48641291,1,1.00,0
1,92992745,1,1.00,1
2,157628663,1,1.00,1
3,136708305,1,1.00,1
4,133672542,1,1.00,1
...,...,...,...,...
48845,78849058,0,0.25,1
48846,108797464,1,1.00,1
48847,161110219,0,0.25,1
48848,74177185,1,0.50,1


In [160]:
target['dual_match'] = target['protein_match'] + target['structure_match']

In [161]:
target

Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score,structure_match,dual_match
0,48641291,1,1.00,0,1
1,92992745,1,1.00,1,2
2,157628663,1,1.00,1,2
3,136708305,1,1.00,1,2
4,133672542,1,1.00,1,2
...,...,...,...,...,...
48845,78849058,0,0.25,1,1
48846,108797464,1,1.00,1,2
48847,161110219,0,0.25,1,1
48848,74177185,1,0.50,1,2


In [162]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 1]
minority_class = target[target['protein_match'] == 0]

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [163]:
balanced_data

Unnamed: 0,prot_pair_index,protein_match,Jaccard_Score,structure_match,dual_match
43770,97148983,1,1.000000,1,2
44694,4714642,1,1.000000,0,1
44882,18572110,1,1.000000,0,1
19919,56809533,1,0.333333,0,1
35841,36134069,1,0.333333,0,1
...,...,...,...,...,...
48833,70996712,0,0.285714,1,1
48839,14456723,0,0.250000,0,0
48845,78849058,0,0.250000,1,1
48847,161110219,0,0.250000,1,1


In [164]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(17108, 34)

In [165]:
"""
this list comes from a combination of reading through the features and determining which might be useful
and timing some of the feature generations. those that took more than 30ish seconds were eliminated
Also removed those that have really high dimensionality (>4000)
"""

feature_list = ['AAC']
other_list = ['DistancePair',
               'CTDC', 'CTDT', 'CTDD', 'CTriad', 'GDPC type 1', 'GDPC type 2',
                'CKSAAGP type 1', 'CKSAAGP type 2', 'PseKRAAC type 2', 'PseKRAAC type 3A','PseKRAAC type 7',
                'PseKRAAC type 9', 'Geary','APAAC', 'QSOrder']

In [166]:
# df = create_new_dataframe(df, 'seq_50k.fasta', descriptors= ['EGAAC', 'CKSAAGP type 1', 'CKSAAGP type 2',
#                                                             'GDPC type 1', 'GDPC type 2', 'TPC type 1', 'TPC type 2',
#                                                             'GTPC type 1', 'GTPC type 2', 'DPC type 1', 'DPC type 2',
#                                                             'Moran', 'Geary', 'NMBroto', 'AC', 'CC', 'ACC',
#                                                             'PAAC', 'APAAC'])

In [167]:
df = create_new_dataframe(df, ['seq_50k_a.fasta', 'seq_50k_b.fasta'], descriptors=[f'QSOrder'])

File imported successfully.
File imported successfully.


Split data into dev and test, and then split that into train and validation.

In [None]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0', 'thermo_index','m_protein_seq', 't_protein_seq', 
                       'm_protein_desc', 't_protein_desc'])

In [None]:
df=df.drop(columns=['index_y', 'index_x'])
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.840951,1.324497,0.378428,1.000131,1.035213,0.999638,1.027001,0.972217,1.003861,1.025202
1,0.322430,0.264368,0.245552,232,0.888889,208,0.691030,227,301,261,...,1.878335,inf,0.717419,1.067520,1.034354,0.945732,1.058166,0.979840,1.045301,0.977320
2,0.497143,0.426471,0.429630,175,0.857843,174,0.865672,422,201,204,...,1.419540,0.557676,3.903735,0.717013,0.931906,0.968043,1.063307,0.978006,0.983222,1.036263
3,0.391705,0.340000,0.346939,239,0.956000,234,0.975000,188,240,250,...,1.127988,0.501328,1.336874,0.898933,0.987622,1.039015,0.975567,0.963339,1.037357,0.999382
4,0.314554,0.241007,0.228669,218,0.784173,212,0.688312,228,308,278,...,0.873790,0.458740,1.467967,0.880780,0.978674,1.016874,1.041596,0.942693,1.009089,1.049212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,0.331707,0.294372,0.282158,202,0.874459,218,0.868526,232,251,231,...,1.335836,0.445279,1.781114,0.805742,1.058559,0.895396,1.073475,0.988015,0.990166,1.023015
17104,0.322368,0.303406,0.294737,319,0.987616,335,0.979532,122,342,323,...,1.266079,1.424339,0.474780,1.126222,1.059357,0.962280,0.990585,1.050090,0.950814,1.000060
17105,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,271,353,...,1.685137,11.234249,0.561712,1.235767,1.037673,0.938086,1.001577,1.014846,0.996361,0.989802
17106,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,331,338,...,1.093541,3.107960,0.690658,0.897855,0.961526,1.004391,1.030476,1.010615,1.009159,0.981011


In [None]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len', 'protein_match',
       'structure_match', 'dual_match', 'QSOrder_Schneider.Xr.A',
       'QSOrder_Schneider.Xr.R', 'QSOrder_Schneider.Xr.N',
       'QSOrder_Schneider.Xr.D', 'QSOrder_Schneider.Xr.C',
       'QSOrder_Schneider.Xr.Q', 'QSOrder_Schneider.Xr.E',
       'QSOrder_Schneider.Xr.G', 'QSOrder_Schneider.Xr.H',
       'QSOrder_Schneider.Xr.I', 'QSOrder_Schneider.Xr.L',
       'QSOrder_Schneider.Xr.K', 'QSOrder_Schneider.Xr.M',
       'QSOrder_Schneider.Xr.F', 'QSOrder_Schneider.Xr.P',
       'QSOrder_Schneider.Xr.S', 'QSOrder_Schneider.Xr.T',
       'QSOrder_Schneider.Xr.W', 'QSOrder_Schneider.Xr.Y',
       'QSOrder_Schneider.Xr.V', 'QSOrder_Grantham.Xr.A',
       'QSOrder_Grantham.Xr.R', 'QSOrder_Grantham.Xr.N',
       'QSO

In [None]:
df=df.drop(columns=['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len'])

In [None]:
df

Unnamed: 0,protein_match,structure_match,dual_match,QSOrder_Schneider.Xr.A,QSOrder_Schneider.Xr.R,QSOrder_Schneider.Xr.N,QSOrder_Schneider.Xr.D,QSOrder_Schneider.Xr.C,QSOrder_Schneider.Xr.Q,QSOrder_Schneider.Xr.E,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0,0,0,1.704647,1.198110,1.780974,1.424779,0.427434,0.508850,0.838106,...,0.840951,1.324497,0.378428,1.000131,1.035213,0.999638,1.027001,0.972217,1.003861,1.025202
1,1,1,2,0.886249,1.194510,0.693781,1.059593,1.665075,0.740033,0.832537,...,1.878335,inf,0.717419,1.067520,1.034354,0.945732,1.058166,0.979840,1.045301,0.977320
2,1,0,1,1.237262,0.674870,2.811960,0.766898,0.000000,1.012306,1.124784,...,1.419540,0.557676,3.903735,0.717013,0.931906,0.968043,1.063307,0.978006,0.983222,1.036263
3,1,1,2,0.628310,0.654490,2.617960,1.063546,0.000000,1.178082,1.063546,...,1.127988,0.501328,1.336874,0.898933,0.987622,1.039015,0.975567,0.963339,1.037357,0.999382
4,0,0,0,0.976681,1.139461,1.017376,0.968930,0.542601,0.626078,1.252155,...,0.873790,0.458740,1.467967,0.880780,0.978674,1.016874,1.041596,0.942693,1.009089,1.049212
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17103,1,0,1,0.955860,0.848994,1.142876,1.097161,1.828602,0.351654,1.142876,...,1.335836,0.445279,1.781114,0.805742,1.058559,0.895396,1.073475,0.988015,0.990166,1.023015
17104,1,1,2,0.988189,1.129359,0.470566,1.680594,0.941133,0.752906,0.687751,...,1.266079,1.424339,0.474780,1.126222,1.059357,0.962280,0.990585,1.050090,0.950814,1.000060
17105,0,1,1,0.939091,0.723428,1.157484,1.702183,0.771656,0.771656,0.670122,...,1.685137,11.234249,0.561712,1.235767,1.037673,0.938086,1.001577,1.014846,0.996361,0.989802
17106,0,1,1,0.940441,0.697747,2.060013,1.351884,0.171668,2.446266,0.978506,...,1.093541,3.107960,0.690658,0.897855,0.961526,1.004391,1.030476,1.010615,1.009159,0.981011


In [None]:
#maybe divide all of the AAC/GAAC columns by the protein length 

In [None]:
df.shape

(2210, 72)

In [None]:
df = df.replace([np.inf, -np.inf], np.nan)

In [None]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

protein_match                0
structure_match              0
dual_match                   0
QSOrder_Schneider.Xr.A       0
QSOrder_Schneider.Xr.R       7
QSOrder_Schneider.Xr.N     452
QSOrder_Schneider.Xr.D      12
QSOrder_Schneider.Xr.C    3454
QSOrder_Schneider.Xr.Q     138
QSOrder_Schneider.Xr.E      13
QSOrder_Schneider.Xr.G       0
QSOrder_Schneider.Xr.H     332
QSOrder_Schneider.Xr.I      25
QSOrder_Schneider.Xr.L       1
QSOrder_Schneider.Xr.K     421
QSOrder_Schneider.Xr.M       2
QSOrder_Schneider.Xr.F     138
QSOrder_Schneider.Xr.P      23
QSOrder_Schneider.Xr.S       7
QSOrder_Schneider.Xr.T       7
QSOrder_Schneider.Xr.W    3143
QSOrder_Schneider.Xr.Y     413
QSOrder_Schneider.Xr.V       1
QSOrder_Grantham.Xr.A        0
QSOrder_Grantham.Xr.R        7
QSOrder_Grantham.Xr.N      452
QSOrder_Grantham.Xr.D       12
QSOrder_Grantham.Xr.C     3454
QSOrder_Grantham.Xr.Q      138
QSOrder_Grantham.Xr.E       13
QSOrder_Grantham.Xr.G        0
QSOrder_Grantham.Xr.H      332
QSOrder_

array([   0,    7,  452,   12, 3454,  138,   13,  332,   25,    1,  421,
          2,   23, 3143,  413])

In [None]:
df.columns[36:]

Index(['QSOrder_Grantham.Xd.2', 'QSOrder_Grantham.Xd.3'], dtype='object')

In [None]:
log_cols = df.columns[36:]
df[log_cols] = df[log_cols].apply(lambda x: np.log(x))

In [None]:
df = df.dropna(axis=1, how='any')

In [None]:
nan_counts = df.isna().sum()
nan_counts

protein_match             0
structure_match           0
dual_match                0
QSOrder_Schneider.Xr.A    0
QSOrder_Schneider.Xr.G    0
QSOrder_Grantham.Xr.A     0
QSOrder_Grantham.Xr.G     0
QSOrder_Schneider.Xd.1    0
QSOrder_Schneider.Xd.2    0
QSOrder_Schneider.Xd.3    0
QSOrder_Grantham.Xd.1     0
QSOrder_Grantham.Xd.2     0
QSOrder_Grantham.Xd.3     0
dtype: int64

In [None]:
df.shape

(2210, 56)

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [None]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 20/20 [00:00<00:00, 48.64it/s]


['QSOrder_Schneider.Xr.A',
 'QSOrder_Grantham.Xd.3',
 'QSOrder_Grantham.Xr.R',
 'QSOrder_Schneider.Xd.1',
 'AAC_A',
 'QSOrder_Schneider.Xd.3',
 'QSOrder_Grantham.Xr.A',
 'AAC_I',
 'QSOrder_Schneider.Xr.R',
 'AAC_S',
 'GAAC_alphatic',
 'AAC_E',
 'QSOrder_Grantham.Xd.2',
 'AAC_N',
 'GAAC_negativecharge',
 'QSOrder_Schneider.Xr.L',
 'QSOrder_Schneider.Xr.G',
 'AAC_T',
 'GAAC_uncharge',
 'AAC_R']

In [None]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,GAAC_alphatic,AAC_E,QSOrder_Grantham.Xd.2,AAC_N,GAAC_negativecharge,QSOrder_Schneider.Xr.L,QSOrder_Schneider.Xr.G,AAC_T,GAAC_uncharge,AAC_R
0,0.283582,0.231707,0.240506,133,0.810976,134,0.881579,99,152,164,...,-0.015083,-0.013479,1.036564,0.006579,0.004974,1.329084,0.701461,0.003851,0.059050,-0.034981
1,0.268482,0.190083,0.189300,248,0.683196,271,0.740437,132,366,363,...,0.078422,-0.030461,0.982711,0.002394,-0.006345,0.920360,1.465759,0.010432,-0.035113,-0.030800
2,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,355,387,...,0.030913,-0.020017,0.978314,0.011479,-0.021691,0.886583,1.065708,0.018277,0.057837,-0.010125
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.064287,-0.009864,1.003861,0.005898,0.011143,1.139824,0.931587,-0.013581,-0.068788,0.016882
4,0.394495,0.346774,0.369099,108,0.870968,106,0.972477,130,109,124,...,-0.028781,-0.030334,0.917208,0.000000,-0.007547,1.961181,1.123222,0.013613,0.063332,-0.031444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,0.241758,0.134557,0.144975,181,0.553517,190,0.678571,107,280,327,...,0.049945,0.007678,1.001902,0.024509,0.004642,0.949185,1.416965,-0.051409,-0.086435,-0.043283
2206,0.321429,0.186391,0.223801,197,0.582840,192,0.853333,195,225,338,...,0.082104,-0.038396,0.931608,0.002998,-0.030901,1.624165,1.373086,0.014872,-0.030743,-0.005786
2207,0.290909,0.162437,0.154589,110,0.558376,112,0.516129,85,217,197,...,-0.074037,0.016959,0.948476,0.008281,0.038995,0.890999,0.712799,0.015626,0.022831,0.000327
2208,0.447368,0.387833,0.373626,229,0.870722,233,0.823322,448,283,263,...,-0.037660,-0.007833,0.961154,-0.009217,0.006610,0.900462,0.840163,-0.021430,0.020557,0.004958


In [None]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(1878, 12)
(332, 12)
(1596, 12)
(282, 12)


In [None]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [None]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(1878, 11) (332, 11) (1878, 1) (332, 1)


In [None]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [None]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [None]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [None]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.5212765957446809
['Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'No'
 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes'
 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No'
 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes'
 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No'
 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No'
 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No'
 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No'
 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No'
 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes'
 'Yes' 'No

In [None]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7ff6211ba520>

In [None]:
def time_feature_generation(dataframe, file_name:str, descriptor:str):
    
    """
    Times how long it takes to generate a specific descriptor.
    
    Parameters
    ----------
    Pandas dataframe, fasta file name, descriptor name (string).

    Returns
    -------
    Time to generate descriptor.
    """

    # Record the current time
    start_time = time.time()

    # Code to be timed goes here
    create_new_dataframe(dataframe, file_name, descriptors=[descriptor])

    # Record the time again and calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time

    # Print the elapsed time
    return elapsed_time

In [None]:
def time_feature_dict(dataframe, file_name:str, feature_list):
    
    """
    Iterates through dictionary of protein descriptors and returns the time it takes to run the code.
    
    Parameters
    ----------
    Pandas dataframe, fasta file name, descriptors (list of strings).

    Returns
    -------
    Dictionary of descriptors and time to generate them.
    """
    
    protein = iFeatureOmegaCLI.iProtein(file_name)
    
    time_dict = {}
    
    for feature in feature_list:
        time_dict[feature] = time_feature_generation(dataframe, file_name, feature)
    
    return time_dict