This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

To do:

1) Write unit tests for iFeatureOmega
2) time trial for different descriptors - DONE
3) Figure out how to append meso and thermo descriptors
4) Make sure protein length is in training
5) try changing subject_align_len to subject_align_len/m_protein_len
6) log(ratio) for proteins approacing zero or infinity
    take distributions to assess for this.

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [24]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [25]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [26]:
df['prot_pair_index']

0         48641291
1         92992745
2        157628663
3        136708305
4        133672542
           ...    
49995     78849058
49996    108797464
49997    161110219
49998     74177185
49999    143737378
Name: prot_pair_index, Length: 50000, dtype: int64

In [27]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [28]:
target = pd.read_csv('protein_match_6k.csv')

In [29]:
target

Unnamed: 0,prot_pair_index,Jaccard_Score,protein_match
0,48641291,1.0,Yes
1,92992745,1.0,Yes
2,157628663,1.0,Yes
3,136708305,1.0,Yes
4,133672542,1.0,Yes
...,...,...,...
6343,55489429,0.5,Yes
6344,172293605,1.0,Yes
6345,47082975,1.0,Yes
6346,154513027,1.0,Yes


In [30]:
df = pd.merge(df, target, on=['prot_pair_index'])
df.shape

(6348, 32)

In [31]:
df

Unnamed: 0.1,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len,Jaccard_Score,protein_match
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,875,...,50.0,22.5,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLV...,ECF RNA polymerase sigma factor SigK,sigma-70 family RNA polymerase sigma factor,206,202,1.0,Yes
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,54.0,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237,1.0,Yes
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,875,...,50.0,22.0,MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAE...,MTPEQIFSGQTAIVTGGASGIGAATVEHIARRGGRVFSVDLSYDSP...,SDR family oxidoreductase,SDR family oxidoreductase,287,252,1.0,Yes
3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,875,...,50.0,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269,1.0,Yes
4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,9827,...,50.0,20.0,MIISLRRGLRFIRFIVFFAALVYLFYHVLDLFNGWISPVDQYQMPT...,MKRMVWRTLKVFIIFIACTLLFYFGLRFMHLEYEQFHRYEPPEGPA...,YqzK family protein,YqzK family protein,80,66,1.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,6495,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,11707,...,50.0,20.0,MDNEKPIVDFSALRDELGKNYEPVQVMDGQAKVIKPEIVSKYSNDQ...,MKLEFPILQIVDENGNLVNENYRKAITEELVKKFYYHMIRIRTFDR...,pyruvate dehydrogenase (acetyl-transferring) E...,pyruvate dehydrogenase (acetyl-transferring) E...,370,356,0.5,Yes
6344,6496,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,875,...,50.0,20.0,MANPYLDDLRLAHVLADSADNLSMDRFGALDLEVSTKPDMTYVTES...,MTRPYADDIHLAKRLADTADAITVPRYSVRDLTVRTKPDRSPVTDA...,histidinol phosphatase,histidinol-phosphatase,298,260,1.0,Yes
6345,6497,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,5381,...,45.0,21.5,MGHDHDHSHGTATTNRTKLAWAFGITFTILIAEVIGAILTNSLALL...,MAHQHGPARPEHASGRYLKRLMAAFGIGLVFMVLEVVVGVLTGSLA...,cation diffusion facilitator family transporter,cation diffusion facilitator family transporter,306,303,1.0,Yes
6346,6498,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,14963,...,52.5,22.5,MAKPALRKPKKKSNPLKAAKISYVDYKDTALLRKFISDRGKIRARR...,MAKQPPRKPKKKVCVFCQEKISYVDYKDTALLRKFISDRGKIRARR...,30S ribosomal protein S18,30S ribosomal protein S18,78,78,1.0,Yes


In [32]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [33]:
#getting an error in the try/except block


from Bio import Seq, SeqIO

def get_fasta_from_dataframe_biopython(dataframe, output_file:str):
    # Create a list of SeqRecord objects
    records = []
    for _, row in dataframe.iterrows():
        try:
            seq = Seq(row['m_protein_seq'])
        except Exception as e:
            print(f"Skipping row {row.name}: {str(e)}")
            continue
        record = SeqIO.SeqRecord(
            seq=seq,
            id=row['meso_index'],
            description=""
        )
        records.append(record)
    
    # Write the records to a FASTA file
    with open(output_file, 'w') as f:
        SeqIO.write(records, f, 'fasta')
    
    return output_file


In [34]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [35]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    def compute_descriptor_ratio(fasta_files, descriptors=[]):
        """
        Generates dictionary of descriptors for each of the two input sequences.
        Computes the difference between each instance of a descriptor.
        
        Parameters
        ----------
        List of two fasta files (str) and list of descriptors (str).

        Returns
        -------
        Dictionary with difference between descriptors for each of the 
        input sequences.
        """
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}

        for key in desc_a:
            feature_dict[key] = desc_a[key] / desc_b[key]

        return feature_dict
    
    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()

        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    return df

In [36]:
df = create_new_dataframe(df, ['seq_50k_a.fasta', 'seq_50k_b.fasta'], descriptors=['AAC', 'GAAC', 'QSOrder'])
df

File imported successfully.
File imported successfully.


  df = pd.merge(


Unnamed: 0.1,index_x,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,...,0.995501,0.995501,0.746626,2.433447,0.942154,1.023105,1.047587,0.937454,0.974866,1.104177
1,1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,...,0.730477,0.996105,0.996105,1.261733,0.980941,1.056692,0.967775,0.910832,1.147138,0.966570
2,2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,...,0.888613,1.332920,0.493674,1.127855,1.046793,1.005878,1.003981,1.034618,1.025785,0.938530
3,3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,...,0.849359,2.807602,0.656323,1.323584,0.950250,1.057052,0.957343,0.921186,1.106142,0.987225
4,4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,...,0.797894,2.393681,1.329823,1.861752,0.913500,1.051214,1.066765,0.976156,1.111363,0.908320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,6343,6495,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,...,0.929210,0.655913,1.115052,1.229837,1.043239,0.997041,0.961075,1.004553,1.029910,0.965934
6344,6344,6496,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,...,0.833497,0.983990,1.062709,0.695821,1.074279,0.985104,0.969361,1.053592,0.951543,1.000276
6345,6345,6497,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,...,1.695522,0.726652,0.322957,0.523713,0.946281,0.992210,1.060345,0.938189,1.001573,1.063319
6346,6346,6498,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,...,0.375423,,1.126270,1.126270,1.073105,1.012337,0.921517,1.023642,0.964341,1.017542


In [37]:
#this function is only necessary if we get a fasta file with a descriptor generated from component 3

def remove_fasta_description(filename:str):
    
    """
    Removes description from fasta file so that iProtein can read the input.
    Might not be necessary in our current format.
    
    Parameters
    ----------
    Fasta file with protein sequences.

    Returns
    -------
    Fasta file with protein sequences, sans description.
    """
    
    #assign unwanted string to object
    string_to_remove = "<unknown description>"

    #open file
    with open(filename, "r") as file:
        content = file.read()
    
    # Remove the string
    new_content = content.replace(string_to_remove, "")

    #overwrite file without string
    with open(filename, "w") as file:
        seq = file.write(new_content)
        
    return seq

In [38]:
#combine both of the above functions

def fasta_to_descriptors(fasta:str, descriptors=[]):

    #remove description from fasta file
    remove_fasta_description(fasta)
    
    #return protein descriptors
    return get_protein_descriptors(fasta, descriptors=descriptors)

In [39]:
#need to figure out how to pass fasta for both descriptors and append to one dataframe
#set this back to what it was originally. Gonna calculate difference for each descriptors

# def create_new_dataframe(dataframe, fasta_files:list, descriptors=[]):
#     """
#     Creates new dataframe with descriptors added.
    
#     Parameters
#     ----------
#     Pandas dataframe, list of descriptors as strings, output file name.

#     Returns
#     -------
#     Dataframe including vector(s) of descriptors
#     """
    
#     #descriptors for meso sequence
#     fasta = get_fasta_from_dataframe(dataframe, fasta_files[0], fasta_files[1])
    
#     #create feature dictionary for sequence a
#     feature_dict_a = get_protein_descriptors(fasta[0], 'Meso', descriptors)
#     print(feature_dict_a)
    
#     #create feature dictionary for sequence b
#     feature_dict_b = get_protein_descriptors(fasta[1], 'Thermo', descriptors)
    
#     df = dataframe.reset_index()
    
#     #merge meso sequences
#     for desc in descriptors:
        
#         feature_dict_a[desc+'_Meso'].index = feature_dict_a[desc+'_Meso'].index.astype(int)
#         features_a = feature_dict_a[desc+'_Meso'].reset_index()
        
#     df = pd.merge(df, features_a, how='outer', left_index=True, right_index=True)
    
#     #merge thermo sequences
# #     for desc in descriptors:
        
# #         feature_dict_b[desc+'_Thermo'].index = feature_dict_b[desc+'_Thermo'].index.astype(int)
# #         features_b = feature_dict_b[desc+'_Thermo'].reset_index()
        
# #     df = pd.merge(df, features_b, how='outer', left_index=True, right_index=True)
        
#     return df

Let's see how long generating some of the descriptors takes:

In [40]:
df

Unnamed: 0.1,index_x,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,...,0.995501,0.995501,0.746626,2.433447,0.942154,1.023105,1.047587,0.937454,0.974866,1.104177
1,1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,...,0.730477,0.996105,0.996105,1.261733,0.980941,1.056692,0.967775,0.910832,1.147138,0.966570
2,2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,...,0.888613,1.332920,0.493674,1.127855,1.046793,1.005878,1.003981,1.034618,1.025785,0.938530
3,3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,...,0.849359,2.807602,0.656323,1.323584,0.950250,1.057052,0.957343,0.921186,1.106142,0.987225
4,4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,...,0.797894,2.393681,1.329823,1.861752,0.913500,1.051214,1.066765,0.976156,1.111363,0.908320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,6343,6495,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,...,0.929210,0.655913,1.115052,1.229837,1.043239,0.997041,0.961075,1.004553,1.029910,0.965934
6344,6344,6496,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,...,0.833497,0.983990,1.062709,0.695821,1.074279,0.985104,0.969361,1.053592,0.951543,1.000276
6345,6345,6497,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,...,1.695522,0.726652,0.322957,0.523713,0.946281,0.992210,1.060345,0.938189,1.001573,1.063319
6346,6346,6498,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,...,0.375423,,1.126270,1.126270,1.073105,1.012337,0.921517,1.023642,0.964341,1.017542


In [41]:
df
df['protein_match'].value_counts()

Yes    5243
No     1105
Name: protein_match, dtype: int64

In [42]:
#iFeature properties

protein = iFeatureOmegaCLI.iProtein('meso_50k.fasta')
protein.display_feature_types()


        ----- Available feature types ------        
        
        AAC                                                Amino acid composition
        EAAC                                               Enhanced amino acid composition
        CKSAAP type 1                                      Composition of k-spaced amino acid pairs type 1 - normalized
        CKSAAP type 2                                      Composition of k-spaced amino acid pairs type 2 - raw count
        DPC type 1                                         Dipeptide composition type 1 - normalized
        DPC type 2                                         Dipeptide composition type 2 - raw count
        TPC type 1                                         Tripeptide composition type 1 - normalized
        TPC type 2                                         Tripeptide composition type 1 - raw count
        CTDC                                               Composition
        CTDT                                      

In [43]:
"""
this list comes from a combination of reading through the features and determining which might be useful
and timing some of the feature generations. those that took more than 30ish seconds were eliminated
Also removed those that have really high dimensionality (>4000)
"""

feature_list = ['AAC', 'GAAC', 'DistancePair',
               'DPC type 1', 'CTDC', 'CTDT', 'CTDD', 'CTriad',
                'CKSAAGP type 1', 'PseKRAAC type 1', 'PseKRAAC type 2', 'PseKRAAC type 3A',
                'PseKRAAC type 3B', 'PseKRAAC type 4', 'PseKRAAC type 5',
                'APAAC', 'QSOrder']

In [44]:
import time

In [45]:
def time_feature_generation(dataframe, file_name:str, descriptor:str):
    
    """
    Times how long it takes to generate a specific descriptor.
    
    Parameters
    ----------
    Pandas dataframe, fasta file name, descriptor name (string).

    Returns
    -------
    Time to generate descriptor.
    """

    # Record the current time
    start_time = time.time()

    # Code to be timed goes here
    create_new_dataframe(dataframe, file_name, descriptors=[descriptor])

    # Record the time again and calculate the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time

    # Print the elapsed time
    return elapsed_time

In [46]:
time_feature_generation(df, 'meso_50k.fasta', 'QSOrder')

File imported successfully.
File imported successfully.


  df = pd.merge(


12.32142186164856

In [47]:
def time_feature_dict(dataframe, file_name:str, feature_list):
    
    """
    Iterates through dictionary of protein descriptors and returns the time it takes to run the code.
    
    Parameters
    ----------
    Pandas dataframe, fasta file name, descriptors (list of strings).

    Returns
    -------
    Dictionary of descriptors and time to generate them.
    """
    
    protein = iFeatureOmegaCLI.iProtein(file_name)
    
    time_dict = {}
    
    for feature in feature_list:
        time_dict[feature] = time_feature_generation(dataframe, file_name, feature)
    
    return time_dict

In [48]:
# time_feature_dict(df, 'meso_50k.fasta', feature_list)

In [49]:
df['Jaccard_Score']

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
6343    0.5
6344    1.0
6345    1.0
6346    1.0
6347    0.2
Name: Jaccard_Score, Length: 6348, dtype: float64

In [50]:
df

Unnamed: 0.1,index_x,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,...,0.995501,0.995501,0.746626,2.433447,0.942154,1.023105,1.047587,0.937454,0.974866,1.104177
1,1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,...,0.730477,0.996105,0.996105,1.261733,0.980941,1.056692,0.967775,0.910832,1.147138,0.966570
2,2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,...,0.888613,1.332920,0.493674,1.127855,1.046793,1.005878,1.003981,1.034618,1.025785,0.938530
3,3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,...,0.849359,2.807602,0.656323,1.323584,0.950250,1.057052,0.957343,0.921186,1.106142,0.987225
4,4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,...,0.797894,2.393681,1.329823,1.861752,0.913500,1.051214,1.066765,0.976156,1.111363,0.908320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,6343,6495,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,...,0.929210,0.655913,1.115052,1.229837,1.043239,0.997041,0.961075,1.004553,1.029910,0.965934
6344,6344,6496,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,...,0.833497,0.983990,1.062709,0.695821,1.074279,0.985104,0.969361,1.053592,0.951543,1.000276
6345,6345,6497,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,...,1.695522,0.726652,0.322957,0.523713,0.946281,0.992210,1.060345,0.938189,1.001573,1.063319
6346,6346,6498,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,...,0.375423,,1.126270,1.126270,1.073105,1.012337,0.921517,1.023642,0.964341,1.017542


In [51]:
df.columns

Index(['index_x', 'Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score',
       ...
       'QSOrder_Grantham.Xr.T', 'QSOrder_Grantham.Xr.W',
       'QSOrder_Grantham.Xr.Y', 'QSOrder_Grantham.Xr.V',
       'QSOrder_Schneider.Xd.1', 'QSOrder_Schneider.Xd.2',
       'QSOrder_Schneider.Xd.3', 'QSOrder_Grantham.Xd.1',
       'QSOrder_Grantham.Xd.2', 'QSOrder_Grantham.Xd.3'],
      dtype='object', length=107)

# Scratch work for above function

In [52]:
# sequence = "meso_input_copy.fasta.txt"
# string_to_remove = "<unknown description>"

# with open(sequence, "r") as file:
#     content = file.read()
    
# # Remove the string
# new_content = content.replace(string_to_remove, "")

# with open(sequence, "w") as file:
#     file.write(new_content)


In [53]:
df['t_protein_len'].describe()

count    6348.000000
mean      264.121141
std        67.865076
min        35.000000
25%       229.000000
50%       262.000000
75%       312.000000
max       400.000000
Name: t_protein_len, dtype: float64

The bit-score provides a better rule-of-thumb for inferring homology. For average length proteins, a bit score of 50 is almost always significant. A bit score of 40 is only significant (E() < 0.001) in searches of protein databases with fewer than 7000 entries. Increasing the score by 10 bits increases the significance 210=1000-fold, so 50 bits would be significant in a database with less than 7 million entries (10 times SwissProt, and within a factor of 3 of the largest protein databases). Thus, the NCBI Blast web site uses a color code of blue for alignment with scores between 40–50 bits; and green for scores between 50–80 bits. In the yeast vs human example, the alignments with less than 20% identity had scores ranging from 55 – 170 bits. Except for very long proteins and very large databases, 50 bits of similarity score will always be statistically significant and is a much better rule-of-thumb for inferring homology in protein alignments.

Pearson et al., 2013: An Introduction to Sequence Similarity (“Homology”) Searching

In [54]:
df['protein_match'].value_counts()

Yes    5243
No     1105
Name: protein_match, dtype: int64

In [55]:
df['prot_pair_index']

0        48641291
1        92992745
2       157628663
3       136708305
4       133672542
          ...    
6343     55489429
6344    172293605
6345     47082975
6346    154513027
6347     90211763
Name: prot_pair_index, Length: 6348, dtype: int64

In [56]:
df['index_x']

Unnamed: 0,index_x,index_x.1
0,0,48641291
1,1,92992745
2,2,157628663
3,3,136708305
4,4,133672542
...,...,...
6343,6343,55489429
6344,6344,172293605
6345,6345,47082975
6346,6346,154513027


In [57]:
#get rid of stuff that isn't quantitative

df = df.drop(columns = (['Unnamed: 0', 'index_x', 'thermo_index',
                         'm_protein_seq', 't_protein_seq', 'm_protein_desc', 't_protein_desc']))

In [58]:
df.head()

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,meso_index,prot_pair_index,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,12897,48641291,...,0.995501,0.995501,0.746626,2.433447,0.942154,1.023105,1.047587,0.937454,0.974866,1.104177
1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,13026,92992745,...,0.730477,0.996105,0.996105,1.261733,0.980941,1.056692,0.967775,0.910832,1.147138,0.96657
2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,8203,157628663,...,0.888613,1.33292,0.493674,1.127855,1.046793,1.005878,1.003981,1.034618,1.025785,0.93853
3,0.327273,0.200743,0.214712,166,0.6171,163,0.696581,175,3340,136708305,...,0.849359,2.807602,0.656323,1.323584,0.95025,1.057052,0.957343,0.921186,1.106142,0.987225
4,0.33871,0.318182,0.287671,60,0.909091,71,0.8875,61,14020,133672542,...,0.797894,2.393681,1.329823,1.861752,0.9135,1.051214,1.066765,0.976156,1.111363,0.90832


From pairplot below, it looks like query_align_cov_16s and subject_align_cov_16s dont correlate with target. Let's remove them.

In [59]:
df = df.drop(columns = ['query_align_cov_16s', 'subject_align_cov_16s'])

In [60]:
# sns.pairplot(df.sample(500),height = 4)

In [61]:
df.describe()

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,meso_index,prot_pair_index,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
count,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0,...,6346.0,6044.0,6320.0,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0,6348.0
mean,0.348174,0.276047,0.270881,208.339162,0.792409,207.6046,0.773547,233.339162,8098.207152,90595790.0,...,inf,inf,inf,inf,1.003198,0.999446,1.005768,1.001528,0.998921,1.002744
std,0.09639,0.113913,0.11518,63.261682,0.145697,63.264993,0.175379,186.940699,4565.102004,52502680.0,...,,,,,0.055486,0.052187,0.058699,0.044135,0.04582,0.047034
min,0.193431,0.106383,0.066929,23.0,0.327935,23.0,0.083916,37.0,5.0,8858.0,...,0.0,0.0,0.0,0.0,0.611851,0.611344,0.581262,0.845567,0.832563,0.787056
25%,0.292571,0.205948,0.203093,176.0,0.672716,176.0,0.657086,121.0,4243.25,45301010.0,...,0.819391,0.48156,0.617295,0.820078,0.967586,0.966991,0.96884,0.97215,0.969537,0.971821
50%,0.32372,0.25,0.244565,211.0,0.811727,210.0,0.805808,188.0,8046.0,90856250.0,...,1.051477,0.948316,0.961569,0.990124,1.00055,0.997337,1.00256,1.001085,0.997319,1.001131
75%,0.366709,0.306723,0.300314,241.0,0.925,240.0,0.924258,270.0,12069.0,136028500.0,...,1.363532,1.736024,1.444,1.198135,1.036337,1.029358,1.037681,1.029653,1.028056,1.032546
max,0.95935,0.951613,0.955466,398.0,1.0,398.0,1.0,1565.0,16345.0,181428600.0,...,inf,inf,inf,inf,1.589606,1.72432,1.466216,1.209408,1.564249,1.257702


Split data into dev and test, and then split that into train and validation.

In [62]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference'])

In [63]:
df.columns

Index(['local_gap_compressed_percent_id', 'scaled_local_query_percent_id',
       'scaled_local_symmetric_percent_id', 'query_align_len',
       'query_align_cov', 'subject_align_len', 'subject_align_cov',
       'bit_score', 'm_protein_len', 't_protein_len', 'Jaccard_Score',
       'protein_match', 'index_y', 'AAC_A', 'AAC_C', 'AAC_D', 'AAC_E', 'AAC_F',
       'AAC_G', 'AAC_H', 'AAC_I', 'AAC_K', 'AAC_L', 'AAC_M', 'AAC_N', 'AAC_P',
       'AAC_Q', 'AAC_R', 'AAC_S', 'AAC_T', 'AAC_V', 'AAC_W', 'AAC_Y',
       'GAAC_alphatic', 'GAAC_aromatic', 'GAAC_postivecharge',
       'GAAC_negativecharge', 'GAAC_uncharge', 'index_y',
       'QSOrder_Schneider.Xr.A', 'QSOrder_Schneider.Xr.R',
       'QSOrder_Schneider.Xr.N', 'QSOrder_Schneider.Xr.D',
       'QSOrder_Schneider.Xr.C', 'QSOrder_Schneider.Xr.Q',
       'QSOrder_Schneider.Xr.E', 'QSOrder_Schneider.Xr.G',
       'QSOrder_Schneider.Xr.H', 'QSOrder_Schneider.Xr.I',
       'QSOrder_Schneider.Xr.L', 'QSOrder_Schneider.Xr.K',
       'QSOrder_Sch

In [64]:
df = df.drop(columns=['Jaccard_Score'])

In [65]:
df = df.drop(columns=['index_y'])

In [66]:
df['protein_match']

0       Yes
1       Yes
2       Yes
3       Yes
4       Yes
       ... 
6343    Yes
6344    Yes
6345    Yes
6346    Yes
6347     No
Name: protein_match, Length: 6348, dtype: object

In [102]:
log_cols = df.columns[11:]


In [103]:
df[log_cols] = df[log_cols].apply(lambda x: np.log(x))

In [104]:
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,206,202,...,-0.004509,-0.004509,-0.292191,0.889309,-0.059586,0.022842,0.046489,-0.064588,-0.025455,0.099100
1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,233,237,...,-0.314057,-0.003902,-0.003902,0.232486,-0.019243,0.055143,-0.032755,-0.093397,0.137270,-0.034002
2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,287,252,...,-0.118093,0.287372,-0.705880,0.120318,0.045732,0.005861,0.003973,0.034032,0.025458,-0.063440
3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,234,269,...,-0.163274,1.032331,-0.421103,0.280343,-0.051030,0.055484,-0.043593,-0.082094,0.100878,-0.012858
4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,80,66,...,-0.225780,0.872832,0.285046,0.621518,-0.090472,0.049946,0.064631,-0.024133,0.105587,-0.096159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,370,356,...,-0.073421,-0.421727,0.108901,0.206881,0.042330,-0.002964,-0.039703,0.004543,0.029472,-0.034660
6344,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,298,260,...,-0.182125,-0.016140,0.060821,-0.362662,0.071649,-0.015008,-0.031118,0.052205,-0.049670,0.000276
6345,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,306,303,...,0.527991,-0.319307,-1.130237,-0.646811,-0.055216,-0.007821,0.058595,-0.063804,0.001572,0.061395
6346,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,78,78,...,-0.979701,,0.118911,0.118911,0.070557,0.012262,-0.081734,0.023367,-0.036310,0.017390


In [113]:
inf_cols = df.columns[np.isinf(df).any()]
df = df.drop(columns=inf_cols)

TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [105]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(5395, 82)
(953, 82)
(4585, 82)
(810, 82)


In [106]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [107]:
print(input_features)
print(target)

['local_gap_compressed_percent_id', 'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id', 'query_align_len', 'query_align_cov', 'subject_align_len', 'subject_align_cov', 'bit_score', 'm_protein_len', 't_protein_len', 'AAC_A', 'AAC_C', 'AAC_D', 'AAC_E', 'AAC_F', 'AAC_G', 'AAC_H', 'AAC_I', 'AAC_K', 'AAC_L', 'AAC_M', 'AAC_N', 'AAC_P', 'AAC_Q', 'AAC_R', 'AAC_S', 'AAC_T', 'AAC_V', 'AAC_W', 'AAC_Y', 'GAAC_alphatic', 'GAAC_aromatic', 'GAAC_postivecharge', 'GAAC_negativecharge', 'GAAC_uncharge', 'QSOrder_Schneider.Xr.A', 'QSOrder_Schneider.Xr.R', 'QSOrder_Schneider.Xr.N', 'QSOrder_Schneider.Xr.D', 'QSOrder_Schneider.Xr.C', 'QSOrder_Schneider.Xr.Q', 'QSOrder_Schneider.Xr.E', 'QSOrder_Schneider.Xr.G', 'QSOrder_Schneider.Xr.H', 'QSOrder_Schneider.Xr.I', 'QSOrder_Schneider.Xr.L', 'QSOrder_Schneider.Xr.K', 'QSOrder_Schneider.Xr.M', 'QSOrder_Schneider.Xr.F', 'QSOrder_Schneider.Xr.P', 'QSOrder_Schneider.Xr.S', 'QSOrder_Schneider.Xr.T', 'QSOrder_Schneider.Xr.W', 'QSOrder_Schneider.Xr.Y

In [110]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(5395, 81) (953, 81) (5395, 1) (953, 1)


In [111]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [112]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

ValueError: Input X contains infinity or a value too large for dtype('float64').

Train the model

In [113]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [114]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.8135802469135802
['Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' '

In [115]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fdd4eafedc0>

Convert the above code into functions.

In [79]:
def train_model(dataframe, columns = [],  target = []):
    """
    Takes dataframe and splits it into a training and testing set. 
    Note: Data is called train and test, but this test set is currently
    closer to a validation set. Keeping nomenclature to keep model robust.
    Trains a KNN classifier model with selected data.
    
    Params
    ----------
    dataframe: Pandas dataframe 
    columns: list of strings, representing input features
    target: list of strings, representing target feature(s)

    Returns
    -------
    -Sk-learn model object
    -train data (features)
    -train data (target)
    -validation data (features)
    -validation data (target)
    """
    #split data
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)
    
    #test input arguments
    assert "pandas.core.frame.DataFrame" in str(type(dev))
    assert "pandas.core.frame.DataFrame" in str(type(test))
    assert "str" in str(type(columns[0]))
    assert "str" in str(type(target[0]))
   
    #split into input and output feature(s)
    dev_X = dev[columns].values
    test_X = test[columns].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1)
    
    #scale data
    scaler = sklearn.preprocessing.StandardScaler()
    dev_X = scaler.fit_transform(dev_X)
    test_X = scaler.fit_transform(test_X)
    
    #train model
    model = sklearn.ensemble.RandomForestClassifier()
    model = model.fit(dev_X, dev_y.ravel())
    
    return model, dev_X, dev_y, test_X, test_y
    

In [80]:
model, dev_X, dev_y, test_X, test_y = train_model(df, columns = input_features, 
                                          target='protein_match')

In [81]:
class TestModelTraining(unittest.TestCase):
    
    def test_invalid_inputs(self):
    
        #test that input data type is correct
        
        try:
            train_model([1,2,3], columns = 'string', target = 'string')
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)
    
#     def test_input_distro(self):
        
#         #test that dev and test features have similar Jensen Shannon Distribution
        
#         JSD = (
#             scipy.stats.bootstrap((train_reg(split_data(df)[0], split_data(df)[1],
#                                 columns = input_features, target=target)[1], train_reg(split_data(df)[0], 
#                                 split_data(df)[1], columns = input_features, target=target)[3]), 
#                                   JSD_dev_and_test, n_resamples=1000, 
#                                   batch=5, method='percentile')
#         )

#         div = JSD.confidence_interval[1]
        
#         #asserts that the divergence between data sets is sufficiently low
#         assert abs(div) < 0.3, "Warning! High JSD between dev and test set!"
        
    def test_output_format(self):
        
        #asserts that function returns 4 objects to be assigned to pearson_corr, model, test_X, test_y
        assert len(train_model(df, columns = input_features, 
                                          target = target)) == 5
        

suite = unittest.TestLoader().loadTestsFromTestCase(TestModelTraining)
_ = unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 14.866s

OK


In [46]:
#need to fix this

def evaluate_model(model, test_X, test_y):
    
    """
    Takes a trained model and test data and tests the model.
    
    Params
    ----------
    model: sklearn.neighbors.KNeighborsClassifier
    test_X: numpy array
    test_y: numpy array

    Returns
    -------
    Vector of predictions based on the model (numpy array)
    """
    
    #test input arguments
    assert "sklearn" in str(type(model))
    assert "numpy.ndarray" in str(type(test_X))
    assert "numpy.ndarray" in str(type(test_y))
    
    preds = model.predict(test_X)
    
    return preds

In [47]:
preds = evaluate_model(model, test_X, test_y)
preds

array([False, False, False, ..., False,  True, False])

In [48]:
class TestModelPerformance(unittest.TestCase):

    def test_asserts(self):
        model, _, _, test_X, test_y = train_model(
            df, columns=input_features, target='protein_match'
        )
        # assert that input types are correct
        with self.assertRaises(AssertionError):
            evaluate_model(model, [1, 2, 3], test_y)
            
    def test_model_output(self):
        model, _, _, test_X, test_y = train_model(
            df, columns=input_features, target='protein_match'
        )
        # assert output type is correct
        output = evaluate_model(model, test_X, test_y)
        self.assertIsInstance(output, np.ndarray)
        
    def test_pred_dimension(self):
        model, _, _, test_X, test_y = train_model(df, 
            columns=input_features, target='protein_match'
        )
        # want to check that the number of predictions is equal to the number of test examples
        preds = evaluate_model(model, test_X, test_y)
        self.assertEqual(len(test_y), len(preds))

suite = unittest.TestLoader().loadTestsFromTestCase(TestModelPerformance)
_ = unittest.TextTestRunner().run(suite)


...
----------------------------------------------------------------------
Ran 3 tests in 236.519s

OK


A good test here can be adding outliers and seeing how much the score and predictions change (probably for the above function).

In [49]:
def plot_model(model, test_X, test_y):
    """
    Takes a test KNN Classifier model and plots the confusion matrix.
    
    Params
    ----------
    model: sklearn.neighbors.KNeighborsClassifier
    test_X: numpy array
    test_y: numpy array

    Returns
    -------
    -Confusion predictions vs. observations
    -Model score
    """
    
    #test input arguments
    assert "sklearn" in str(type(model))
    assert "numpy.ndarray" in str(type(test_X))
    assert "numpy.ndarray" in str(type(test_y))
    
    score = model.score(test_X, test_y)
    preds = evaluate_model(model, test_X, test_y)
   
    # plot confusion matrix
    confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
    cm_plot = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix)
    
    cm_plot.plot(cmap=plt.cm.Blues)
    cm_plot.ax_.set_title('Confusion Matrix')
    
    return score
    

In [50]:
#make some appropriate display labels here

plot_model(model, test_X, test_y)

0.9904

In [51]:
#wrapper function

def RF_wrapper(dataframe):
    """
    Takes dataframe and runs it through kNN model.
    
    Params
    ----------
    dataframe: Pandas dataframe 

    Returns
    -------
    -Target feature predictions
    -Parity plot
    """
    
    assert 'pandas.core.frame.DataFrame' in str(type(dataframe))

    #user inputs target feature
    target = 'protein_match'
    
    #define input features
    input_features = [columns for columns in dataframe]
    
    input_features.remove(target)

    #train the model based off data split
    model, dev_X, dev_y, test_X, test_y = train_model(
                                        dataframe, columns=input_features, 
                                        target=target
                                                   )
    
    #test the model and return predictions
    preds = evaluate_model(model, test_X, test_y)

    #plot the results of the model
    plot_model(model, test_X, test_y)
    
    return preds

In [52]:
RF_wrapper(df)

array([False, False, False, ..., False,  True, False])

In [53]:
class TestWrapper(unittest.TestCase):
    
    def test_wrapper_input(self):
        #test that input data type is correct
        try:
            RF_wrapper([1,2,3])
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)

    def test_wrapper_output(self):
        model, _, _, test_X, test_y = train_model(
            df, 
            columns=input_features, target='protein_match'
        )
        # assert output type is correct
        output = evaluate_model(model, test_X, test_y)
        self.assertIsInstance(output, np.ndarray)
        
    def test_output_dimension(self):
        model, _, _, test_X, test_y = train_model(
            df, 
            columns=input_features, target='protein_match'
        )
        # want to check that the number of predictions is equal to the number of test examples
        preds = evaluate_model(model, test_X, test_y)
        self.assertEqual(len(test_y), len(preds))
        

suite = unittest.TestLoader().loadTestsFromTestCase(TestWrapper)
_ = unittest.TextTestRunner().run(suite)

...
----------------------------------------------------------------------
Ran 3 tests in 158.947s

OK


**IGNORE BELOW**

In [54]:
# #plot empirical distribution of scaled_local_query_percent_id

# target = df['protein_match']
# #create histplot
# fig, ax = plt.subplots()

# sns.histplot(data = df, x = target)

# ax.set_title('Sample Histogram', fontsize=16)
# ax.set_xlabel('Class', fontsize=14)
# ax.set_ylabel('Count', fontsize=14)

Things I've tried to improve model: 

1. drop bit scores over 1000
2. Switch to 85/15 train/test split
3. Lasso regression - not great
4. Ridge regression - R2 = 0.87, about the same as normal Linear regression
5. KNN regression (n_neighbors optimized at 8) gives best result, R2 = ~0.93 --> 
*got it up to .942 by removing a few features
6. Decision tree regressor was slightly worse than KNN
7. Input DT classifier and RF classifier. RF classifier has best performance
8. Changed target to binary protein function match