The purpose of this notebook is to document data exploration processes with an n=50,000 sample of our Learn2Therm protein database. Doing so provides a workspace to select and optimize a model for predicting protein pair functionality between two sequences. Several classifier architectures were tested prior to the release of the final version of this component.

This notebook includes exploration of a Random Forest Classifier.

Problems:

1) get_protein_descriptors returns a dictionary of dataframes. If one descriptor is passed, it just returns a dictionary. Need to ensure that output is a dataframe regardless of how many descriptors are passed
2) Need to append dataframe as new descriptors are added, making function more robust
3) Need to add code to deal with fasta format in get protein descriptors
4) need to write unit tests

In [36]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import io
from io import StringIO

In [37]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [38]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')

In [39]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [40]:
def get_fasta_from_dataframe(dataframe, output_file:str):
    #adjust this to write function with BioPython

    with open(output_file, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['meso_index']), row['m_protein_seq']))
    return output_file

In [41]:
def get_protein_descriptors(fasta:str, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with protein sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [42]:
#this function is only necessary if we get a fasta file with a descriptor generated from component 3

def remove_fasta_description(filename:str):
    
    """
    Removes description from fasta file so that iProtein can read the input.
    Might not be necessary in our current format.
    """
    
    #assign unwanted string to object
    string_to_remove = "<unknown description>"

    #open file
    with open(filename, "r") as file:
        content = file.read()
    
    # Remove the string
    new_content = content.replace(string_to_remove, "")

    #overwrite file without string
    with open(filename, "w") as file:
        seq = file.write(new_content)
        
    return seq

In [43]:
#combine both of the above functions

def fasta_to_descriptors(fasta:str, descriptors=[]):

    #remove description from fasta file
    remove_fasta_description(fasta)
    
    #return protein descriptors
    return get_protein_descriptors(fasta, descriptors=descriptors)

In [44]:
def create_new_dataframe(dataframe, output_file, descriptors=[]):
    """
    Creates new dataframe with descriptors added.
    """
    
    fasta = get_fasta_from_dataframe(dataframe, output_file)
    
    feature_dict = get_protein_descriptors(fasta, descriptors)
    
    df = dataframe.reset_index()
    
    for desc in descriptors:
        
        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()
        
        df = pd.merge(df, features, how='outer', left_index=True, right_index=True)
        
    return df

In [45]:
df = create_new_dataframe(df, 'meso_50k.fasta', descriptors=['AAC', 'GAAC'])

File imported successfully.


In [12]:
# write the dataframe to a fasta file
#adjust this to write function with BioPython

with open('meso_50k.fasta', 'w') as f:
    for i, row in df.iterrows():
        f.write('>{}\n{}\n'.format((row['meso_index']), row['m_protein_seq']))


Let's do the same thing with the whole database:

In [13]:
df_l2t = get_protein_descriptors('meso_50k.fasta', descriptors=['AAC','GAAC'])['AAC']
df_l2t.index = df_l2t.index.astype(int)
df_l2t = df_l2t.reset_index()

In [14]:
indexed_df = df.set_index(('meso_index'))
indexed_df = indexed_df.reset_index()

In [15]:
df= pd.merge(indexed_df, df_l2t, how='outer', left_index=True, right_index=True)
df

Unnamed: 0.1,meso_index,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,...,AAC_M,AAC_N,AAC_P,AAC_Q,AAC_R,AAC_S,AAC_T,AAC_V,AAC_W,AAC_Y
0,12897,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,...,0.009709,0.004854,0.053398,0.038835,0.131068,0.033981,0.048544,0.106796,0.009709,0.029126
1,13026,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,...,0.055794,0.025751,0.025751,0.042918,0.090129,0.042918,0.047210,0.081545,0.004292,0.030043
2,8203,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,...,0.010453,0.020906,0.034843,0.013937,0.104530,0.031359,0.055749,0.114983,0.010453,0.017422
3,3340,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,...,0.029915,0.008547,0.047009,0.025641,0.051282,0.055556,0.051282,0.094017,0.029915,0.025641
4,14020,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,...,0.050000,0.037500,0.050000,0.037500,0.087500,0.037500,0.025000,0.087500,0.037500,0.062500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,8772,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,...,0.011070,0.014760,0.081181,0.022140,0.073801,0.036900,0.055351,0.081181,0.036900,0.007380
49996,1395,49996,0.417989,0.389163,0.392060,190,0.935961,187,0.935000,314,...,0.010000,0.005000,0.025000,0.020000,0.120000,0.060000,0.030000,0.150000,0.005000,0.015000
49997,8513,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,...,0.015106,0.012085,0.060423,0.057402,0.063444,0.030211,0.057402,0.078550,0.009063,0.012085
49998,4295,49998,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,...,0.016807,0.008403,0.046218,0.016807,0.096639,0.071429,0.029412,0.088235,0.004202,0.008403


Scratch work for above function

In [16]:
sequence = "meso_input_copy.fasta.txt"
string_to_remove = "<unknown description>"

with open(sequence, "r") as file:
    content = file.read()
    
# Remove the string
new_content = content.replace(string_to_remove, "")

with open(sequence, "w") as file:
    file.write(new_content)


FileNotFoundError: [Errno 2] No such file or directory: 'meso_input_copy.fasta.txt'

In [None]:
#just figured out that iFeature cannot read the <unknown descriptor> part of the fasta sequence


protein = iFeatureOmegaCLI.iProtein('meso_input_copy.fasta.txt')

In [None]:
protein.display_feature_types()

In [None]:
df['t_protein_len'].describe()

The bit-score provides a better rule-of-thumb for inferring homology. For average length proteins, a bit score of 50 is almost always significant. A bit score of 40 is only significant (E() < 0.001) in searches of protein databases with fewer than 7000 entries. Increasing the score by 10 bits increases the significance 210=1000-fold, so 50 bits would be significant in a database with less than 7 million entries (10 times SwissProt, and within a factor of 3 of the largest protein databases). Thus, the NCBI Blast web site uses a color code of blue for alignment with scores between 40–50 bits; and green for scores between 50–80 bits. In the yeast vs human example, the alignments with less than 20% identity had scores ranging from 55 – 170 bits. Except for very long proteins and very large databases, 50 bits of similarity score will always be statistically significant and is a much better rule-of-thumb for inferring homology in protein alignments.

Pearson et al., 2013: An Introduction to Sequence Similarity (“Homology”) Searching

In [46]:
df['protein_match'] = ((df['bit_score'] > 50 ) & (df['query_align_cov'] > 0.8))

In [47]:
df['protein_match'].value_counts()

True     25887
False    24113
Name: protein_match, dtype: int64

In [48]:
df

Unnamed: 0.1,index_x,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,...,AAC_V,AAC_W,AAC_Y,index,GAAC_alphatic,GAAC_aromatic,GAAC_postivecharge,GAAC_negativecharge,GAAC_uncharge,protein_match
0,0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,...,0.106796,0.009709,0.029126,12897,0.451456,0.053398,0.165049,0.140777,0.189320,False
1,1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,...,0.081545,0.004292,0.030043,13026,0.412017,0.064378,0.154506,0.184549,0.184549,True
2,2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,...,0.114983,0.010453,0.017422,8203,0.543554,0.059233,0.135889,0.097561,0.163763,True
3,3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,...,0.094017,0.029915,0.025641,3340,0.551282,0.111111,0.094017,0.055556,0.188034,False
4,4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,...,0.087500,0.037500,0.062500,14020,0.425000,0.200000,0.112500,0.075000,0.187500,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,...,0.081181,0.036900,0.007380,8772,0.468635,0.081181,0.099631,0.132841,0.217712,False
49996,49996,49996,0.417989,0.389163,0.392060,190,0.935961,187,0.935000,314,...,0.150000,0.005000,0.015000,1395,0.485000,0.050000,0.170000,0.155000,0.140000,True
49997,49997,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,...,0.078550,0.009063,0.012085,8513,0.510574,0.039275,0.108761,0.120846,0.220544,False
49998,49998,49998,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,...,0.088235,0.004202,0.008403,4295,0.567227,0.042017,0.117647,0.100840,0.172269,True


In [49]:
#get rid of stuff that isn't quantitative

df = df.drop(columns = ['Unnamed: 0','thermo_index', 'm_protein_seq', 't_protein_seq',
                        'm_protein_desc', 't_protein_desc'])

In [50]:
df.head()

Unnamed: 0,index_x,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,meso_index,...,AAC_V,AAC_W,AAC_Y,index,GAAC_alphatic,GAAC_aromatic,GAAC_postivecharge,GAAC_negativecharge,GAAC_uncharge,protein_match
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,12897,...,0.106796,0.009709,0.029126,12897,0.451456,0.053398,0.165049,0.140777,0.18932,False
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,13026,...,0.081545,0.004292,0.030043,13026,0.412017,0.064378,0.154506,0.184549,0.184549,True
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,8203,...,0.114983,0.010453,0.017422,8203,0.543554,0.059233,0.135889,0.097561,0.163763,True
3,3,0.327273,0.200743,0.214712,166,0.6171,163,0.696581,175,3340,...,0.094017,0.029915,0.025641,3340,0.551282,0.111111,0.094017,0.055556,0.188034,False
4,4,0.33871,0.318182,0.287671,60,0.909091,71,0.8875,61,14020,...,0.0875,0.0375,0.0625,14020,0.425,0.2,0.1125,0.075,0.1875,True


From pairplot below, it looks like query_align_cov_16s and subject_align_cov_16s dont correlate with target. Let's remove them.

In [51]:
# sns.pairplot(df.sample(500),height = 4)

In [52]:
df = df.drop(columns = ['query_align_cov_16s', 'subject_align_cov_16s'])

In [53]:
df.describe()

Unnamed: 0,index_x,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,meso_index,...,AAC_T,AAC_V,AAC_W,AAC_Y,index,GAAC_alphatic,GAAC_aromatic,GAAC_postivecharge,GAAC_negativecharge,GAAC_uncharge
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,24999.5,0.349026,0.276478,0.270792,206.01166,0.791507,205.41492,0.770875,230.63206,8098.85812,...,0.057258,0.088333,0.009929,0.019106,8098.85812,0.477431,0.059405,0.126355,0.118392,0.218418
std,14433.901067,0.096608,0.114404,0.116039,64.260901,0.144893,64.430548,0.179955,186.44859,4608.982941,...,0.016214,0.022656,0.008839,0.011123,4608.982941,0.052699,0.026374,0.028722,0.030594,0.033043
min,0.0,0.164103,0.081301,0.045977,19.0,0.284553,19.0,0.082019,35.0,5.0,...,0.0,0.0,0.0,0.0,5.0,0.183673,0.0,0.0,0.0,0.024735
25%,12499.75,0.29375,0.204545,0.20202,172.0,0.673288,172.0,0.656854,118.0,4121.0,...,0.046296,0.072874,0.00369,0.011494,4121.0,0.443983,0.041667,0.109966,0.106122,0.196787
50%,24999.5,0.325,0.25,0.244648,210.0,0.811321,209.0,0.806584,186.0,8046.0,...,0.056225,0.0875,0.007905,0.017391,8046.0,0.477273,0.054264,0.128492,0.121569,0.217791
75%,37499.25,0.367257,0.3083,0.301639,240.0,0.923077,240.0,0.923077,267.0,12103.0,...,0.067114,0.103226,0.014409,0.025,12103.0,0.510417,0.070755,0.143918,0.136519,0.239057
max,49999.0,0.985075,0.985075,0.985075,400.0,1.0,399.0,1.0,1788.0,16564.0,...,0.208696,0.277778,0.107143,0.156863,16564.0,0.876325,0.252336,0.625,0.411111,0.533333


Split data into dev and test, and then split that into train and validation.

In [54]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference'])

In [55]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(42500, 39)
(7500, 39)
(36125, 39)
(6375, 39)


In [56]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [57]:
print(input_features)
print(target)

['index_x', 'local_gap_compressed_percent_id', 'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id', 'query_align_len', 'query_align_cov', 'subject_align_len', 'subject_align_cov', 'bit_score', 'm_protein_len', 't_protein_len', 'index_y', 'AAC_A', 'AAC_C', 'AAC_D', 'AAC_E', 'AAC_F', 'AAC_G', 'AAC_H', 'AAC_I', 'AAC_K', 'AAC_L', 'AAC_M', 'AAC_N', 'AAC_P', 'AAC_Q', 'AAC_R', 'AAC_S', 'AAC_T', 'AAC_V', 'AAC_W', 'AAC_Y', 'index', 'GAAC_alphatic', 'GAAC_aromatic', 'GAAC_postivecharge', 'GAAC_negativecharge', 'GAAC_uncharge']
protein_match


In [58]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(42500, 38) (7500, 38) (42500, 1) (7500, 1)


In [59]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [60]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [61]:
#Random Forest

model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)
# model = sklearn.ensemble.RandomForestClassifier()
model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [62]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.9927843137254901
[False False False ... False  True False]


In [74]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fea041546d0>

Convert the above code into functions.

In [66]:
def train_model(dataframe, columns = [],  target = []):
    """
    Takes dataframe and splits it into a training and testing set. 
    Note: Data is called train and test, but this test set is currently
    closer to a validation set. Keeping nomenclature to keep model robust.
    Trains a KNN classifier model with selected data.
    
    Params
    ----------
    dataframe: Pandas dataframe 
    columns: list of strings, representing input features
    target: list of strings, representing target feature(s)

    Returns
    -------
    -Sk-learn model object
    -train data (features)
    -train data (target)
    -validation data (features)
    -validation data (target)
    """
    #split data
    dev, test = sklearn.model_selection.train_test_split(dataframe, test_size=0.15, random_state=1)
    
    #test input arguments
    assert "pandas.core.frame.DataFrame" in str(type(dev))
    assert "pandas.core.frame.DataFrame" in str(type(test))
    assert "str" in str(type(columns[0]))
    assert "str" in str(type(target[0]))
   
    #split into input and output feature(s)
    dev_X = dev[columns].values
    test_X = test[columns].values

    dev_y = dev[target].values.reshape(-1,1)
    test_y = test[target].values.reshape(-1,1)
    
    #scale data
    scaler = sklearn.preprocessing.StandardScaler()
    dev_X = scaler.fit_transform(dev_X)
    test_X = scaler.fit_transform(test_X)
    
    #train model
    model = sklearn.ensemble.RandomForestClassifier()
    model = model.fit(dev_X, dev_y.ravel())
    
    return model, dev_X, dev_y, test_X, test_y
    

In [67]:
model, dev_X, dev_y, test_X, test_y = train_model(df, columns = input_features, 
                                          target='protein_match')

In [68]:
class TestModelTraining(unittest.TestCase):
    
    def test_invalid_inputs(self):
    
        #test that input data type is correct
        
        try:
            train_model([1,2,3], columns = 'string', target = 'string')
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)
    
#     def test_input_distro(self):
        
#         #test that dev and test features have similar Jensen Shannon Distribution
        
#         JSD = (
#             scipy.stats.bootstrap((train_reg(split_data(df)[0], split_data(df)[1],
#                                 columns = input_features, target=target)[1], train_reg(split_data(df)[0], 
#                                 split_data(df)[1], columns = input_features, target=target)[3]), 
#                                   JSD_dev_and_test, n_resamples=1000, 
#                                   batch=5, method='percentile')
#         )

#         div = JSD.confidence_interval[1]
        
#         #asserts that the divergence between data sets is sufficiently low
#         assert abs(div) < 0.3, "Warning! High JSD between dev and test set!"
        
    def test_output_format(self):
        
        #asserts that function returns 4 objects to be assigned to pearson_corr, model, test_X, test_y
        assert len(train_model(df, columns = input_features, 
                                          target = target)) == 5
        

suite = unittest.TestLoader().loadTestsFromTestCase(TestModelTraining)
_ = unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 8.896s

OK


In [69]:
#need to fix this

def evaluate_model(model, test_X, test_y):
    
    """
    Takes a trained model and test data and tests the model.
    
    Params
    ----------
    model: sklearn.neighbors.KNeighborsClassifier
    test_X: numpy array
    test_y: numpy array

    Returns
    -------
    Vector of predictions based on the model (numpy array)
    """
    
    #test input arguments
    assert "sklearn" in str(type(model))
    assert "numpy.ndarray" in str(type(test_X))
    assert "numpy.ndarray" in str(type(test_y))
    
    preds = model.predict(test_X)
    
    return preds

In [70]:
preds = evaluate_model(model, test_X, test_y)
preds

array([False, False, False, ..., False,  True, False])

In [71]:
class TestModelPerformance(unittest.TestCase):

    def test_asserts(self):
        model, _, _, test_X, test_y = train_model(
            df, columns=input_features, target='protein_match'
        )
        # assert that input types are correct
        with self.assertRaises(AssertionError):
            evaluate_model(model, [1, 2, 3], test_y)
            
    def test_model_output(self):
        model, _, _, test_X, test_y = train_model(
            df, columns=input_features, target='protein_match'
        )
        # assert output type is correct
        output = evaluate_model(model, test_X, test_y)
        self.assertIsInstance(output, np.ndarray)
        
    def test_pred_dimension(self):
        model, _, _, test_X, test_y = train_model(df, 
            columns=input_features, target='protein_match'
        )
        # want to check that the number of predictions is equal to the number of test examples
        preds = evaluate_model(model, test_X, test_y)
        self.assertEqual(len(test_y), len(preds))

suite = unittest.TestLoader().loadTestsFromTestCase(TestModelPerformance)
_ = unittest.TextTestRunner().run(suite)


...
----------------------------------------------------------------------
Ran 3 tests in 24.048s

OK


A good test here can be adding outliers and seeing how much the score and predictions change (probably for the above function).

In [72]:
def plot_model(model, test_X, test_y):
    """
    Takes a test KNN Classifier model and plots the confusion matrix.
    
    Params
    ----------
    model: sklearn.neighbors.KNeighborsClassifier
    test_X: numpy array
    test_y: numpy array

    Returns
    -------
    -Confusion predictions vs. observations
    -Model score
    """
    
    #test input arguments
    assert "sklearn" in str(type(model))
    assert "numpy.ndarray" in str(type(test_X))
    assert "numpy.ndarray" in str(type(test_y))
    
    score = model.score(test_X, test_y)
    preds = evaluate_model(model, test_X, test_y)
   
    # plot confusion matrix
    confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
    cm_plot = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix)
    
    cm_plot.plot(cmap=plt.cm.Blues)
    cm_plot.ax_.set_title('Confusion Matrix')
    
    return score
    

In [73]:
#make some appropriate display labels here

plot_model(model, test_X, test_y)

0.9957333333333334

In [None]:
#wrapper function

def RF_wrapper(dataframe):
    """
    Takes dataframe and runs it through kNN model.
    
    Params
    ----------
    dataframe: Pandas dataframe 

    Returns
    -------
    -Target feature predictions
    -Parity plot
    """
    
    assert 'pandas.core.frame.DataFrame' in str(type(dataframe))

    #user inputs target feature
    target = 'protein_match'
    
    #define input features
    input_features = [columns for columns in dataframe]
    
    input_features.remove(target)

    #train the model based off data split
    model, dev_X, dev_y, test_X, test_y = train_model(
                                        dataframe, columns=input_features, 
                                        target=target
                                                   )
    
    #test the model and return predictions
    preds = evaluate_model(model, test_X, test_y)

    #plot the results of the model
    plot_model(model, test_X, test_y)
    
    return preds

In [None]:
RF_wrapper(df)

In [None]:
class TestWrapper(unittest.TestCase):
    
    def test_wrapper_input(self):
        #test that input data type is correct
        try:
            RF_wrapper([1,2,3])
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)

    def test_wrapper_output(self):
        model, _, _, test_X, test_y = train_model(
            df, 
            columns=input_features, target='protein_match'
        )
        # assert output type is correct
        output = evaluate_model(model, test_X, test_y)
        self.assertIsInstance(output, np.ndarray)
        
    def test_output_dimension(self):
        model, _, _, test_X, test_y = train_model(
            df, 
            columns=input_features, target='protein_match'
        )
        # want to check that the number of predictions is equal to the number of test examples
        preds = evaluate_model(model, test_X, test_y)
        self.assertEqual(len(test_y), len(preds))
        

suite = unittest.TestLoader().loadTestsFromTestCase(TestWrapper)
_ = unittest.TextTestRunner().run(suite)

**IGNORE BELOW**

In [None]:
# #plot empirical distribution of scaled_local_query_percent_id

# target = df['protein_match']
# #create histplot
# fig, ax = plt.subplots()

# sns.histplot(data = df, x = target)

# ax.set_title('Sample Histogram', fontsize=16)
# ax.set_xlabel('Class', fontsize=14)
# ax.set_ylabel('Count', fontsize=14)

Things I've tried to improve model: 

1. drop bit scores over 1000
2. Switch to 85/15 train/test split
3. Lasso regression - not great
4. Ridge regression - R2 = 0.87, about the same as normal Linear regression
5. KNN regression (n_neighbors optimized at 8) gives best result, R2 = ~0.93 --> 
*got it up to .942 by removing a few features
6. Decision tree regressor was slightly worse than KNN
7. Input DT classifier and RF classifier. RF classifier has best performance
8. Changed target to binary protein function match