In [1]:
#imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.neighbors
import duckdb
import unittest

In [2]:
cd /Users/loganroberts/Learn2Therm/ValidProt/data

/Users/loganroberts/Learn2Therm/ValidProt/data


In [3]:
df = pd.read_csv('learn2therm_sample_50k/learn2therm_sample_50k.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,bit_score_16s,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,875,...,1153.0,27.5,50.0,22.5,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLV...,ECF RNA polymerase sigma factor SigK,sigma-70 family RNA polymerase sigma factor,206,202
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,1014.0,25.0,54.0,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,875,...,1138.0,28.0,50.0,22.0,MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAE...,MTPEQIFSGQTAIVTGGASGIGAATVEHIARRGGRVFSVDLSYDSP...,SDR family oxidoreductase,SDR family oxidoreductase,287,252
3,3,0.327273,0.200743,0.214712,166,0.6171,163,0.696581,175,875,...,1077.0,28.0,50.0,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269
4,4,0.33871,0.318182,0.287671,60,0.909091,71,0.8875,61,9827,...,991.0,30.0,50.0,20.0,MIISLRRGLRFIRFIVFFAALVYLFYHVLDLFNGWISPVDQYQMPT...,MKRMVWRTLKVFIIFIACTLLFYFGLRFMHLEYEQFHRYEPPEGPA...,YqzK family protein,YqzK family protein,80,66


**Subcomponent 1**: Test for pandas dataframe input 

**Use case**: User takes data from component 4 (where data is processed into pandas dataframe) and wants to pass it into relationship component.

In [4]:
#CODE - Function 1-1

def check_input_type(dataframe):
    """
    Takes in input dataframe and asserts that it is the correct data type.
    """
    assert "pandas.core.frame.DataFrame" in str(type(dataframe)), 'Not a pandas dataframe!'

In [5]:
#TEST CODE 1-1

import unittest

#unit tests - function 1 
class TestInputType(unittest.TestCase):
    
    def test_input_type(self): 
        """
        Tests that input data is a pandas dataframe.
        
        """
        try:
            check_input_type([4,3])
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)
        
suite = unittest.TestLoader().loadTestsFromTestCase(TestInputType)
_ = unittest.TextTestRunner().run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 0.001s

OK


***
***

**Subcomponent 2**: Checks that input data is cleaned property (does it have all of the features we need, and are the features we don't need removed).

**Use case**: Input data does not include local E value, which we need as an input to our model.

In [6]:
#CODE - Function 2-1

def clean_input_columns(dataframe):
    """
    We want to clean certain columns out of the Pfam dataframe.
    Need to eliminate identifier columns + columns that don't have linear relationship with bit score.
    
    Input: Pandas dataframe (from Pfam)
    Output: Updated dataframe.
    """
    
    for title in dataframe:
        if title not in ['bit_score','local_gap_compressed_percent_id','scaled_local_query_percent_id',
                      'scaled_local_symmetric_percent_id','query_align_len', 'query_align_cov',
                      'subject_align_len', 'subject_align_cov', 'm_protein_len', 't_protein_len']:
            dataframe = dataframe.drop(columns = title)
        else:
            pass
    
    return dataframe

In [7]:
#CODE - Function 2-2

def verify_input_columns(dataframe):
    for title in ['bit_score','local_gap_compressed_percent_id','scaled_local_query_percent_id',
                      'scaled_local_symmetric_percent_id','query_align_len', 'query_align_cov',
                      'subject_align_len', 'subject_align_cov', 'm_protein_len', 't_protein_len']:
        
        if title not in dataframe:
            raise KeyError
        else:
            pass
    
    return dataframe

In [8]:
#TESTS - 2-1/2

import unittest

class TestInputCleaning(unittest.TestCase):

    #pass through some titles that should not be in the dataframe
    def test_input_cleaning(self):
        for title in ['Unnamed: 0','m_seq', 't_seq', 'prot_pair_index']:
            assert title not in clean_input_columns(df)
    
    def test_column_verification(self):
        
        try:
            verify_input_columns(df.drop(columns='meso_ogt'))
            self.assertTrue(False)
        except KeyError:
            self.assertTrue(True)
            
            
suite = unittest.TestLoader().loadTestsFromTestCase(TestInputCleaning)
_ = unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.320s

OK


In [9]:
#CODE - Function 2-3

def check_input_NANs(dataframe):
    """
    Checks for NaN values in input dataframe. Removes rows with NaN values present.

    Input: Pandas dataframe
    Output: Pandas dataframe

    """
    has_nan = dataframe.isna().any().any()
    nan_rows = dataframe[dataframe.isna().any(axis=1)]

    if has_nan:
        print('Dataframe has {} rows with NaN values!'.format(len(nan_rows)))
    else:
        print("DataFrame does not have any NaN values.")

    #Drop rows with NaN's
    dataframe = dataframe.dropna()
    print('Dataframe now has {} rows.'.format(len(dataframe)))

    return dataframe

In [10]:
#TEST 2-3

import unittest

class TestForNans(unittest.TestCase):

    def test_input_Nans(self):
        df['another_column'] = pd.DataFrame([np.nan for i in range(len(df))])
        
        assert check_input_NANs(df).isna().any().any() == False
            
suite = unittest.TestLoader().loadTestsFromTestCase(TestForNans)
_ = unittest.TextTestRunner().run(suite)

.

Dataframe has 50000 rows with NaN values!
Dataframe now has 0 rows.



----------------------------------------------------------------------
Ran 1 test in 0.121s

OK


In [11]:
#FUNCTION 2-4

def verify_protein_pairs(dataframe):
    """
    Checks that input data has two protein sequences. Will need to generalize this function other data sets 
    to simply make sure two sequences are entered. Code below is for our protein database
    """
    assert 'm_protein_len' in dataframe, 'Dataframe missing mesophillic sequence!'
    assert 't_protein_len' in dataframe, 'Dataframe missing thermophillic sequence!'
    
    print('OK!')
    return dataframe


In [12]:
verify_protein_pairs(df)

OK!


Unnamed: 0.1,Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,thermo_index,...,m_ogt,t_ogt,ogt_difference,m_protein_seq,t_protein_seq,m_protein_desc,t_protein_desc,m_protein_len,t_protein_len,another_column
0,0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,875,...,27.5,50.0,22.5,MAESGTSRRADHLVPVPGPDAEPPAVADELLRAVGRGDEQAFGRLY...,MPSQITESERIELAERFERDALPLLDQLYSAALRMTRNPADAEDLV...,ECF RNA polymerase sigma factor SigK,sigma-70 family RNA polymerase sigma factor,206,202,
1,1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,11324,...,25.0,54.0,29.0,MARIALVDDDRNILTSVSMTLEAEGFEVETYNDGQSALDAFNKRMP...,MRVLLVEDDPNTSRSIEMMLTHANLNVYATDMGEEGIDLAKLYDYD...,response regulator transcription factor,response regulator transcription factor,233,237,
2,2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,875,...,28.0,50.0,22.0,MKDTVVFVTGAARGIGAHTARLAVARGARVALVGLEPHLLADLAAE...,MTPEQIFSGQTAIVTGGASGIGAATVEHIARRGGRVFSVDLSYDSP...,SDR family oxidoreductase,SDR family oxidoreductase,287,252,
3,3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,875,...,28.0,50.0,22.0,MTSGLWERVLDGVWVTIQLLVLSALLATAVSFVVGIARTHRLWIVR...,MAMSRRKRGQLARGIQYAILVIVVVVLALLADWGKIGKAFFDWEAA...,ectoine/hydroxyectoine ABC transporter permeas...,amino acid ABC transporter permease,234,269,
4,4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,9827,...,30.0,50.0,20.0,MIISLRRGLRFIRFIVFFAALVYLFYHVLDLFNGWISPVDQYQMPT...,MKRMVWRTLKVFIIFIACTLLFYFGLRFMHLEYEQFHRYEPPEGPA...,YqzK family protein,YqzK family protein,80,66,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,49995,0.344828,0.169972,0.192308,180,0.509915,200,0.738007,53,14963,...,28.0,52.5,24.5,MDNATFRLGDDLSVRLPGHSRWIGQVEREQRWLPWLAPRLPLTVST...,MPPQPRPLRPNDPREIGGFALLGRLGEGGQGTVYLGGAPDGRRVAV...,aminoglycoside phosphotransferase family protein,serine/threonine protein kinase,271,353,
49996,49996,0.417989,0.389163,0.392060,190,0.935961,187,0.935000,314,7134,...,26.5,52.5,26.0,MFRTGVKAEIGRSLAVVGEAEDVERAVRVVLEQRPDVVLLDVHLPG...,MILEAEPDIVVVGEAGDGEKAVEEARALQPDVVLMDIRMPRKDGVE...,response regulator transcription factor,response regulator transcription factor,200,203,
49997,49997,0.352174,0.239645,0.242152,230,0.680473,227,0.685801,279,7134,...,28.0,52.5,24.5,MIRLAELTKTYPGQQHPAVDGISMEVAEGEIVVLVGPSGCGKTTTL...,MTEQPILSARGLTVDFRLRGGRRARAVDGVDLDLAPGEVLALAGES...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,331,338,
49998,49998,0.334764,0.331915,0.329810,232,0.987234,230,0.966387,281,11324,...,29.0,54.0,25.0,MSESHAGALLSVRGLTAGYGGATALDGVSLTVAAGETVALLGANGA...,MSLLTTSGLTRHFSGIHAVEGVDFTLEAGEIRALIGSNGAGKTTLV...,ABC transporter ATP-binding protein,ABC transporter ATP-binding protein,238,235,


In [13]:
#TEST 2-4 
import unittest

class TestProteinPairs(unittest.TestCase):
    
    def test_protein_pair(self):

        try:
            verify_protein_pairs(df.drop(columns = (['m_protein_len', 't_protein_len'])))
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)
            
suite = unittest.TestLoader().loadTestsFromTestCase(TestProteinPairs)
_ = unittest.TextTestRunner().run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 0.011s

OK


***
***