In [1]:
#imports
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.neighbors
import duckdb
import unittest

In [2]:
df = pd.read_csv("/Users/loganroberts/Learn2Therm/ValidProt/data/Sample.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,prot_pair_index,meso_seq,thermo_seq,meso_ogt,thermo_ogt,scaled_local_symmetric_percent_id,local_E_value,scaled_local_query_percent_id,local_gap_compressed_percent_id
0,1256842,126227630,MLLSDRDLVSEIKSGDLSLEPFEPALLQPSSIDVRLDRFFRVFNNH...,MLLSDRDLRKELESGRLELDPFDPAMLQPSSIDVRLDRFFRVFDNT...,27.5,45.0,0.777202,0.0,0.773196,0.802139
1,1456567,169784592,MRFEGTSGYVATDDLKVAVNAAIALERPLLVKGEPGTGKTVLAVEV...,MKFTGSDSYVATEDLMIAVNAAVTLERPLLVKGEPGTGKTELARQV...,30.0,54.0,0.782143,0.0,0.784946,0.784946
2,874464,31933768,MAYETINVDVQDHVCLIKLHRPEALNALNAALVSELCTALEEADAS...,MAYKTIIVEIEDHVALIKLNRPEALNALNSELLGELAQAVTEADAN...,19.5,54.0,0.775194,0.0,0.775194,0.775194
3,560201,32409414,MAIRKYKPTTPGRRGSSVADFAEITRSTPEKSLLRPLSKTGGRNNQ...,MGIRKYKPTTPGRRGASVADFVELTRREPEKSLLRPLPKKGGRNNR...,28.0,52.5,0.78777,0.0,0.790614,0.802198
4,33257,175862226,MLQRLQDRVAVVTGGGSGIGLATVRRFAAEGAKVVVADIDAAAGEA...,MSEDIICRRLTGRTAVVTGAGSGIGLASARRLASEGANVVCADVDE...,28.0,45.0,0.78835,0.0,0.780769,0.802372


# Software Component Five: s5.0_relation.py

**Params:** 

**Inputs:** Pandas Dataframe containing Pfam return data. Includes quantitative features (ID, some metric of percent similarlity) and string of amino acid sequence.

**Outputs:** Quantitative functional similarlity metric.

**Metrics:**

**Packages:** pandas, numpy, scipy, seaborn, fuzzywuzzy, unittest

***
***

**Subcomponent 1**: Test for pandas dataframe input (**ALREADY TESTED WITH CODE**)

**Use case**: User takes data from component 4 (where data is processed into pandas dataframe) and wants to pass it into relationship component.

```

def check_input_type(dataframe):
    tests that input data is a pandas dataframe with assert statement. 
    assert "pandas.core.frame.DataFrame" in str(type(dataframe)) 
    Output should pass unless assert statement fails.

```

In [3]:
#CODE - Function 1-1

def check_input_type(dataframe):
    """
    Takes in input dataframe and asserts that it is the correct data type.
    """
    assert "pandas.core.frame.DataFrame" in str(type(dataframe)), 'Not a pandas dataframe!'

In [4]:
#TEST CODE 1-1

import unittest

#unit tests - function 1 
class TestInputType(unittest.TestCase):
    
    def test_input_type(self): 
        """
        Tests that input data is a pandas dataframe.
        
        """
        try:
            check_input_type([4,3])
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)
        
suite = unittest.TestLoader().loadTestsFromTestCase(TestInputType)
_ = unittest.TextTestRunner().run(suite)

.
----------------------------------------------------------------------
Ran 1 test in 0.001s

OK


***
***

**Subcomponent 2**: Checks that input data is cleaned property (does it have all of the features we need, and are the features we don't need removed).

**Use case**: Input data does not include local E value, which we need as an input to our model.

In [5]:
#CODE - Function 2-1

def clean_input_columns(dataframe):
    """
    We want to clean certain columns out of the Pfam dataframe. For now, let's ensure that the dataframe 
    is missing 'Unnamed: 0', 'meso_seq', 'thermo_seq', and 'prot_pair_index'.
    
    Input: Pandas dataframe (from Pfam)
    Output: Updated dataframe.
    """
    
    for title in ['Unnamed: 0','meso_seq', 'thermo_seq', 'prot_pair_index']:
        if title in dataframe:
            dataframe = dataframe.drop(columns = title)
        else:
            pass
    
    return dataframe

In [6]:
#CODE - Function 2-2

def verify_input_columns(dataframe):
    for title in ['meso_ogt', 'thermo_ogt', 'scaled_local_symmetric_percent_id',
                  'local_E_value', 'scaled_local_query_percent_id', 'local_gap_compressed_percent_id']:
        
        if title not in dataframe:
            raise KeyError
        else:
            pass
    
    return dataframe

In [7]:
#TESTS - 2-1/2

import unittest

class TestInputCleaning(unittest.TestCase):

    
    def test_input_cleaning(self):
        for title in ['Unnamed: 0','meso_seq', 'thermo_seq', 'prot_pair_index']:
            assert title not in clean_input_columns(df)
    
    def test_column_verification(self):
        
        try:
            verify_input_columns(df.drop(columns='meso_ogt'))
            self.assertTrue(False)
        except KeyError:
            self.assertTrue(True)
            
            
suite = unittest.TestLoader().loadTestsFromTestCase(TestInputCleaning)
_ = unittest.TextTestRunner().run(suite)

..
----------------------------------------------------------------------
Ran 2 tests in 0.020s

OK


In [8]:
#CODE - Function 2-3

def check_input_NANs(dataframe):
    """
    Checks for NaN values in input dataframe. Removes rows with NaN values present.

    Input: Pandas dataframe
    Output: Pandas dataframe

    """
    has_nan = dataframe.isna().any().any()
    nan_rows = dataframe[dataframe.isna().any(axis=1)]

    if has_nan:
        print('Dataframe has {} rows with NaN values!'.format(len(nan_rows)))
    else:
        print("DataFrame does not have any NaN values.")

    #Drop rows with NaN's
    dataframe = dataframe.dropna()
    print('Dataframe now has {} rows.'.format(len(dataframe)))

    return dataframe

In [9]:
#TEST 2-3

import unittest

class TestForNans(unittest.TestCase):

    def test_input_Nans(self):
        df['another_column'] = pd.DataFrame([np.nan for i in range(len(df))])
        
        assert check_input_NANs(df).isna().any().any() == False
            
suite = unittest.TestLoader().loadTestsFromTestCase(TestForNans)
_ = unittest.TextTestRunner().run(suite)

.

Dataframe has 10000 rows with NaN values!
Dataframe now has 0 rows.



----------------------------------------------------------------------
Ran 1 test in 0.025s

OK


In [10]:
#FUNCTION 2-4

def verify_protein_pairs(dataframe):
    """
    Checks that input data has two protein sequences. Will need to generalize this function other data sets 
    to simply make sure two sequences are entered. Code below is for our protein database
    """
    assert 'meso_ogt' in dataframe, 'Dataframe missing mesophillic sequence!'
    assert 'thermo_ogt' in dataframe, 'Dataframe missing thermophillic sequence!'
    
    #ensure every entry has a thermophillic and mesophillic sequence 
    if len(dataframe['meso_ogt']) != len(dataframe['thermo_ogt']):
        raise ValueError
    else:
        print('OK!')

In [11]:
verify_protein_pairs(df)

OK!


In [13]:
#TEST 2-4 
import unittest

class TestProteinPairs(unittest.TestCase):
    
    def test_protein_pair(self):
        
        try:
            verify_protein_pairs(df.drop(columns = 'meso_ogt'))
            self.assertTrue(False)
        except AssertionError:
            self.assertTrue(True)
        
        
        #need to figure out how to write this second test correctly
        try:
            verify_protein_pairs(df.iloc[:-1, 0].values)
            self.assertTrue(False)
        except ValueError:
            self.assertTrue(True)
            
suite = unittest.TestLoader().loadTestsFromTestCase(TestProteinPairs)
_ = unittest.TextTestRunner().run(suite)

  assert 'meso_ogt' in dataframe, 'Dataframe missing mesophillic sequence!'
F
FAIL: test_protein_pair (__main__.TestProteinPairs)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-13-40856f54f551>", line 17, in test_protein_pair
    verify_protein_pairs(df.iloc[:-1, 0].values)
  File "<ipython-input-10-2e0db47c4721>", line 8, in verify_protein_pairs
    assert 'meso_ogt' in dataframe, 'Dataframe missing mesophillic sequence!'
AssertionError: Dataframe missing mesophillic sequence!

----------------------------------------------------------------------
Ran 1 test in 0.005s

FAILED (failures=1)


***
***

**Subcomponent 3**: Train the model with sample data. (**NOT TESTED WITH CODE**)

**Use case**:

```
def train_model(dataframe):
    import scipy, numpy
    Split data into dev and test (0.8/0.2 for now)
    Train model (KNN Linear Regression for now)
    Output: Print('Training successful!')
```

**Test**: 

1) assert len(dataframe)*0.8 == len(dev_data)

***
***

**Subcomponent 4**: Test the model with sample data. (**NOT TESTED WITH CODE**)

**Use case**:

```
def test_model(dataframe):
    Runs data through model (linear regression (KNN?)
    Output: Returns model_score, confusion matrix, MSE
```

**Test**: 

***
***

**Subcomponent 5**: Run confidence test on model output. (**NOT TESTED WITH CODE**)

**Use case:**

```
def check_model_confidence(model_score, ci_data):
    Runs a statistical test on model output and compares it to sample
    Output: Returns a confidence score along with the model score
```

**Test**: 
1) Run confidence test on some data for which we know the confidence score assert that the score is correct using numpy.isclose( )

***
***

**Subcomponent 6**: Calculate a 'functionality' metric that is the ultimate output of component five. This will factor in information from multiple software, not just Pfam. This will be built during spring quarter. (**NOT TESTED WITH CODE**)

**Use case:** We need to test that our protein pairs have a near maximal functionality score! This can be used as a basis for eventual user input scores.

```
def calculate_functionality(model_score, dataframe):
    runs user input data through some mathematical manipulation of their model score and input data
    Output: returns a functionality score, print statement categorizing functionality score
```

**Test**: 

***
***

# Plan Outline

1. Get data from component 4. This should already be in a pandas dataframe (data prep is included in C4)
2. Clean the data to prepare it for model training and testing
3. Train and test the model, return scores, MSE, and any other necessary indicator of model performance
4. Run confidence test on model output to determine quality of output
5. Input new user data and return a functionality score for the input protein pair