This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

To do:

1) Write unit tests for iFeatureOmega
2) time trial for different descriptors - DONE
3) Figure out how to append meso and thermo descriptors
4) Make sure protein length is in training
5) try changing subject_align_len to subject_align_len/m_protein_len
6) log(ratio) for proteins approacing zero or infinity
    take distributions to assess for this.
    Update -- taking log and then droping infinites and NaN improves score to 0.86 <-- cannot repeat this

In [39]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [40]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [41]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [42]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    def compute_descriptor_ratio(fasta_files, descriptors=[]):
        """
        Generates dictionary of descriptors for each of the two input sequences.
        Computes the difference between each instance of a descriptor.
        
        Parameters
        ----------
        List of two fasta files (str) and list of descriptors (str).

        Returns
        -------
        Dictionary with difference between descriptors for each of the 
        input sequences.
        """
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}

        for key in desc_a:
            
            if 'AAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            elif 'GAAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            else:
                feature_dict[key] = desc_a[key] / desc_b[key]

        return feature_dict
    
    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()

        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    return df

In [43]:
#iFeature properties

# protein = iFeatureOmegaCLI.iProtein('meso_50k.fasta')
# protein.display_feature_types()

In [44]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [45]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [46]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [47]:
target = pd.read_csv('protein_match_6k.csv')

In [48]:
df = pd.merge(df, target, on=['prot_pair_index'])
df.shape

(6348, 32)

In [49]:
"""
this list comes from a combination of reading through the features and determining which might be useful
and timing some of the feature generations. those that took more than 30ish seconds were eliminated
Also removed those that have really high dimensionality (>4000)
"""

feature_list = ['AAC', 'GAAC', 'DistancePair',
               'CTDC', 'CTDT', 'CTDD', 'CTriad', 'GDPC type 1', 'GDPC type 2',
                'CKSAAGP type 1', 'CKSAAGP type 2', 'PseKRAAC type 2', 'PseKRAAC type 3A','PseKRAAC type 7',
                'PseKRAAC type 9', 'Geary','APAAC', 'QSOrder']

In [50]:
# df = create_new_dataframe(df, 'seq_50k.fasta', descriptors= ['EGAAC', 'CKSAAGP type 1', 'CKSAAGP type 2',
#                                                             'GDPC type 1', 'GDPC type 2', 'TPC type 1', 'TPC type 2',
#                                                             'GTPC type 1', 'GTPC type 2', 'DPC type 1', 'DPC type 2',
#                                                             'Moran', 'Geary', 'NMBroto', 'AC', 'CC', 'ACC',
#                                                             'PAAC', 'APAAC'])

In [51]:
df = create_new_dataframe(df, ['seq_50k_a.fasta', 'seq_50k_b.fasta'], descriptors=['AAC', 'GAAC', 'QSOrder'])

File imported successfully.
File imported successfully.


Split data into dev and test, and then split that into train and validation.

In [52]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0', 'thermo_index','m_protein_seq', 't_protein_seq', 
                       'm_protein_desc', 't_protein_desc'])

In [53]:
df=df.drop(columns=['index_y', 'index_x'])
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,206,202,...,0.995501,0.995501,0.746626,2.433447,0.942154,1.023105,1.047587,0.937454,0.974866,1.104177
1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,233,237,...,0.730477,0.996105,0.996105,1.261733,0.980941,1.056692,0.967775,0.910832,1.147138,0.966570
2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,287,252,...,0.888613,1.332920,0.493674,1.127855,1.046793,1.005878,1.003981,1.034618,1.025785,0.938530
3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,234,269,...,0.849359,2.807602,0.656323,1.323584,0.950250,1.057052,0.957343,0.921186,1.106142,0.987225
4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,80,66,...,0.797894,2.393681,1.329823,1.861752,0.913500,1.051214,1.066765,0.976156,1.111363,0.908320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,370,356,...,0.929210,0.655913,1.115052,1.229837,1.043239,0.997041,0.961075,1.004553,1.029910,0.965934
6344,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,298,260,...,0.833497,0.983990,1.062709,0.695821,1.074279,0.985104,0.969361,1.053592,0.951543,1.000276
6345,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,306,303,...,1.695522,0.726652,0.322957,0.523713,0.946281,0.992210,1.060345,0.938189,1.001573,1.063319
6346,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,78,78,...,0.375423,,1.126270,1.126270,1.073105,1.012337,0.921517,1.023642,0.964341,1.017542


In [74]:
df['QSOrder_Grantham.Xr.W'].describe()

count    6044.000000
mean             NaN
std              NaN
min             -inf
25%        -0.730724
50%        -0.053068
75%         0.551596
max              inf
Name: QSOrder_Grantham.Xr.W, dtype: float64

In [75]:
df.shape

(6348, 82)

In [76]:
df = df.replace([np.inf, -np.inf], np.nan)

In [77]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
                                    ..
QSOrder_Schneider.Xd.2               0
QSOrder_Schneider.Xd.3               0
QSOrder_Grantham.Xd.1                0
QSOrder_Grantham.Xd.2                0
QSOrder_Grantham.Xd.3                0
Length: 82, dtype: int64


array([   0,    2,    9,  273, 2373,   91,   13,    1,  243,   21,    3,
        279,   86,   16,    7, 1674])

In [72]:
df.columns[36:]

Index(['QSOrder_Schneider.Xr.A', 'QSOrder_Schneider.Xr.R',
       'QSOrder_Schneider.Xr.N', 'QSOrder_Schneider.Xr.D',
       'QSOrder_Schneider.Xr.C', 'QSOrder_Schneider.Xr.Q',
       'QSOrder_Schneider.Xr.E', 'QSOrder_Schneider.Xr.G',
       'QSOrder_Schneider.Xr.H', 'QSOrder_Schneider.Xr.I',
       'QSOrder_Schneider.Xr.L', 'QSOrder_Schneider.Xr.K',
       'QSOrder_Schneider.Xr.M', 'QSOrder_Schneider.Xr.F',
       'QSOrder_Schneider.Xr.P', 'QSOrder_Schneider.Xr.S',
       'QSOrder_Schneider.Xr.T', 'QSOrder_Schneider.Xr.W',
       'QSOrder_Schneider.Xr.Y', 'QSOrder_Schneider.Xr.V',
       'QSOrder_Grantham.Xr.A', 'QSOrder_Grantham.Xr.R',
       'QSOrder_Grantham.Xr.N', 'QSOrder_Grantham.Xr.D',
       'QSOrder_Grantham.Xr.C', 'QSOrder_Grantham.Xr.Q',
       'QSOrder_Grantham.Xr.E', 'QSOrder_Grantham.Xr.G',
       'QSOrder_Grantham.Xr.H', 'QSOrder_Grantham.Xr.I',
       'QSOrder_Grantham.Xr.L', 'QSOrder_Grantham.Xr.K',
       'QSOrder_Grantham.Xr.M', 'QSOrder_Grantham.Xr.F',
       'QSO

In [73]:
log_cols = df.columns[36:]
df[log_cols] = df[log_cols].apply(lambda x: np.log(x))

In [78]:
df = df.dropna(axis=1, how='any')

In [79]:
nan_counts = df.isna().sum()
nan_counts

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
AAC_A                                0
AAC_C                                0
AAC_D                                0
AAC_E                                0
AAC_F                                0
AAC_G                                0
AAC_H                                0
AAC_I                                0
AAC_K                                0
AAC_L                                0
AAC_M                                0
AAC_N                                0
AAC_P                                0
AAC_Q                                0
AAC_R                    

In [80]:
df.shape

(6348, 42)

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [81]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 20/20 [00:00<00:00, 45.33it/s]


['AAC_A',
 'QSOrder_Schneider.Xd.3',
 'QSOrder_Schneider.Xd.1',
 'AAC_L',
 'AAC_P',
 'GAAC_alphatic',
 'AAC_R',
 'QSOrder_Grantham.Xd.2',
 'GAAC_aromatic',
 'AAC_T',
 'AAC_K',
 'AAC_Q',
 'AAC_I',
 'QSOrder_Grantham.Xd.1',
 'AAC_S',
 'AAC_N',
 'AAC_F',
 'AAC_D',
 'GAAC_uncharge',
 'QSOrder_Schneider.Xd.2']

In [82]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,AAC_K,AAC_Q,AAC_I,QSOrder_Grantham.Xd.1,AAC_S,AAC_N,AAC_F,AAC_D,GAAC_uncharge,QSOrder_Schneider.Xd.2
0,0.287582,0.217822,0.215686,160,0.792079,152,0.737864,131,206,202,...,-0.029991,-0.020571,-0.029991,-0.064588,-0.015524,-0.019898,-0.015140,0.013410,-0.033452,0.022842
1,0.319635,0.295359,0.297872,218,0.919831,226,0.969957,282,233,237,...,-0.007497,0.021821,-0.016008,-0.093397,0.004944,-0.016443,0.008946,0.035711,-0.017982,0.055143
2,0.279621,0.234127,0.218924,211,0.837302,210,0.731707,96,287,252,...,-0.017325,0.002033,-0.035134,0.034032,-0.067847,-0.018777,-0.004355,0.027003,-0.114015,0.005861
3,0.327273,0.200743,0.214712,166,0.617100,163,0.696581,175,234,269,...,0.008722,-0.000381,-0.018238,-0.082094,0.003511,-0.006323,0.010946,-0.016919,-0.020144,0.055484
4,0.338710,0.318182,0.287671,60,0.909091,71,0.887500,61,80,66,...,-0.032955,0.022348,0.001894,-0.024133,0.037500,0.007197,-0.066667,-0.005303,0.051136,0.049946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6343,0.387574,0.367978,0.360882,336,0.943820,337,0.910811,588,370,356,...,0.002855,0.030626,-0.015958,0.004543,-0.001488,-0.007319,-0.015639,0.028135,0.023504,-0.002964
6344,0.488462,0.488462,0.455197,258,0.992308,263,0.882550,619,298,260,...,-0.001962,0.004750,-0.005885,0.052205,0.010403,-0.000077,0.011461,0.001652,0.007073,-0.015008
6345,0.383838,0.376238,0.374384,296,0.976898,299,0.977124,436,306,303,...,0.009707,-0.003494,0.035462,-0.063804,0.005986,0.022811,-0.000356,-0.000453,0.040866,-0.007821
6346,0.769231,0.769231,0.769231,78,1.000000,78,1.000000,258,78,78,...,0.025641,0.000000,0.000000,0.023367,0.051282,0.000000,-0.012821,-0.012821,-0.038462,0.012262


In [83]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(5395, 31)
(953, 31)
(4585, 31)
(810, 31)


In [84]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [85]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(5395, 30) (953, 30) (5395, 1) (953, 1)


In [86]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [87]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [88]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [89]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.8098765432098766
['Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'No'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'Yes' 'Yes' 'Yes' '

In [38]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fc61fa2c070>