This notebook uses iFeatureOmega, a feature generation software, to add to our feature space for a RandomForestClassifier that predicts protein pair functionality.

To do:

1) Write unit tests for iFeatureOmega
2) time trial for different descriptors - DONE
3) Figure out how to append meso and thermo descriptors
4) Make sure protein length is in training
5) try changing subject_align_len to subject_align_len/m_protein_len
6) log(ratio) for proteins approacing zero or infinity
    take distributions to assess for this.
    Update -- taking log and then droping infinites and NaN improves score to 0.86 <-- cannot repeat this

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.ensemble
import sklearn.feature_selection
import unittest
import iFeatureOmegaCLI
import Bio.SeqIO
import Bio.SeqRecord
import io
from io import StringIO
import time

In [2]:
def get_fasta_from_dataframe(dataframe, output_file_a, output_file_b):
    #adjust this to write function with BioPython
    #separate functions for each of the input sequences
    #in training, seq_a = meso and seq_b = thermo
    
    
    #meso sequence to fasta
    with open(output_file_a, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), row['m_protein_seq']))
    
    #thermo sequence to fasta
    with open(output_file_b, 'w') as f:
        for _, row in df.iterrows():
            f.write('>{}\n{}\n'.format((row['prot_pair_index']), (row['t_protein_seq'])))
   
    #return output files
    return [output_file_a, output_file_b]

In [3]:
def get_protein_descriptors(fasta_file, descriptors=[]):
    
    """
    Generates features from a protein sequence

    Parameters
    ----------
    Fasta file with amino acid sequences.

    Returns
    -------
    Vector of descriptors
    """
    
    #create iProtein object
    protein = iFeatureOmegaCLI.iProtein(fasta_file)
    
    #not sure why we need this yet. Right now it is stored in local directory.
    params = protein.import_parameters('protein_parameters.json')
    
    protein_descriptors = {}
    
    for descriptor in descriptors:
        protein.get_descriptor(descriptor)
        protein_descriptors.update({f'{descriptor}':protein.encodings})
        
    return protein_descriptors

In [4]:
def create_new_dataframe(dataframe, output_files, descriptors=[]):
    """
    Creates new dataframe with descriptors added.

    Parameters
    ----------
    Pandas dataframe, list of descriptors as strings, output file name.

    Returns
    -------
    Dataframe including vector(s) of descriptors
    """

    fasta_files = get_fasta_from_dataframe(dataframe, output_files[0], output_files[1])
    
    def compute_descriptor_ratio(fasta_files, descriptors=[]):
        """
        Generates dictionary of descriptors for each of the two input sequences.
        Computes the difference between each instance of a descriptor.
        
        Parameters
        ----------
        List of two fasta files (str) and list of descriptors (str).

        Returns
        -------
        Dictionary with difference between descriptors for each of the 
        input sequences.
        """
        desc_a = get_protein_descriptors(fasta_files[0], descriptors)
        desc_b = get_protein_descriptors(fasta_files[1], descriptors)

        feature_dict = {}

        for key in desc_a:
            
            if 'AAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            elif 'GAAC' in key:
                feature_dict[key] = desc_a[key] - desc_b[key]
            else:
                feature_dict[key] = desc_a[key] / desc_b[key]

        return feature_dict
    
    feature_dict = compute_descriptor_ratio(fasta_files, descriptors)


    df = dataframe.reset_index()

    for desc in descriptors:

        feature_dict[desc].index = feature_dict[desc].index.astype(int)
        features = feature_dict[desc].reset_index()

        df = pd.merge(
            df,
            features,
            how='outer',
            left_index=True,
            right_index=True)

    return df

In [5]:
#iFeature properties

# protein = iFeatureOmegaCLI.iProtein('meso_50k.fasta')
# protein.display_feature_types()

In [6]:
cd /Users/loganroberts/Learn2Therm/ValidProt/FAFSA

/Users/loganroberts/Learn2Therm/ValidProt/FAFSA


In [7]:
#convert to pandas df
df = pd.read_csv('learn2therm_sample_50k.csv')
df.columns

Index(['Unnamed: 0', 'local_gap_compressed_percent_id',
       'scaled_local_query_percent_id', 'scaled_local_symmetric_percent_id',
       'query_align_len', 'query_align_cov', 'subject_align_len',
       'subject_align_cov', 'bit_score', 'thermo_index', 'meso_index',
       'prot_pair_index', 'meso_protein_int_index', 'thermo_protein_int_index',
       'taxa_pair_index', 'local_gap_compressed_percent_id_16s',
       'scaled_local_query_percent_id_16s',
       'scaled_local_symmetric_percent_id_16s', 'query_align_cov_16s',
       'subject_align_cov_16s', 'bit_score_16s', 'm_ogt', 't_ogt',
       'ogt_difference', 'm_protein_seq', 't_protein_seq', 'm_protein_desc',
       't_protein_desc', 'm_protein_len', 't_protein_len'],
      dtype='object')

In [8]:
cd /Users/loganroberts/Learn2Therm/ValidProt/notebooks

/Users/loganroberts/Learn2Therm/ValidProt/notebooks


In [42]:
target = pd.read_csv('protein_match_6k.csv')

In [43]:
target

Unnamed: 0,prot_pair_index,Jaccard_Score,protein_match
0,48641291,1.0,Yes
1,92992745,1.0,Yes
2,157628663,1.0,Yes
3,136708305,1.0,Yes
4,133672542,1.0,Yes
...,...,...,...
6343,55489429,0.5,Yes
6344,172293605,1.0,Yes
6345,47082975,1.0,Yes
6346,154513027,1.0,Yes


In [10]:
from sklearn.utils import resample

# Assuming your data is in a pandas DataFrame called 'data'
# Separate the majority and minority classes
majority_class = target[target['protein_match'] == 'Yes']
minority_class = target[target['protein_match'] == 'No']

# Undersample the majority class to match the number of minority class samples
n_samples = len(minority_class)
undersampled_majority = resample(majority_class, n_samples=n_samples, replace=False)

# Combine the undersampled majority class with the minority class
balanced_data = pd.concat([undersampled_majority, minority_class])

In [11]:
balanced_data

Unnamed: 0,prot_pair_index,Jaccard_Score,protein_match
1439,172839287,1.00,Yes
6091,150701143,1.00,Yes
2754,145276195,1.00,Yes
4997,87951884,1.00,Yes
3610,137382406,1.00,Yes
...,...,...,...
6328,125335561,0.25,No
6329,105441342,0.20,No
6336,87888523,0.20,No
6339,38582871,0.20,No


In [12]:
df = pd.merge(df, balanced_data, on=['prot_pair_index'])
df.shape

(2210, 32)

In [13]:
"""
this list comes from a combination of reading through the features and determining which might be useful
and timing some of the feature generations. those that took more than 30ish seconds were eliminated
Also removed those that have really high dimensionality (>4000)
"""

feature_list = ['AAC', 'GAAC', 'DistancePair',
               'CTDC', 'CTDT', 'CTDD', 'CTriad', 'GDPC type 1', 'GDPC type 2',
                'CKSAAGP type 1', 'CKSAAGP type 2', 'PseKRAAC type 2', 'PseKRAAC type 3A','PseKRAAC type 7',
                'PseKRAAC type 9', 'Geary','APAAC', 'QSOrder']

In [14]:
# df = create_new_dataframe(df, 'seq_50k.fasta', descriptors= ['EGAAC', 'CKSAAGP type 1', 'CKSAAGP type 2',
#                                                             'GDPC type 1', 'GDPC type 2', 'TPC type 1', 'TPC type 2',
#                                                             'GTPC type 1', 'GTPC type 2', 'DPC type 1', 'DPC type 2',
#                                                             'Moran', 'Geary', 'NMBroto', 'AC', 'CC', 'ACC',
#                                                             'PAAC', 'APAAC'])

In [15]:
df = create_new_dataframe(df, ['seq_50k_a.fasta', 'seq_50k_b.fasta'], descriptors=['AAC', 'GAAC', 'QSOrder'])

File imported successfully.
File imported successfully.


  df = pd.merge(


Split data into dev and test, and then split that into train and validation.

In [16]:
#drop columns that don't exihibit signficant pearson correlation with bit_score

df = df.drop(columns = ['meso_index', 'meso_protein_int_index', 'local_gap_compressed_percent_id_16s', 
                        'scaled_local_query_percent_id_16s', 'scaled_local_symmetric_percent_id_16s',
                       'bit_score_16s', 'm_ogt', 't_ogt', 'taxa_pair_index', 'thermo_protein_int_index'
                       , 'prot_pair_index', 'ogt_difference', 'Jaccard_Score',
                       'query_align_cov_16s', 'subject_align_cov_16s',
                       'Unnamed: 0', 'thermo_index','m_protein_seq', 't_protein_seq', 
                       'm_protein_desc', 't_protein_desc'])

In [17]:
df=df.drop(columns=['index_y', 'index_x'])
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,QSOrder_Grantham.Xr.T,QSOrder_Grantham.Xr.W,QSOrder_Grantham.Xr.Y,QSOrder_Grantham.Xr.V,QSOrder_Schneider.Xd.1,QSOrder_Schneider.Xd.2,QSOrder_Schneider.Xd.3,QSOrder_Grantham.Xd.1,QSOrder_Grantham.Xd.2,QSOrder_Grantham.Xd.3
0,0.283582,0.231707,0.240506,133,0.810976,134,0.881579,99,152,164,...,1.050613,inf,2.101227,0.525307,0.935529,0.992816,1.059350,0.948405,1.036564,1.020658
1,0.268482,0.190083,0.189300,248,0.683196,271,0.740437,132,366,363,...,1.094398,0.926029,0.771691,0.956897,0.996233,0.932948,1.078083,1.006552,0.982711,1.012484
2,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,355,387,...,1.368059,inf,0.619853,0.879467,0.996096,1.021678,0.951924,1.040048,0.978314,0.983529
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.840951,1.324497,0.378428,1.000131,1.035213,0.999638,1.027001,0.972217,1.003861,1.025202
4,0.394495,0.346774,0.369099,108,0.870968,106,0.972477,130,109,124,...,1.221476,0.000000,,0.684027,1.034483,0.907769,0.943659,1.014137,0.917208,1.071018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,0.241758,0.134557,0.144975,181,0.553517,190,0.678571,107,280,327,...,0.474426,1.217278,0.765146,0.753927,1.103433,0.968933,0.915878,1.012618,1.001902,0.985148
2206,0.321429,0.186391,0.223801,197,0.582840,192,0.853333,195,225,338,...,1.443473,3.127526,0.586411,1.066202,1.021185,0.915617,0.976575,1.058108,0.931608,1.007156
2207,0.290909,0.162437,0.154589,110,0.558376,112,0.516129,85,217,197,...,1.466560,0.879936,1.173248,1.613216,0.931433,1.004820,1.103878,0.981874,0.948476,1.068755
2208,0.447368,0.387833,0.373626,229,0.870722,233,0.823322,448,283,263,...,0.607808,3.781916,1.575798,1.094765,1.041988,1.014742,0.958501,1.068293,0.961154,0.973389


In [18]:
df['t_protein_len'].describe()

count    2210.000000
mean      263.893665
std        67.658484
min        46.000000
25%       228.000000
50%       260.000000
75%       313.000000
max       400.000000
Name: t_protein_len, dtype: float64

In [19]:
#maybe divide all of the AAC/GAAC columns by the protein length 

In [20]:
df.shape

(2210, 82)

In [21]:
df = df.replace([np.inf, -np.inf], np.nan)

In [22]:
nan_counts = df.isna().sum()
print(nan_counts)
nan_counts.unique()

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
                                    ..
QSOrder_Schneider.Xd.2               0
QSOrder_Schneider.Xd.3               0
QSOrder_Grantham.Xd.1                0
QSOrder_Grantham.Xd.2                0
QSOrder_Grantham.Xd.3                0
Length: 82, dtype: int64


array([  0,  51, 462,  23,   1,  34,   3,  52,  12,   2, 379,  44])

In [72]:
df.columns[36:]

Index(['QSOrder_Schneider.Xr.A', 'QSOrder_Schneider.Xr.R',
       'QSOrder_Schneider.Xr.N', 'QSOrder_Schneider.Xr.D',
       'QSOrder_Schneider.Xr.C', 'QSOrder_Schneider.Xr.Q',
       'QSOrder_Schneider.Xr.E', 'QSOrder_Schneider.Xr.G',
       'QSOrder_Schneider.Xr.H', 'QSOrder_Schneider.Xr.I',
       'QSOrder_Schneider.Xr.L', 'QSOrder_Schneider.Xr.K',
       'QSOrder_Schneider.Xr.M', 'QSOrder_Schneider.Xr.F',
       'QSOrder_Schneider.Xr.P', 'QSOrder_Schneider.Xr.S',
       'QSOrder_Schneider.Xr.T', 'QSOrder_Schneider.Xr.W',
       'QSOrder_Schneider.Xr.Y', 'QSOrder_Schneider.Xr.V',
       'QSOrder_Grantham.Xr.A', 'QSOrder_Grantham.Xr.R',
       'QSOrder_Grantham.Xr.N', 'QSOrder_Grantham.Xr.D',
       'QSOrder_Grantham.Xr.C', 'QSOrder_Grantham.Xr.Q',
       'QSOrder_Grantham.Xr.E', 'QSOrder_Grantham.Xr.G',
       'QSOrder_Grantham.Xr.H', 'QSOrder_Grantham.Xr.I',
       'QSOrder_Grantham.Xr.L', 'QSOrder_Grantham.Xr.K',
       'QSOrder_Grantham.Xr.M', 'QSOrder_Grantham.Xr.F',
       'QSO

In [110]:
log_cols = df.columns[36:]
df[log_cols] = df[log_cols].apply(lambda x: np.log(x))

In [23]:
df = df.dropna(axis=1, how='any')

In [24]:
nan_counts = df.isna().sum()
nan_counts

local_gap_compressed_percent_id      0
scaled_local_query_percent_id        0
scaled_local_symmetric_percent_id    0
query_align_len                      0
query_align_cov                      0
subject_align_len                    0
subject_align_cov                    0
bit_score                            0
m_protein_len                        0
t_protein_len                        0
protein_match                        0
AAC_A                                0
AAC_C                                0
AAC_D                                0
AAC_E                                0
AAC_F                                0
AAC_G                                0
AAC_H                                0
AAC_I                                0
AAC_K                                0
AAC_L                                0
AAC_M                                0
AAC_N                                0
AAC_P                                0
AAC_Q                                0
AAC_R                    

In [25]:
df.shape

(2210, 56)

Use MRMR to select for the best features. Going to start by grouping into different categories of features generated from iFeature Omega.

In [26]:
#use MRMR to select for the best features from PseKRAAC
df_subset = df.loc[:, df.columns != 'protein_match']
print(type(df_subset))

# select top 10 features using mRMR
from mrmr import mrmr_classif
selected_features = mrmr_classif(X=df_subset.iloc[:,10:], y=df['protein_match'], K=20)

selected_features

<class 'pandas.core.frame.DataFrame'>


100%|██████████| 20/20 [00:00<00:00, 48.64it/s]


['QSOrder_Schneider.Xr.A',
 'QSOrder_Grantham.Xd.3',
 'QSOrder_Grantham.Xr.R',
 'QSOrder_Schneider.Xd.1',
 'AAC_A',
 'QSOrder_Schneider.Xd.3',
 'QSOrder_Grantham.Xr.A',
 'AAC_I',
 'QSOrder_Schneider.Xr.R',
 'AAC_S',
 'GAAC_alphatic',
 'AAC_E',
 'QSOrder_Grantham.Xd.2',
 'AAC_N',
 'GAAC_negativecharge',
 'QSOrder_Schneider.Xr.L',
 'QSOrder_Schneider.Xr.G',
 'AAC_T',
 'GAAC_uncharge',
 'AAC_R']

In [34]:
best_features_df = df[[feature for feature in selected_features]]

#concatenates original feature vector back into the dataframe
df = pd.concat([df.iloc[:, :11], best_features_df], axis=1)
df

Unnamed: 0,local_gap_compressed_percent_id,scaled_local_query_percent_id,scaled_local_symmetric_percent_id,query_align_len,query_align_cov,subject_align_len,subject_align_cov,bit_score,m_protein_len,t_protein_len,...,GAAC_alphatic,AAC_E,QSOrder_Grantham.Xd.2,AAC_N,GAAC_negativecharge,QSOrder_Schneider.Xr.L,QSOrder_Schneider.Xr.G,AAC_T,GAAC_uncharge,AAC_R
0,0.283582,0.231707,0.240506,133,0.810976,134,0.881579,99,152,164,...,-0.015083,-0.013479,1.036564,0.006579,0.004974,1.329084,0.701461,0.003851,0.059050,-0.034981
1,0.268482,0.190083,0.189300,248,0.683196,271,0.740437,132,366,363,...,0.078422,-0.030461,0.982711,0.002394,-0.006345,0.920360,1.465759,0.010432,-0.035113,-0.030800
2,0.265306,0.201550,0.210243,320,0.826873,294,0.828169,140,355,387,...,0.030913,-0.020017,0.978314,0.011479,-0.021691,0.886583,1.065708,0.018277,0.057837,-0.010125
3,0.275281,0.182156,0.152174,185,0.687732,197,0.525333,48,375,269,...,0.064287,-0.009864,1.003861,0.005898,0.011143,1.139824,0.931587,-0.013581,-0.068788,0.016882
4,0.394495,0.346774,0.369099,108,0.870968,106,0.972477,130,109,124,...,-0.028781,-0.030334,0.917208,0.000000,-0.007547,1.961181,1.123222,0.013613,0.063332,-0.031444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2205,0.241758,0.134557,0.144975,181,0.553517,190,0.678571,107,280,327,...,0.049945,0.007678,1.001902,0.024509,0.004642,0.949185,1.416965,-0.051409,-0.086435,-0.043283
2206,0.321429,0.186391,0.223801,197,0.582840,192,0.853333,195,225,338,...,0.082104,-0.038396,0.931608,0.002998,-0.030901,1.624165,1.373086,0.014872,-0.030743,-0.005786
2207,0.290909,0.162437,0.154589,110,0.558376,112,0.516129,85,217,197,...,-0.074037,0.016959,0.948476,0.008281,0.038995,0.890999,0.712799,0.015626,0.022831,0.000327
2208,0.447368,0.387833,0.373626,229,0.870722,233,0.823322,448,283,263,...,-0.037660,-0.007833,0.961154,-0.009217,0.006610,0.900462,0.840163,-0.021430,0.020557,0.004958


In [27]:
#choosing 80/20 split instead of 85/15 because of volume of data

dev, test = sklearn.model_selection.train_test_split(df, test_size=0.15, random_state=1)

train, val = sklearn.model_selection.train_test_split(dev, test_size=0.15, random_state=1)

print(dev.shape)
print(test.shape)
print(train.shape)
print(val.shape)

(1878, 56)
(332, 56)
(1596, 56)
(282, 56)


In [35]:
#ID target and features, separate into separate arrays

target = 'protein_match'
input_features = [columns for columns in df]
input_features.remove(target)

In [36]:
#split X and y

dev_X = dev[input_features].values
test_X = test[input_features].values

dev_y = dev[target].values.reshape(-1,1)
test_y = test[target].values.reshape(-1,1)  

print(dev_X.shape, test_X.shape, dev_y.shape, test_y.shape)

(1878, 30) (332, 30) (1878, 1) (332, 1)


In [37]:
#same thing for training and validation data

train_X = train[input_features].values
val_X = val[input_features].values

train_y = train[target].values.reshape(-1,1)
val_y = val[target].values.reshape(-1,1) 

Scale the data

In [38]:
scaler = sklearn.preprocessing.StandardScaler()
dev_X = scaler.fit_transform(dev_X)
test_X = scaler.fit_transform(test_X)
train_X = scaler.fit_transform(train_X)
val_X = scaler.fit_transform(val_X)

Train the model

In [39]:
#Random Forest

#hyperparameters determiend with optuna
model = sklearn.ensemble.RandomForestClassifier(n_estimators=150, max_depth=None, max_samples=0.5,
                                                max_features=0.5, min_weight_fraction_leaf=0.000215,
                                               min_samples_split=10)

model.fit(train_X, train_y.ravel())

Test the model, report relevant statistics

In [40]:
score = model.score(val_X, val_y)
print('Model score is: {}'.format(score))

preds = model.predict(test_X)
print(preds)

Model score is: 0.6843971631205674
['Yes' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes'
 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'No'
 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes'
 'Yes' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'Yes'
 'No' 'Yes' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes'
 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No'
 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'Yes'
 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes'
 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'Yes'
 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes'
 'Yes' 'No' 'Yes' 'Yes' 

In [38]:
#confusion matrix

confusion_matrix = sklearn.metrics.confusion_matrix(preds, test_y)
sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix).plot()

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x7fc61fa2c070>