In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from helper_functions_pipe_testing import *
from sklearn.metrics import  f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import *


#Instructions for the pipeline
Requires two inputs for training:
    - Mass spec data with corresponding NP surface characteristics and experimental conditions (time, concentration)
    - NetsurfP and Biopython data that has been precalculated
    - X characteristics to predict

pipeline
Take mass spec spreadsheet
Accession,Enrichment,Dh,TEM,Zp,BET,Composition,Ligand,Shape,IncubationTime,IncubationConcentration
Merge with Proteome data to get file that has
Accession,Enrichment,Dh,TEM,Zp,BET,Composition,Ligand,Shape,IncubationTime,IncubationConcentration,Mass,Length,Sequence
Calculate protein features using biopython
Merge with NSP data to get all protein features

Split into X and Y dataset with Entries as labels

In [30]:
### New Data workup for RFG

# Pull together Proteomic data
in_dir="Input_data/Proteomic data/abundance/"
all_file='Input_data/Proteomic data/abundance/Intensity _all20230202.xlsx'
#combine Mass Spec data input into one excel spreadsheet - Entry - Abundance labeled by NP Unique ID
#Abundance as a percent
#take files in_dir and combine then into one pandas df (raw_MS_data)
# files = os.listdir(in_dir)
# for i,f in enumerate(files):
#     if i==0:
#         raw_MS_data=pd.read_excel(in_dir+f,header=0)
#     else:
#         temp = pd.read_excel(in_dir+f,header=0)
#         raw_MS_data=raw_MS_data.merge(temp,how='outer',on='Entry')
raw_MS_data=pd.read_excel(all_file,header=0)
# melt the df to make it an accession number, NPUNID, Abundance dataset
raw_MS_data = pd.melt(raw_MS_data, id_vars=['Entry'],var_name='Sample_num', value_name='Abundance')
#remove prots that were added due to merge
raw_MS_data=raw_MS_data.dropna()

In [31]:
raw_MS_data.shape

(10164, 3)

In [32]:

raw_MS_data.shape

(10164, 3)

In [26]:
###Bring in controls (MS data for serums)##
#Controls is the abundance of the proteins in serum that have been observed via proteomics
#look at other sources here for human proteins, but internal sources for BALF and FBS
controls=pd.read_excel('Input_data/Proteomic data/controls_combined.xlsx',header=0)
MS_data_controls = pd.merge(raw_MS_data,controls,how='Left', on='Entry')
###Bring in Uniprot_data,NSPdata and NP data##
uniprot_filepath='Input_data/BioPython_data/Combined_biopyCalcs.xlsx'
uniprot_dat=pd.read_excel(uniprot_filepath,header=0)
NSPfilePath='Input_data/NetSurfP_data/Combined.xlsx'
NSP_data=pd.read_excel(NSPfilePath)
###Bring in NP data and merge to get complete NP dataset###
NP_filepath='Input_data/NPs/NP_Database.xlsx'
NPUNdata=pd.read_excel(NP_filepath,header=0,sheet_name='NPUNID')
NPprop=pd.read_excel(NP_filepath,header=0,sheet_name='NP_Props')
NPdata=pd.merge(NPUNdata,NPprop,how="left",on='NPID')

In [40]:
controls=pd.read_excel('Input_data/Proteomic data/controls_combined.xlsx',header=0)
# MS_data_controls = pd.merge(raw_MS_data,controls, ,on='Entry')
MS_data_controls=pd.concat([raw_MS_data,controls], join='outer')
MS_data_controls.fillna(0, inplace=True)
MS_data_controls.shape


(14880, 4)

In [41]:
#calculate Enrichment
#####MAYBE add binning here to to keep negative results and improve capapbilities######
MS_data_controls['Enrichment']= np.log2(MS_data_controls['Abundance']/MS_data_controls['Abundance_Controls'])
MS_data=MS_data_controls.drop(columns=['Abundance','Abundance_Controls'])
raw_prop_data=pd.merge(MS_data, uniprot_dat, how='left',on='Entry')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [42]:
MS_data_clean = raw_MS_data.copy()
PROT_cleaned_data = normalize_mass_length_1DF(raw_prop_data) #function found in data_prep_functions line 167, normalizes mass, length and mw by dividing all values by the max in the column
Protein_data_complete = pd.merge(PROT_cleaned_data, NSP_data, left_on='Entry', right_on='Entry') #merges netsurfp features and biopython features
#creates new column called asa_sum_normalized which is the asa_sum value divide by the mass of the protein
for df in [Protein_data_complete]:
    for col in ['asa_sum']:
        df[col+'_normalized'] = df[col] / df['Mass']

data_complete= pd.merge(Protein_data_complete,NPdata,how='left', on='Sample_num')
data_complete.drop(columns=['notes','Notes','NPUNID'],inplace=True)
#Optional here to drop all enrichment values that are NA - also have to deal with positive/negative infinity values
# data_complete.dropna(subset=['Enrichment'],inplace=True)

In [None]:
# data_complete.drop(columns='Unnamed: 0',inplace=True)

(2717, 113)

In [45]:
data_complete.fillna(0,inplace=True)
#replace -infintiy and positive infinity with -12 and 12

#####COME BACK AND FIND BETTER SOLUTION!!!##
data_complete= data_complete.replace([-np.inf],'-12')
data_complete=data_complete.replace([np.inf],'12')

In [47]:
data_complete.shape
# data_complete.columns

(16246, 113)

In [49]:
# core_mats=pd.get_dummies(data_complete['Core Material'],prefix='core_mat')
# ligands=pd.get_dummies(data_complete['Ligand'], prefix='Lig')
data_complete2=pd.get_dummies(data_complete, columns=['Core Material', 'Ligand'])
# print(data_complete2.columns)

Index(['Entry', 'Sample_num', 'Enrichment', 'Sequence', 'Length', 'Mass',
       'frac_aa_A', 'frac_aa_C', 'frac_aa_D', 'frac_aa_E',
       ...
       'Core Material_Iron Oxide', 'Core Material_Polystyrene', 'Ligand_0',
       'Ligand_Carboxylate', 'Ligand_Carboxylate BSA', 'Ligand_Citrate',
       'Ligand_PEG 2k', 'Ligand_PEG 5k', 'Ligand_Polyethelyenimine',
       'Ligand_polyvinylpyrrolidone'],
      dtype='object', length=122)


In [52]:
#set labels (what we are trying to predict) as Enrichment column
labels=data_complete2['Enrichment'].copy()
#make it one dimenisional
labels=np.ravel(labels)
#drop qualitative, not neccessary, and label columns
df=data_complete2.drop(['Entry','Sequence','NPID','Enrichment','Protein Source','Sample_num'],axis=1)


# print(df)
#these are left over metrics from the helper functions from landry paper replace with better options for looking at metrics###
first_frame = True #starting dataframe for saving metrics
correctness_frame = pd.DataFrame()
metrics_frame = pd.DataFrame()
print_metrics = 1 #0, doesn't show metrics while runnning for each model, 1 does show metrics


In [13]:
#set labels (what we are trying to predict) as Enrichment column
labels=data_complete['Enrichment'].copy()
#make it one dimenisional
labels=np.ravel(labels)
#drop qualitative, not neccessary, and label columns
df=data_complete.drop(['Entry','Sequence','Core Material','Ligand','NPID','Enrichment'],axis=1)

# print(df)
#these are left over metrics from the helper functions from landry paper replace with better options for looking at metrics###
first_frame = True #starting dataframe for saving metrics
correctness_frame = pd.DataFrame()
metrics_frame = pd.DataFrame()
print_metrics = 1 #0, doesn't show metrics while runnning for each model, 1 does show metrics


array([-2.5320414762967736, -2.100316148783357, -2.1346770999965723, ...,
       1.789022535723233, '-12', '-12'], dtype=object)

In [15]:
df

Unnamed: 0,Length,Mass,frac_aa_A,frac_aa_C,frac_aa_D,frac_aa_E,frac_aa_F,frac_aa_G,frac_aa_H,frac_aa_I,...,Incubation Time (minutes),Temperature,Core Material_E171,Core Material_Iron Oxide,Core Material_P25,Core Material_Polystyrene,Core Material_R101,Ligand_Amine,Ligand_Carboxylate BSA,Ligand_none
0,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,0.028007,0.028007,0.024712,...,30.0,25.0,0,1,0,0,0,0,1,0
1,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,0.028007,0.028007,0.024712,...,30.0,25.0,0,1,0,0,0,0,1,0
2,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,0.028007,0.028007,0.024712,...,30.0,25.0,0,1,0,0,0,0,1,0
3,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,0.028007,0.028007,0.024712,...,30.0,25.0,0,0,0,1,0,1,0,0
4,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,0.028007,0.028007,0.024712,...,30.0,25.0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,0.048544,0.036408,0.048544,...,60.0,25.0,0,0,0,0,1,0,0,1
3078,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,0.048544,0.036408,0.048544,...,60.0,25.0,0,0,1,0,0,0,0,1
3079,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,0.048544,0.036408,0.048544,...,60.0,25.0,0,0,1,0,0,0,0,1
3080,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,0.048544,0.036408,0.048544,...,60.0,25.0,0,0,1,0,0,0,0,1


In [53]:
data_complete

KeyboardInterrupt: 

In [19]:
#Run recursive feature elimination to determine top features to select
#currently selecting 15 although that is arbitrary
### COME UP with better more objective way for determining number of features####
from sklearn.feature_selection import RFE

estimator = RandomForestRegressor(n_estimators=100)
selector = RFE(estimator, n_features_to_select= 15, step=1)
selector = selector.fit(df,labels)
selector.support_

array([ 75,   1,  41,  32,  47,  71,  26,   2,  37,   1,   4,  44,   9,
        23,  65,   1,  58,  76,  55,  10,   1,  64,  84,  14,  19,  43,
         1,  38,  11,   1,   1,  24,  70,  16,  66,  17,  21,  88,  69,
        87,  77,  85,  28,  48,  79,  86,  82,   1,  51,   1,  78,  62,
        40,  74,  57,  30,  73,  33,  34,  18,  39,   8,  53,  45,   6,
        20,  56,  83,   1,  31,  80,  60,  68,  49,  36,   1,  46,  29,
         1,   1,   7,  27,  59,  13,  42,   3,  25,  63,  22,  54,   5,
        52,  67,  12,   1,   1,  15,  35,  61,  81,  97,  93,  72,  94,
        95, 100,  98,  92,  50,  99,  90,  96,  91,  89])

In [20]:
selector.ranking_

['Mass' 'frac_aa_I' 'frac_aa_Q' 'frac_aa_W' 'flexibility_std'
 'flexibility_min' 'flexibility_median' 'rsa_std'
 'fraction_total_exposed_A' 'fraction_total_exposed_Y'
 'fraction_exposed_exposed_H' 'fraction_exposed_exposed_L'
 'fraction_exposed_exposed_M' 'Sample_num' 'Raw_FileID']


In [21]:
feat_list=selector.get_feature_names_out()
print(feat_list)

Unnamed: 0,Mass,frac_aa_I,frac_aa_Q,frac_aa_W,flexibility_std,flexibility_min,flexibility_median,rsa_std,fraction_total_exposed_A,fraction_total_exposed_Y,fraction_exposed_exposed_H,fraction_exposed_exposed_L,fraction_exposed_exposed_M,Sample_num,Raw_FileID
0,69293.0,0.024712,0.032949,0.004942,0.026371,0.936357,1.003321,0.223,0.046129,0.006590,0.035503,0.038462,0.005917,76,67501.0
1,69293.0,0.024712,0.032949,0.004942,0.026371,0.936357,1.003321,0.223,0.046129,0.006590,0.035503,0.038462,0.005917,77,67502.0
2,69293.0,0.024712,0.032949,0.004942,0.026371,0.936357,1.003321,0.223,0.046129,0.006590,0.035503,0.038462,0.005917,78,67503.0
3,69293.0,0.024712,0.032949,0.004942,0.026371,0.936357,1.003321,0.223,0.046129,0.006590,0.035503,0.038462,0.005917,83,71435.0
4,69293.0,0.024712,0.032949,0.004942,0.026371,0.936357,1.003321,0.223,0.046129,0.006590,0.035503,0.038462,0.005917,84,71438.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3077,45823.0,0.048544,0.043689,0.007282,0.028258,0.936714,0.999190,0.258,0.024272,0.002427,0.049107,0.049107,0.013393,59,72280.0
3078,45823.0,0.048544,0.043689,0.007282,0.028258,0.936714,0.999190,0.258,0.024272,0.002427,0.049107,0.049107,0.013393,60,72281.0
3079,45823.0,0.048544,0.043689,0.007282,0.028258,0.936714,0.999190,0.258,0.024272,0.002427,0.049107,0.049107,0.013393,61,72282.0
3080,45823.0,0.048544,0.043689,0.007282,0.028258,0.936714,0.999190,0.258,0.024272,0.002427,0.049107,0.049107,0.013393,62,72283.0


In [None]:
df_rfe=df[feat_list].copy()
df_rfe

0.4570470777946193


In [23]:

x_train, x_test, y_train, y_test = train_test_split(df_rfe,labels, test_size = 0.33, random_state=42)
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=2016)
# for train_index, test_index in sss.split(df, labels):
# x_train = df.iloc[train_index]
# X_test = df.iloc[test_index]
# y_train = labels.iloc[train_index]
# y_test = labels.iloc[test_index]

rfg=RandomForestRegressor(n_estimators=100)
rfg.fit(x_train,y_train)
print(rfg.score(x_test,y_test))
# print(rfg.feature_importances_)

# metrics_dict = {'AUC':metrics.roc_auc_score(y_test, rfg.predict_proba(x_test)[:, 1]),
#             'Accuracy':rfg.score(x_test, y_test), 'Recall':recall_score(y_test, rfg.predict(x_test)),
#             'Precision':precision_score(y_test, rfg.predict(x_test), zero_division=0), 'F1':f1_score(y_test, rfg.predict(x_test))}
# metrics_frame = pd.DataFrame.from_dict(data=metrics_dict,orient='index').transpose()

feature: importance score
Mass 0.07059452031623174
frac_aa_I 0.055763282227362446
frac_aa_Q 0.05304417728817006
frac_aa_W 0.0611846862849525
flexibility_std 0.06770617152794556
flexibility_min 0.05990477590432002
flexibility_median 0.05326342069196897
rsa_std 0.06550148826393754
fraction_total_exposed_A 0.05703796658251008
fraction_total_exposed_Y 0.05530507037256151
fraction_exposed_exposed_H 0.04865095546598984
fraction_exposed_exposed_L 0.07157741257618268
fraction_exposed_exposed_M 0.058381776855058076
Sample_num 0.07025596663688109
Raw_FileID 0.15182832900592796


0.43730337548323595


In [28]:
feat_importances=rfg.feature_importances_
# print(feat_importances)
print('feature: importance score')
for i,col in enumerate(df_rfe.columns):
    print(col,feat_importances[i])

feature: importance score
frac_aa_I 0.06596630314032861
frac_aa_Q 0.05914304079572
frac_aa_W 0.06973760909851004
molecular_weight 0.055950814276992854
flexibility_var 0.060526394166567826
flexibility_min 0.06685478674919744
fraction_exposed_polar_exposed 0.06073350567070096
rsa_std 0.06858982375397228
fraction_total_exposed_M 0.07155243384020243
fraction_total_exposed_Y 0.062404576064288855
fraction_exposed_exposed_A 0.04876247693133964
fraction_exposed_exposed_H 0.05787822204456944
fraction_exposed_exposed_T 0.061567447024784984
Zeta Potential 0.09787628088636142
NP_incubation Concentration (mg/mL) 0.09245628555646303
