In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from helper_functions_pipe_testing import *
from sklearn.metrics import  f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import *


#Instructions for the pipeline
Requires four inputs:
    - Mass spec data with corresponding NP surface characteristics and experimental conditions (time, concentration)
    - Proteome data from uniprot (Mass, length, sequence, GO for later analysis)
    - NetsurfP data for the proteins that are to be searched
    - X characteristics to predict

pipeline
Take mass spec spreadsheet
Accession,Enrichment,Dh,TEM,Zp,BET,Composition,Ligand,Shape,IncubationTime,IncubationConcentration
Merge with Proteome data to get file that has
Accession,Enrichment,Dh,TEM,Zp,BET,Composition,Ligand,Shape,IncubationTime,IncubationConcentration,Mass,Length,Sequence
Calculate protein features using biopython
Merge with NSP data to get all protein features

Split into X and Y dataset with Entries as labels

In [16]:
### New Data workup for RFG

# Pull together Proteomic data
in_dir="Input_data/Proteomic data/abundance/"
#Mass Spec data input in one excel spreadsheet - Entry - Abundance labeled by NP Unique ID
#Abundance as a percent
files= os.listdir(in_dir)
for i,f in enumerate(files):
    if i==0:
        raw_MS_data=pd.read_excel(in_dir+f,header=0)
    else:
        temp = pd.read_excel(in_dir+f,header=0)
        raw_MS_data=raw_MS_data.merge(temp,how='outer',on='Entry')
# melt to make it an accession number, NPID, Abundance dataset
raw_MS_data = pd.melt(raw_MS_data, id_vars=['Entry'],var_name='NPUNID', value_name='Abundance')
#remove prots that were added due to merge
raw_MS_data=raw_MS_data.dropna()

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [42]:
###Bring in controls (MS data for serums)##
controls=pd.read_excel('Input_data/Proteomic data/controls.xlsx',header=0)
MS_data_controls = pd.merge(raw_MS_data,controls,how='inner', on='Entry')
###Bring in Uniprot_data,NSPdata and NP data##
uniprot_filepath='UniProt/Bovine_Mouse_proteome_biopyCalcs.xlsx'
uniprot_dat=pd.read_excel(uniprot_filepath,header=0)
NSPfilePath='NetsurfP_Proteomes/Bovine_Mouse_proteome_complete.xlsx'
NSP_data=pd.read_excel(NSPfilePath)
###Bring in NP data and merge to get complete NP dataset###
NP_filepath='Input_data/NPs/NP_Database.xlsx'
NPUNdata=pd.read_excel(NP_filepath,header=0,sheet_name='NPUNID')
NPprop=pd.read_excel(NP_filepath,header=0,sheet_name='NP_Props')
NPdata=pd.merge(NPUNdata,NPprop,how="left",on='NPID')

In [43]:
#calculate Enrichment
#####MAYBE add binning here to to keep negative results and improve capapbilities######
MS_data_controls['Enrichment']= np.log2(MS_data_controls['Abundance']/MS_data_controls['Abundance_Controls'])
MS_data=MS_data_controls.drop(columns=['Abundance','Abundance_Controls'])
raw_prop_data=pd.merge(MS_data, uniprot_dat, how='left',on='Entry')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [44]:
MS_data_clean = raw_MS_data.copy()
PROT_cleaned_data = normalize_mass_length_1DF(raw_prop_data) #function found in data_prep_functions line 167, normalizes mass, length and mw by dividing all values by the max in the column
Protein_data_complete = pd.merge(PROT_cleaned_data, NSP_data, left_on='Entry', right_on='Entry') #merges netsurfp features and biopython features
#creates new column called asa_sum_normalized which is the asa_sum value divide by the mass of the protein
for df in [Protein_data_complete]:
    for col in ['asa_sum']:
        df[col+'_normalized'] = df[col] / df['Mass']

data_complete= pd.merge(Protein_data_complete,NPdata,how='left', on='NPUNID')
data_complete.drop(columns=['notes','Notes','BET','NPUNID'],inplace=True)
#Optional here to drop all enrichment values that are NA - also have to deal with positive/negative infinity values
# data_complete.dropna(subset=['Enrichment'],inplace=True)

In [56]:
data_complete.dropna(inplace=True)
data_complete= data_complete.replace([-np.inf],'-12')
data_complete=data_complete.replace([np.inf],'12')

In [65]:
labels=data_complete['Enrichment'].copy()
labels=np.ravel(labels)
df=data_complete.drop(['Entry','Sequence','Core Material','Ligand','NPID','Enrichment'],axis=1)

print(labels)
# print(labels)
# print(df)
first_frame = True #starting dataframe for saving metrics
correctness_frame = pd.DataFrame()
metrics_frame = pd.DataFrame()
print_metrics = 1 #0, doesn't show metrics while runnning for each model, 1 does show metrics
trials = 100


[-3.058068989758959 -2.6263436622455423 -2.6607046134587575 ...
 1.7890225357232328 '-12' '-12']


In [None]:
labels

In [55]:
data_complete

Unnamed: 0,Entry,Enrichment,Sequence,Length,Mass,frac_aa_A,frac_aa_C,frac_aa_D,frac_aa_E,frac_aa_F,...,Core Material,Ligand,Dtem,Dh,Shaken,Centrifuged,NP_incubation Concentration (mg/mL),Incubation Concentration (mg/ml),Incubation Time (minutes),Temperature
0,P02769,-3.058069,MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGL...,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,...,Iron Oxide,Carboxylate BSA,100.0,230.0,1.0,0.0,3.2,4.0,30.0,25.0
1,P02769,-2.626344,MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGL...,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,...,Iron Oxide,Carboxylate BSA,100.0,230.0,1.0,0.0,3.2,4.0,30.0,25.0
2,P02769,-2.660705,MKWVTFISLLLLFSSAYSRGVFRRDTHKSEIAHRFKDLGEEHFKGL...,607.0,69293.0,0.079077,0.057661,0.065898,0.097199,0.049423,...,Iron Oxide,Carboxylate BSA,100.0,230.0,1.0,0.0,3.2,4.0,30.0,25.0
6,P41361,4.586080,MISNGIGTVTAGKRSICLLPLLLIGLWGCVTCHRSPVEDVCTAKPR...,465.0,52347.0,0.064516,0.019355,0.045161,0.075269,0.053763,...,Iron Oxide,Carboxylate BSA,100.0,230.0,1.0,0.0,3.2,4.0,30.0,25.0
7,P41361,4.631187,MISNGIGTVTAGKRSICLLPLLLIGLWGCVTCHRSPVEDVCTAKPR...,465.0,52347.0,0.064516,0.019355,0.045161,0.075269,0.053763,...,Iron Oxide,Carboxylate BSA,100.0,230.0,1.0,0.0,3.2,4.0,30.0,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3149,Q00896,-inf,MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIAT...,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,...,R101,none,800.0,680.0,1.0,1.0,125.0,0.2,60.0,25.0
3150,Q00896,-inf,MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIAT...,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,...,P25,none,378.0,410.0,1.0,1.0,62.5,0.2,60.0,25.0
3151,Q00896,1.789023,MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIAT...,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,...,P25,none,378.0,410.0,1.0,1.0,62.5,0.2,60.0,25.0
3152,Q00896,-inf,MTPSISWGLLLLAGLCCLVPSFLAEDVQETDTSQKDQSPASHEIAT...,412.0,45823.0,0.070388,0.007282,0.050971,0.072816,0.060680,...,P25,none,378.0,410.0,1.0,1.0,62.5,0.2,60.0,25.0


In [66]:
from sklearn.feature_selection import RFE

estimator = RandomForestRegressor(n_estimators=100)
selector = RFE(estimator, n_features_to_select= 15, step=1)
selector = selector.fit(df,labels)
selector.support_

array([False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False,  True, False, False, False, False, False,  True,
       False, False,  True,  True, False, False, False, False, False,
        True, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False, False, False])

In [70]:
selector.ranking_

array([70, 84, 25, 35, 57, 66, 26,  3, 44,  1, 50, 19, 14, 12, 78,  1, 16,
       72, 37, 39,  6, 43, 73, 20,  8,  9, 15,  1,  4,  1, 41, 22, 52, 56,
       32,  1, 29, 85,  1,  1, 80, 61, 30, 68, 83,  1, 82, 86,  1, 33, 13,
       62, 49, 76, 69, 48,  5, 71, 27, 60,  1, 54, 59, 65, 45,  2, 38, 74,
       81,  1,  1, 79, 58, 53, 42, 67, 31, 47, 23, 10,  7, 28, 24, 34, 55,
       40,  1, 36, 63, 18, 46, 21, 64, 51, 75,  1,  1, 11, 89, 77, 17, 87,
       88, 90])

In [67]:
feat_list=selector.get_feature_names_out()
print(feat_list)

['frac_aa_I' 'frac_aa_Q' 'flexibility_var' 'flexibility_min'
 'secondary_structure_fraction_disordered' 'mass' 'Unnamed: 0'
 'fraction_exposed_polar_exposed' 'rsa_std' 'fraction_total_exposed_M'
 'fraction_total_exposed_Y' 'fraction_exposed_exposed_A'
 'fraction_exposed_exposed_T' 'Zeta Potential' 'Dtem']


In [68]:
df_rfe=df[feat_list].copy()
df_rfe

Unnamed: 0.1,frac_aa_I,frac_aa_Q,flexibility_var,flexibility_min,secondary_structure_fraction_disordered,mass,Unnamed: 0,fraction_exposed_polar_exposed,rsa_std,fraction_total_exposed_M,fraction_total_exposed_Y,fraction_exposed_exposed_A,fraction_exposed_exposed_T,Zeta Potential,Dtem
0,0.024712,0.032949,0.000695,0.936357,0.275124,0.136020,4766,0.742604,0.223,0.003295,0.006590,0.082840,0.073964,-38.0,100.0
1,0.024712,0.032949,0.000695,0.936357,0.275124,0.136020,4766,0.742604,0.223,0.003295,0.006590,0.082840,0.073964,-38.0,100.0
2,0.024712,0.032949,0.000695,0.936357,0.275124,0.136020,4766,0.742604,0.223,0.003295,0.006590,0.082840,0.073964,-38.0,100.0
6,0.062366,0.034409,0.000804,0.931119,0.193548,0.102756,279,0.691176,0.252,0.004301,0.008602,0.040441,0.062500,-38.0,100.0
7,0.062366,0.034409,0.000804,0.931119,0.193548,0.102756,279,0.691176,0.252,0.004301,0.008602,0.040441,0.062500,-38.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3149,0.048544,0.043689,0.000799,0.936714,0.162621,0.089949,19815,0.741071,0.258,0.007282,0.002427,0.044643,0.084821,-52.0,800.0
3150,0.048544,0.043689,0.000799,0.936714,0.162621,0.089949,19815,0.741071,0.258,0.007282,0.002427,0.044643,0.084821,-21.0,378.0
3151,0.048544,0.043689,0.000799,0.936714,0.162621,0.089949,19815,0.741071,0.258,0.007282,0.002427,0.044643,0.084821,-21.0,378.0
3152,0.048544,0.043689,0.000799,0.936714,0.162621,0.089949,19815,0.741071,0.258,0.007282,0.002427,0.044643,0.084821,-21.0,378.0


In [69]:

x_train, x_test, y_train, y_test = train_test_split(df,labels, test_size = 0.33, random_state=42)
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=2016)
# for train_index, test_index in sss.split(df, labels):
# x_train = df.iloc[train_index]
# X_test = df.iloc[test_index]
# y_train = labels.iloc[train_index]
# y_test = labels.iloc[test_index]

rfg=RandomForestRegressor(n_estimators=100)
rfg.fit(x_train,y_train)
print(rfg.score(x_test,y_test))
print(rfg.feature_importances_)
# metrics_dict = {'AUC':metrics.roc_auc_score(y_test, rfg.predict_proba(x_test)[:, 1]),
#             'Accuracy':rfg.score(x_test, y_test), 'Recall':recall_score(y_test, rfg.predict(x_test)),
#             'Precision':precision_score(y_test, rfg.predict(x_test), zero_division=0), 'F1':f1_score(y_test, rfg.predict(x_test))}
# metrics_frame = pd.DataFrame.from_dict(data=metrics_dict,orient='index').transpose()

0.4842597983369027
[0.00278302 0.01030632 0.01313748 0.00875351 0.00683374 0.00498888
 0.00796487 0.01251578 0.01133626 0.01843671 0.01336709 0.01117691
 0.00953963 0.00808017 0.00633013 0.02551137 0.00680407 0.0072462
 0.0124571  0.00805565 0.01321333 0.0067056  0.00746923 0.01131588
 0.00887882 0.01156137 0.01620082 0.01228784 0.01350195 0.02472102
 0.00711607 0.00778677 0.00615023 0.00591355 0.00860451 0.00880385
 0.00864762 0.00271799 0.00863806 0.03504366 0.0027549  0.00329766
 0.00451402 0.00649205 0.0038334  0.00535787 0.00313132 0.00301483
 0.02121946 0.00734019 0.01365409 0.00623983 0.00609741 0.00650613
 0.00411996 0.00666445 0.00879869 0.0048202  0.00757055 0.00852602
 0.02106027 0.00683431 0.00653906 0.00685978 0.00693153 0.0105419
 0.01069131 0.00524909 0.00643607 0.01594246 0.0158922  0.00476728
 0.00617462 0.01078512 0.00416028 0.00651023 0.01069121 0.00579392
 0.00614301 0.00792539 0.01731439 0.00465425 0.00989679 0.00894439
 0.00590931 0.00736137 0.01059044 0.00662479 