In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from helper_functions_pipe_testing import *
from sklearn.metrics import  f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import *


#Instructions for the pipeline
Requires four inputs:
    - Mass spec data with corresponding NP surface characteristics and experimental conditions (time, concentration)
    - Proteome data from uniprot (Mass, length, sequence, GO for later analysis)
    - NetsurfP data for the proteins that are to be searched
    - X characteristics to predict

pipeline
Take mass spec spreadsheet
Accession,Enrichment,Dh,TEM,Zp,BET,Composition,Ligand,Shape,IncubationTime,IncubationConcentration
Merge with Proteome data to get file that has
Accession,Enrichment,Dh,TEM,Zp,BET,Composition,Ligand,Shape,IncubationTime,IncubationConcentration,Mass,Length,Sequence
Calculate protein features using biopython
Merge with NSP data to get all protein features

Split into X and Y dataset with Entries as labels

In [3]:
in_dir="Input_data/Proteomic data/"
out_dir="Output_data/"
NSP_dir="NetsurfP_Proteomes/"
uniprot_dir="UniProt/"

datfile='test1a.xlsx'

#netsurfp data file
NSPfilePath=NSP_dir+'Bovine_Proteome.xlsx'
NSP_data=pd.read_excel(NSPfilePath)
#Uniprot_data file
uniprot_filepath=uniprot_dir+'Bovine_Proteome_082322.xlsx'
uniprot_dat=pd.read_excel(uniprot_filepath,header=0)

raw_MS_data=pd.read_excel(in_dir+datfile,header=0)
print(raw_MS_data.shape)
raw_prop_data=pd.merge(raw_MS_data, uniprot_dat, left_on='Entry', right_on='Entry')
print(raw_prop_data.shape)
# print(raw_prop_data.columns)
# raw_prop_data=raw_prop_data[['Entry','Enrichment','Length','Mass','Sequence']]
# print(raw_prop_data)
# MS_data_clean = clean_up_data_mass_spec(raw_MS_data)
Accesions_IDs = raw_prop_data["Entry"].to_frame()
# print(Accesions_IDs)

# print(raw_prop_data['Accession'])
#replace Xs and Cs
# raw_prop_data=raw_prop_data[['Entry','Length','Mass','Sequence']]
PROT_cleaned_data = clean_up_data_biopy(raw_prop_data, Accesions_IDs) #calculates biopython features from protein sequences, and removes proteins removed during mass spec clean up
print(PROT_cleaned_data.shape)
#
PROT_cleaned_data = normalize_mass_length_1DF(PROT_cleaned_data) #function found in data_prep_functions line 167, normalizes mass, length and mw by dividing all values by the max in the column
#
Protein_data_complete = pd.merge(PROT_cleaned_data, NSP_data, left_on='Entry', right_on='Entry') #merges netsurfp features and biopython features
#
#
# #creates new column called asa_sum_normalized which is the asa_sum value divide by the mass of the protein
for df in [Protein_data_complete]:
    for col in ['asa_sum']:
        df[col+'_normalized'] = df[col] / df['Mass']
print(PROT_cleaned_data.shape)
print(Protein_data_complete.columns)
Accesions_IDs = Protein_data_complete["Entry"].to_frame()
MS_clean=Accesions_IDs.merge(raw_MS_data,left_on='Entry',right_on='Entry')
print(MS_clean)
#eventually merge NP and Experimental data with protein data to make X_data






(72, 2)
(72, 7)
(71, 39)
(71, 41)
Index(['Entry', 'Sequence', 'Length', 'Mass', 'frac_aa_A', 'frac_aa_C',
       'frac_aa_D', 'frac_aa_E', 'frac_aa_F', 'frac_aa_G', 'frac_aa_H',
       'frac_aa_I', 'frac_aa_K', 'frac_aa_L', 'frac_aa_M', 'frac_aa_N',
       'frac_aa_P', 'frac_aa_Q', 'frac_aa_R', 'frac_aa_S', 'frac_aa_T',
       'frac_aa_V', 'frac_aa_W', 'frac_aa_Y', 'molecular_weight',
       'aromaticity', 'instability_index', 'flexibility_mean',
       'flexibility_std', 'flexibility_var', 'flexibility_max',
       'flexibility_min', 'flexibility_median', 'isoelectric_point',
       'secondary_structure_fraction_helix',
       'secondary_structure_fraction_turn',
       'secondary_structure_fraction_sheet',
       'secondary_structure_fraction_disordered', 'gravy', 'length', 'mass',
       'Unnamed: 0', 'fraction_exposed', 'fraction_buried',
       'fraction_exposed_nonpolar_total', 'fraction_exposed_nonpolar_exposed',
       'fraction_exposed_polar_total', 'fraction_exposed_polar_exp

In [4]:
print(Protein_data_complete.shape)

(61, 97)


In [None]:
labels=MS_clean.drop(['Entry'], axis=1)
labels=np.ravel(labels)
df=Protein_data_complete.drop(['Entry','Sequence'],axis=1)

print(labels)
# print(labels)
# print(df)
first_frame = True #starting dataframe for saving metrics
correctness_frame = pd.DataFrame()
metrics_frame = pd.DataFrame()
print_metrics = 1 #0, doesn't show metrics while runnning for each model, 1 does show metrics
trials = 100


x_train, x_test, y_train, y_test = train_test_split(df,labels, test_size = 0.33, random_state=42)
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=2016)
# for train_index, test_index in sss.split(df, labels):
# x_train = df.iloc[train_index]
# X_test = df.iloc[test_index]
# y_train = labels.iloc[train_index]
# y_test = labels.iloc[test_index]

rfg=RandomForestRegressor(n_estimators=100)
rfg.fit(x_train,y_train)
print(rfg.score(x_test,y_test))
print(rfg.feature_importances_)
# metrics_dict = {'AUC':metrics.roc_auc_score(y_test, rfg.predict_proba(x_test)[:, 1]),
#             'Accuracy':rfg.score(x_test, y_test), 'Recall':recall_score(y_test, rfg.predict(x_test)),
#             'Precision':precision_score(y_test, rfg.predict(x_test), zero_division=0), 'F1':f1_score(y_test, rfg.predict(x_test))}
# metrics_frame = pd.DataFrame.from_dict(data=metrics_dict,orient='index').transpose()


In [30]:
in_dir="Input_data/Proteomic data/"
out_dir="Output_data/"
NSP_dir="NetsurfP_Proteomes/"
uniprot_dir="UniProt/"

#Mass Spec data input in one excel spreadsheet
datfile='test_withNPID.xlsx'
uniprot_filepath=uniprot_dir+'Bovine_Proteome_082322.xlsx'
uniprot_dat=pd.read_excel(uniprot_filepath,header=0)
# Protein data input in one excel spreadsheet with two sheets

datfile='test_withNPID.xlsx'
# prot_prop = 'Protein Properties'
# mass_spec = 'Mass Sped Details'



#netsurfp data file
NSPfilePath=NSP_dir+'Bovine_Proteome.xlsx'

#NP data file
NPdata=pd.read_excel(in_dir+"NP_Database.xlsx",header=0)

raw_MS_data=pd.read_excel(in_dir+datfile,header=0)
raw_prop_data=pd.merge(raw_MS_data, uniprot_dat, left_on='Entry', right_on='Entry')
print(raw_prop_data)
NSP_data=pd.read_excel(NSPfilePath)
#
# #clean up and calculate % protein abundance and enrichment, function found in data_prep_functions line 470
#
MS_data_clean = raw_MS_data.copy()
Accesions_IDs = MS_data_clean["Entry"].to_frame()


# clean up protein data, function found in data_prep_functions.py line 367

PROT_cleaned_data = clean_up_data_biopy(raw_prop_data, Accesions_IDs) #calculates biopython features from protein sequences, and removes proteins removed during mass spec clean up

PROT_cleaned_data = normalize_mass_length_1DF(PROT_cleaned_data) #function found in data_prep_functions line 167, normalizes mass, length and mw by dividing all values by the max in the column

Protein_data_complete = pd.merge(PROT_cleaned_data, NSP_data, left_on='Entry', right_on='Entry') #merges netsurfp features and biopython features


#creates new column called asa_sum_normalized which is the asa_sum value divide by the mass of the protein
for df in [Protein_data_complete]:
    for col in ['asa_sum']:
        df[col+'_normalized'] = df[col] / df['Mass']
# print(Protein_data_complete.columns)
# print(NPdata)

data_complete= pd.merge(Protein_data_complete,NPdata,how='left', on='NPID')
print(data_complete.columns)
data_complete.drop(labels=['notes','NPID'],inplace=True,axis=1)

print(data_complete.columns)

(72, 2)
(72, 7)
(71, 39)
(71, 41)
(61, 97)
     Entry  Enrichment
0   P02769   -2.768846
1   P00735    6.352583
2   Q9N2I2    6.606206
3   Q28085    6.430654
4   P01044    4.083944
..     ...         ...
56  G3MYZ3   -1.207773
57  O02659    3.694238
58  Q29RQ1    0.610974
59  P56651    1.687743
60  Q32PJ2    1.511005

[61 rows x 2 columns]


In [19]:
print(raw_MS_data)

     Entry  Enrichment
0   P02769   -2.768846
1   P41361    4.335954
2   P00735    6.352583
3   Q9N2I2    6.606206
4   P12763   -1.479128
..     ...         ...
67  O02659    3.694238
68  Q29RQ1    0.610974
69  P56651    1.687743
70  P07224    0.241398
71  Q32PJ2    1.511005

[72 rows x 2 columns]


In [36]:
labels=MS_clean.drop(['Entry'], axis=1)
labels=np.ravel(labels)
df=Protein_data_complete.drop(['Entry','Sequence'],axis=1)

print(labels)
# print(labels)
# print(df)
first_frame = True #starting dataframe for saving metrics
correctness_frame = pd.DataFrame()
metrics_frame = pd.DataFrame()
print_metrics = 1 #0, doesn't show metrics while runnning for each model, 1 does show metrics
trials = 100


x_train, x_test, y_train, y_test = train_test_split(df,labels, test_size = 0.33, random_state=42)
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=2016)
# for train_index, test_index in sss.split(df, labels):
# x_train = df.iloc[train_index]
# X_test = df.iloc[test_index]
# y_train = labels.iloc[train_index]
# y_test = labels.iloc[test_index]

rfg=RandomForestRegressor(n_estimators=100)
rfg.fit(x_train,y_train)

metrics_dict = {'AUC':metrics.roc_auc_score(y_test, rfg.predict_proba(x_test)[:, 1]),
            'Accuracy':rfg.score(x_test, y_test), 'Recall':recall_score(y_test, rfg.predict(x_test)),
            'Precision':precision_score(y_test, rfg.predict(x_test), zero_division=0), 'F1':f1_score(y_test, rfg.predict(x_test))}
metrics_frame = pd.DataFrame.from_dict(data=metrics_dict,orient='index').transpose()


[-2.76884625  6.35258301  6.60620628  6.43065406  4.08394386  4.40732889
 -1.30000425 11.46009002  5.60916061  7.46149993  7.56458367  8.53955573
  4.32444503  4.46284158  3.34860284  5.56321463  9.48529466 -0.76584813
 11.93067379  9.18919046  3.87285529 10.93767607  2.81087734  6.6379646
 -0.75827398 10.69164752 -1.07751603 -4.66042685  7.28418398 -0.49704936
 -1.46081005 -2.58848542  0.05016747 -1.18140954  3.07292054  4.44220764
  0.09845458 -0.8624439  -0.60080095  1.47094187 -0.76131074  4.78269676
 -2.54348802  3.83456732 -1.50702277  3.70222029  0.2279234   4.03626105
  2.32670862 -0.8420308   0.05758126  0.08919988  4.83971541  4.58869108
  2.77213737 -2.43876775 -1.20777339  3.69423846  0.61097355  1.68774307
  1.51100458]


In [41]:
print(rfg.score(x_test,y_test))
print(rfg.feature_importances_)

0.15560164896493955
[0.00513707 0.00350741 0.00797886 0.01333913 0.00534708 0.02983831
 0.02573964 0.01215556 0.00451627 0.00397026 0.00190447 0.0050219
 0.00775726 0.00167601 0.00209276 0.01449517 0.0276024  0.00197442
 0.00876043 0.00210115 0.0152543  0.00307915 0.00235378 0.00199087
 0.00801014 0.00564767 0.00364288 0.00581822 0.01020752 0.00519147
 0.00938884 0.02548748 0.02545685 0.00129803 0.01520308 0.00332034
 0.02192912 0.00248955 0.00355795 0.00732782 0.00295544 0.00286757
 0.00068237 0.00151711 0.00221792 0.00332085 0.00108479 0.00093347
 0.00818404 0.00294311 0.00283465 0.0031595  0.00093302 0.01143347
 0.00340068 0.00720392 0.01865518 0.0781181  0.00323711 0.02558677
 0.04585004 0.00293531 0.00275345 0.00374175 0.02810957 0.00307741
 0.00872537 0.00595159 0.00411879 0.04666915 0.00196165 0.00170326
 0.01437289 0.00689167 0.00139472 0.00178564 0.00257786 0.0531542
 0.02220144 0.06271009 0.02218767 0.0051249  0.00291569 0.00548969
 0.01041776 0.00450503 0.01455326 0.00284474