In [34]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

# Phospho-signal names. Each one is used as a column-name prefix
# ("<signal>_<time>") when the four-phosphoprotein subset of X is selected.
four_phosphoProteins = [
    'p-ERK1/2',
    'p-HSP27',
    'p-Akt',
    'p-p70 S6K',
]

In [2]:
# Load the predictor table (X) and the response table (Y); in both files the
# first CSV column is used as the row index.
# NOTE(review): the file names suggest both tables are already z-scored — confirm.
X, Y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('Cosgrove_X_zscore_appended.csv', 'Cosgrove_Y_zscore.csv')
)

In [25]:
# Fraction of variance in Y explained (R^2) by PLSR models with 1 to 19 components.
# NOTE: every model is fit and scored on the same full data set, so these are
# training-fit R^2 values, not cross-validated ones.
perc_var = []
for i in np.arange(1, 20):  # arange excludes the stop value, so i runs 1..19
    pls = PLSRegression(n_components=i)
    pls.fit(X, Y)
    y_fit = pls.predict(X)
    # r2_score's signature is (y_true, y_pred) and R^2 is not symmetric in its
    # arguments, so the observed Y must come first.
    perc_var.append(r2_score(Y, y_fit))



In [26]:
# Full signaling matrix: fit a 17-component PLSR model of Y on X.
ncomp = 17
pls = PLSRegression(n_components=ncomp).fit(X, Y)  # fit() returns the estimator

# Loadings, scores and coefficients are available as attributes of the fitted
# estimator; the X loadings are pulled out here as an example.
x_loadings = pls.x_loadings_

# Model predictions of Y for the same samples the model was fit on.
y_fit = pls.predict(X)

# Share of X variance attributed to each component: the variance of each
# X-score column divided by the summed per-column variance of X.
total_variance_in_x = np.var(X, axis=0)
variance_in_x = np.var(pls.x_scores_, axis=0)
fractions_of_explained_variance_x = variance_in_x / sum(total_variance_in_x)




In [38]:
# Reduced model: PLSR restricted to the four-phosphoprotein columns, 4 components.

# Columns of X are addressed as "<signal>_<time>"; build every signal/time
# pair for the four phosphoproteins (signals outer, time points inner).
timeCols = ['0.33hr', '4hr',  '24hr', '48hr', 'lateAvg', 'integral']
fourProteinCols = [
    "%s_%s" % (signal, time)
    for signal in four_phosphoProteins
    for time in timeCols
]
X_sub = X[fourProteinCols]

ncomp = 4
pls = PLSRegression(n_components=ncomp).fit(X_sub, Y)  # fit() returns the estimator

# Loadings, scores and coefficients are available as attributes of the fitted
# estimator; the X loadings are pulled out here as an example.
x_loadings = pls.x_loadings_

# Model predictions of Y for the same samples the model was fit on.
y_fit = pls.predict(X_sub)

# Share of X_sub variance attributed to each component: the variance of each
# X-score column divided by the summed per-column variance of X_sub.
total_variance_in_x = np.var(X_sub, axis=0)
variance_in_x = np.var(pls.x_scores_, axis=0)
fractions_of_explained_variance_x = variance_in_x / sum(total_variance_in_x)

In [45]:
# Leave-one-out cross-validation of the 4-component, four-phosphoprotein model:
# each sample is held out in turn and predicted by a model fit to the others.
ncomp = 4
pls = PLSRegression(n_components=ncomp)

# Use plain arrays so the integer index arrays produced by the splitter can be
# applied directly.
X_data = X_sub.to_numpy()
Y_data = Y.to_numpy()

loo = model_selection.LeaveOneOut()
y_fit = []   # held-out predictions, one single-row array per left-out sample
y_true = []  # matching observed rows of Y, in the same order
# Split over the same array that is indexed below (the previous stray
# get_n_splits(X) call discarded its result and referenced the full matrix).
for train_index, test_index in loo.split(X_data):
    pls.fit(X_data[train_index], Y_data[train_index])
    y_true.append(Y_data[test_index])
    y_fit.append(pls.predict(X_data[test_index]))