In [None]:
import pandas as pd
import numpy as np
from sklearn.cross_decomposition import PLSRegression
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn import model_selection

# Names of the four phosphoproteins; they are combined with time-point suffixes
# elsewhere in the notebook to pick columns out of X.
four_phosphoProteins = [
    'p-ERK1/2',
    'p-HSP27',
    'p-Akt',
    'p-p70 S6K',
]

In [None]:
# Read the three input tables in one pass. Every file is loaded the same way:
# the first CSV column becomes the row index.
#   X           -> predictor matrix (z-scored, going by the file name)
#   Y           -> response matrix (z-scored, going by the file name)
#   toxic_drugs -> drug toxicity table
X, Y, toxic_drugs = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in (
        'Cosgrove_X_zscore_appended.csv',
        'Cosgrove_Y_zscore.csv',
        'Cosgrove_drug_toxicity.csv',
    )
)

In [None]:
# Visual sanity check of Y: one heatmap cell per (row of Y, column of Y),
# with the row index on the y-axis and the column names on the x-axis.
fig, ax = plt.subplots(figsize=(4, 12))
im = ax.imshow(Y.values, aspect='auto', cmap='viridis')
fig.colorbar(im, ax=ax, label='Z-score')

# One tick per row / column, labelled from the DataFrame itself
ax.set_yticks(np.arange(Y.shape[0]))
ax.set_yticklabels(Y.index)
ax.set_xticks(np.arange(Y.shape[1]))
ax.set_xticklabels(Y.columns, rotation=90)

ax.set_title('Heatmap of Y Data (Z-scores)')
ax.set_xlabel('Signaling Proteins')

In [None]:
def plsr_variance_explained(pls_model, X, Y):
    """
    Given a fitted PLSRegression model, calculate the variance explained (R^2)
    in the response variables by each latent component on its own.

    Component i's contribution is rebuilt as the outer product of
    x_scores_[:, i] and y_loadings_[:, i], converted back to the units of Y
    (re-applying the per-column standard deviation when the model was fitted
    with scale=True, then adding each column's own mean) and scored against Y.

    Parameters:
    pls_model (PLSRegression): A fitted PLSRegression model.
    X (pd.DataFrame): The predictor variables used in the model (not used in
        the calculation; kept so existing calls stay valid).
    Y (pd.DataFrame): The response variables the model was fitted on.
    Returns:
    perc_var_explained: list of floats, the R^2 value (rounded to 3 decimals)
        for each component individually, not cumulatively.
    """
    Y_obs = np.asarray(Y, dtype=float)
    if Y_obs.ndim == 1:
        Y_obs = Y_obs.reshape(-1, 1)

    # Every response column is shifted by its own mean (broadcast over rows),
    # not by the mean of the first column.
    y_mean = Y_obs.mean(axis=0)

    if getattr(pls_model, "scale", False):
        # With scale=True the model standardises Y using the sample standard
        # deviation (ddof=1), so the loadings are in standardised units and
        # must be scaled back before comparing with Y.
        y_std = Y_obs.std(axis=0, ddof=1)
        y_std[y_std == 0.0] = 1.0  # constant columns are left unscaled
    else:
        y_std = np.ones(Y_obs.shape[1])

    perc_var_explained = []
    for i in range(pls_model.n_components):
        scores_i = pls_model.x_scores_[:, i].reshape(-1, 1)      # (n_samples, 1)
        loadings_i = pls_model.y_loadings_[:, i].reshape(1, -1)  # (1, n_targets)
        Y_comp = np.dot(scores_i, loadings_i) * y_std + y_mean
        perc_var_explained.append(round(r2_score(Y_obs, Y_comp), 3))
    return perc_var_explained


# PLSR with full feature model
# The helper above is defined before it is called so that this cell works on a
# fresh kernel (Restart & Run All).

# Use as many components as the data can support: one per feature, capped at
# the number of samples (identical to the number of features whenever there
# are at least as many samples as features).
ncomp = min(X.shape)
pls_model = PLSRegression(n_components=ncomp)
pls_model.fit(X, Y) # train the model

Y_full_pred = pls_model.predict(X) # use the model to make a prediction

perc_var_explained = plsr_variance_explained(pls_model, X, Y)

In [None]:
# PLSR restricted to the four phosphoproteins, with 4 components

# Build the "<protein>_<timepoint>" column names for every protein/time-point
# pair (protein-major order) and select those columns from X.
timeCols = ['0.33hr', '4hr',  '24hr', '48hr', 'lateAvg', 'integral']
fourProteinCols = [
    "{}_{}".format(signal, time)
    for signal in four_phosphoProteins
    for time in timeCols
]
X_sub = X[fourProteinCols]

# Fit the reduced model on the selected columns
ncomp = 4
pls = PLSRegression(n_components=ncomp)
pls.fit(X_sub, Y)

# Loadings, scores and coefficients are exposed as attributes of the fitted
# model; x_loadings_ is one example.
x_loadings = pls.x_loadings_

# Fitted responses for the same data the model was trained on
y_fit = pls.predict(X_sub)

In [None]:
# Leave-one-out cross-validation: refit the 4-component model with each sample
# held out in turn, and collect the held-out truth and its prediction.
ncomp = 4
X_data, Y_data = X_sub.to_numpy(), Y.to_numpy()

pls = PLSRegression(n_components=ncomp)
loo = model_selection.LeaveOneOut()
loo.get_n_splits(X)  # returns the number of folds; the value is not used here

# One entry per fold, in fold order
y_true, y_fit = [], []
for train_index, test_index in loo.split(X_data):
    X_train, Y_train = X_data[train_index], Y_data[train_index]
    pls.fit(X_train, Y_train)
    y_true.append(Y_data[test_index])
    y_fit.append(pls.predict(X_data[test_index]))