<a href="https://colab.research.google.com/github/hwankang/chemometrics-tutorials/blob/master/chemometrics_2022_08_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multivariate Analysis

In [None]:
# Environment setup (shell commands run from the notebook): remove any
# preinstalled tensorflow/keras packages, then fetch the tutorial repository.
# NOTE(review): presumably the uninstall avoids a version clash with the
# tutorial's requirements.txt — confirm.
# NOTE(review): re-running this cell after a successful clone makes `git clone`
# report that the destination directory already exists.
!pip uninstall -y tensorflow keras
!git clone https://github.com/hwankang/chemometrics-tutorials

In [None]:
# Change into the cloned repository so that the relative ./data paths used by
# the cells below resolve, then install the dependencies it lists.
# NOTE(review): this cell is not idempotent — running it twice in the same
# session tries to cd into a nested directory of the same name.
%cd chemometrics-tutorials
!pip install -r requirements.txt

# Code import

In [None]:
# Import the required python packages including 
# the custom Chemometric Model objects
import numpy as np
from sklearn import preprocessing
import pandas as pds
import matplotlib.pyplot as plt

from pyChemometrics.ChemometricsPCA import ChemometricsPCA
from pyChemometrics.ChemometricsScaler import ChemometricsScaler

# Seed NumPy's global random number generator so that steps relying on it
# give the same values as in the accompanying text
np.random.seed(350)

In [None]:
# Load the input tables from the ./data directory:
#   X   - matrix of NMR spectra
#   Y   - table with the 2 outcome variables (read without a header row)
#   ppm - axis the spectra are plotted against
X = np.genfromtxt("./data/X_spectra.csv", delimiter=',', dtype=None)
Y = pds.read_csv("./data/worm_yvars.csv", delimiter=',', dtype=None, header=None)
ppm = np.loadtxt("./data/ppm.csv", delimiter=',')

# Dummy-encode each outcome column as integer category codes using the
# pandas Categorical type (first column -> Y1, second column -> Y2)
Y1, Y2 = (pds.Categorical(Y.iloc[:, column]).codes for column in (0, 1))

In [None]:
# Inspect the outcome table: column summary first, then the first and last rows.
# The file is read with header=None, i.e. every row is treated as data.
Y_DataFrame = pds.read_csv("./data/worm_yvars.csv", delimiter=',', dtype=None, header=None)
YY = Y_DataFrame
YY.info()
# A notebook cell only renders its last expression automatically, so head()
# has to be shown explicitly with display() (provided by the notebook kernel);
# as a bare expression in the middle of the cell its output was discarded.
display(YY.head())
YY.tail()

In [None]:
# Inspect the ppm axis as a DataFrame.
# header=None: np.loadtxt parses ./data/ppm.csv as purely numeric data when
# building `ppm`, so the file has no header row. With pandas' default header
# inference the first ppm value(s) would be consumed as column names.
ppm_DataFrame = pds.read_csv("./data/ppm.csv", delimiter=',', header=None)
PP = ppm_DataFrame
PP.info()
# Only the last expression of a cell is rendered automatically; show head()
# explicitly with display() (provided by the notebook kernel).
display(PP.head())
PP.tail()

In [None]:
# `ppm` is the NumPy array returned by np.loadtxt, and ndarray has no .info()
# method (that is a pandas DataFrame API), so calling it raised AttributeError
# and stopped "Run All". Summarise the array's shape and dtype directly instead.
print("ppm: shape={0}, dtype={1}".format(ppm.shape, ppm.dtype))

In [None]:
# Reload the spectra through pandas so the table can be summarised with .info(),
# and rebind X to the underlying NumPy array.
# header=None is required here: the same file is read elsewhere in this notebook
# with np.genfromtxt without skipping any header line, i.e. every row is a
# spectrum. With pandas' default header inference the first spectrum would be
# turned into column names, leaving X one row shorter than Y1/Y2 and
# misaligning the row indices used later for outlier removal and colouring.
X_DataFrame = pds.read_csv("./data/X_spectra.csv", header=None)
X = X_DataFrame.values
XX = X_DataFrame
XX.info()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Overlay all raw spectra (X.T gives one line per row of X) against the ppm axis.
fig, ax = plt.subplots()
ax.plot(ppm, X.T)
ax.set_title("X matrix of spectra")
# Raw string: "\d" is not a valid Python escape sequence, so the non-raw
# literal triggers a DeprecationWarning (SyntaxWarning from Python 3.12).
# The rendered label is unchanged.
ax.set_xlabel(r"$\delta$ppm")
ax.set_ylabel("Intensity")
# Flip the x axis so the spectra are drawn in the usual NMR orientation
ax.invert_xaxis()
plt.show()

# PCA model fitting and scaling
## Scaling options and preliminary model fitting


In [None]:
# Build one scaler per scaling scheme so that the effect of scaling on PCA
# can be compared. The scale_power argument selects the scheme:
#   1   -> Unit-Variance (UV) scaling
#   0   -> Mean Centering (MC)
#   0.5 -> Pareto scaling (Par)
scaling_object_uv, scaling_object_mc, scaling_object_par = (
    ChemometricsScaler(scale_power=power) for power in (1, 0, 0.5)
)

In [None]:
# Create and fit the PCA model - starting with UV
# A 2-component model using the Unit-Variance scaler, fitted on the spectra matrix X
PCA_model_uv = ChemometricsPCA(ncomps=2, scaler=scaling_object_uv)
PCA_model_uv.fit(X)

In [None]:
# Create and fit the PCA model - MC
# A 2-component model using the Mean Centering scaler, fitted on the spectra matrix X
PCA_model_mc = ChemometricsPCA(ncomps=2, scaler=scaling_object_mc)
PCA_model_mc.fit(X)

In [None]:
# Create and fit the PCA model - Par
# A 2-component model using the Pareto scaler, fitted on the spectra matrix X
PCA_model_par = ChemometricsPCA(ncomps=2, scaler=scaling_object_par)
PCA_model_par.fit(X)

# Effect of scaling on PCA score plots

In [None]:
# PCA score plot for the mean centered model
# comps=[0, 1] selects the two fitted components by index (0-based)
PCA_model_mc.plot_scores(comps=[0, 1], plot_title='Mean centering')

In [None]:
# Score plot for the Pareto scaled model
# comps=[0, 1] selects the two fitted components by index (0-based)
PCA_model_par.plot_scores(comps=[0, 1], plot_title='Pareto scaling')

In [None]:
# PCA score plot for UV scaled model
# comps=[0, 1] selects the two fitted components by index (0-based)
PCA_model_uv.plot_scores(comps=[0, 1], plot_title='UV scaling')

# Effect of scaling on PCA loadings

In [None]:
# Plot of first principal component loadings of mean centering model,
# drawn against the ppm axis
# NOTE(review): component=1 is described as the "first" component here, while
# plot_scores/outlier in this notebook take 0-based indices — confirm which
# convention plot_loadings uses.
ax = PCA_model_mc.plot_loadings(component=1, x=ppm)
# Invert the x axis of the returned axes, as done for the raw spectra plot
ax.invert_xaxis()

In [None]:
# Plot of first principal component loadings of Pareto scaled model,
# drawn against the ppm axis
# NOTE(review): component=1 is described as the "first" component here, while
# plot_scores/outlier in this notebook take 0-based indices — confirm which
# convention plot_loadings uses.
ax = PCA_model_par.plot_loadings(component=1, x=ppm)
# Invert the axis to match the raw nmr spectra
ax.invert_xaxis()

In [None]:
# Plot of first principal component loadings of Unit Variance scaled model,
# drawn against the ppm axis
# NOTE(review): component=1 is described as the "first" component here, while
# plot_scores/outlier in this notebook take 0-based indices — confirm which
# convention plot_loadings uses.
ax = PCA_model_uv.plot_loadings(component=1, x=ppm)
# Invert the x axis of the returned axes, as done for the raw spectra plot
ax.invert_xaxis()



# 2)   Model cross-validation and component selection




In [None]:
# Scree plot for the UV-scaled model on X, considering up to 10 components
# (total_comps=10)
PCA_model_uv.scree_plot(X, total_comps=10)

In [None]:
# Run the model's repeated cross-validation on X with 5 repeats; the returned
# value is kept in rep_cv
rep_cv = PCA_model_uv.repeated_cv(X, repeats=5)

In [None]:
# Create and fit the PCA model - UV scaling
# This rebinds PCA_model_uv to a new 4-component model (replacing the
# 2-component model fitted earlier), fits it on X and then runs its
# cross-validation on the same data
PCA_model_uv = ChemometricsPCA(ncomps=4, scaler=scaling_object_uv)
PCA_model_uv.fit(X)
PCA_model_uv.cross_validation(X)

In [None]:
# Loadings plot for component=1 of the 4-component UV model (no ppm axis passed)
# NOTE(review): sigma=2 presumably sets the width of an uncertainty band derived
# from the cross-validation run above — confirm against plot_loadings.
PCA_model_uv.plot_loadings(component=1, sigma=2)

# 3) Outlier detection and model interpretation

In [None]:
# Score plot for components with indices 0 and 1 of the 4-component UV model
PCA_model_uv.plot_scores(comps=[0, 1])

In [None]:
# Outliers flagged when all components of the 4-component model are considered
print("Outliers for the full 4 component model : {0}".format(PCA_model_uv.outlier(X)))
# comps=[1] restricts the detection to the 2nd principal component (index 1);
# this result is what remains in outlier_idx for the following cells
outlier_idx = PCA_model_uv.outlier(X, comps=[1])
print("Outliers for the 2nd principal component : {0}".format(outlier_idx))

In [None]:
# Compare the outlying spectra (red) with the mean spectrum computed from the
# raw data (blue)
fig, ax = plt.subplots()
ax.plot(ppm, X[outlier_idx, :].T, 'r')
ax.plot(ppm, X.mean(axis=0), 'b')
ax.invert_xaxis()
plt.show()

In [None]:
# Loadings for component=1 of the 4-component UV model against the ppm axis
# NOTE(review): confirm whether component=1 is 1-based (first component) or
# 0-based (second component, the one used for outlier detection above).
ax = PCA_model_uv.plot_loadings(component=1, x=ppm)
# Invert the x axis of the returned axes, as done for the raw spectra plot
ax.invert_xaxis()

In [None]:
# Control spectrum: back-project the centre of the model, i.e. a score of zero
# on each of the 4 components
model_center_sample = PCA_model_uv.inverse_transform([0] * 4)

# Samples flagged as outliers on PC2 (component index 1) and their score rows
outlier_idx = PCA_model_uv.outlier(X, comps=[1])
out_scores = PCA_model_uv.scores[outlier_idx, :]

# Back-project every outlier, and the average of their scores (the "mean"
# outlier), into reconstructed spectra
outliers = PCA_model_uv.inverse_transform(out_scores)
mean_outlier = PCA_model_uv.inverse_transform(out_scores.mean(axis=0))

fig, ax = plt.subplots()
# Blue: model centre, a representative "normal" sample
ax.plot(ppm, model_center_sample, 'b')
# Dashed red: the reconstructed outliers
ax.plot(ppm, outliers.T, 'r--')
# Green: the reconstructed mean outlier
ax.plot(ppm, mean_outlier, 'g')
ax.invert_xaxis()
plt.show()

In [None]:
# DmodX plot for X under the current 4-component UV model, with the outlying
# samples labelled (label_outliers=True)
PCA_model_uv.plot_dmodx(X, label_outliers=True)

In [None]:
# The outlier function can also be used to obtain the DmodX measure and outliers detected with it
# Note: this overwrites outlier_idx with the DmodX-based result.
# NOTE(review): alpha=0.05 is presumably the significance level of the DmodX
# threshold — confirm against the outlier() implementation.
outlier_idx = PCA_model_uv.outlier(X, measure='DmodX', alpha=0.05)
print(outlier_idx)

# Model interpretation

In [None]:
# Recompute the outliers on the 2nd principal component (index 1) so this cell
# does not depend on whichever outlier_idx an earlier cell left behind
outlier_idx = PCA_model_uv.outlier(X, comps=[1])

print("The following samples (row index) have been detected as outliers: {0}".format(outlier_idx))
# Drop the outlying observations (rows) from the spectra and from both
# encoded outcome vectors so the three stay aligned
X_rem, Y1_rem, Y2_rem = (
    np.delete(table, outlier_idx, axis=0) for table in (X, Y1, Y2)
)

In [None]:
# Create and fit the PCA model - UV scaling
# This rebinds PCA_model_uv to a new 7-component model fitted on the data with
# the outliers removed (X_rem), then draws its scree plot for up to 10 components
PCA_model_uv = ChemometricsPCA(ncomps=7, scaler=scaling_object_uv)
PCA_model_uv.fit(X_rem)
PCA_model_uv.scree_plot(X_rem, total_comps=10)

In [None]:
# Repeated cross-validation on the outlier-free data: 5 repeats, up to 10
# components; overwrites the rep_cv result from the earlier run on X
rep_cv = PCA_model_uv.repeated_cv(X_rem, repeats=5, total_comps=10)

In [None]:
# Cross-validate the 7-component model on the outlier-free data, then report
# the 'Q2' entry that the model stores in its cvParameters dictionary
PCA_model_uv.cross_validation(X_rem)
print("The estimated Q2X from the model is {0}".format(PCA_model_uv.cvParameters['Q2']))

In [None]:
# Score plot of the model refitted without outliers, using plot_scores' default arguments
PCA_model_uv.plot_scores()

# Exploring the trends in score plots

In [None]:
# Age seems to be one of the main driving forces of variation in the dataset, judging from component 1.
# Scores are coloured by the second outcome variable (Y2_rem, outliers removed),
# treated as discrete classes
PCA_model_uv.plot_scores(color=Y2_rem, discrete=True)

In [None]:
# The loadings for component number 1
# NOTE(review): confirm whether component=1 is 1-based here; plot_scores and
# outlier in this notebook take 0-based component indices.
PCA_model_uv.plot_loadings(component=1)

In [None]:
# Score plot on the components with indices 1 and 2, coloured by the first
# outcome variable (Y1_rem, outliers removed), treated as discrete classes
PCA_model_uv.plot_scores(color=Y1_rem, discrete=True, comps=[1, 2])

PCA is a very useful exploratory data analysis tool, especially valuable for visualising the main trends in complex multivariate datasets. It can also be very useful for outlier detection, for preliminary data quality assessment, and for detecting batch or run-order effects.

#