<a href="https://colab.research.google.com/github/hwankang/chemometrics-tutorials/blob/master/chemometrics_08_27.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall -y tensorflow keras
!git clone https://github.com/hwankang/chemometrics-tutorials

In [None]:
%cd chemometrics-tutorials
!pip install -r requirements.txt

In [None]:
# Import the required python packages including 
# the custom Chemometric Model objects
import numpy as np
from sklearn import preprocessing
import pandas as pds
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import DataConversionWarning

from pyChemometrics.ChemometricsPLSDA import ChemometricsPLSDA
from pyChemometrics.ChemometricsScaler import ChemometricsScaler
from pyChemometrics.ChemometricsOrthogonalPLSDA import ChemometricsOrthogonalPLSDA

# Seed NumPy's legacy global random number generator with a fixed value so
# that repeated runs give the same values as in the text.
# NOTE(review): this only has an effect on code that draws from np.random's
# global state — confirm that the pyChemometrics CV routines do.
np.random.seed(350)

In [None]:
# Suppress scikit-learn's DataConversionWarning entirely (the "ignore" action
# hides every occurrence, not only repeats) to avoid repetition during CV
warnings.filterwarnings("ignore", category=DataConversionWarning)

In [None]:
# Set the plot backend to support interactive plotting
%matplotlib notebook

In [None]:
# Load the dataset: the spectra matrix, the ppm axis it is plotted against,
# and the table of Y variables (read without a header row)
X = np.genfromtxt("./data/X_spectra.csv", delimiter=',', dtype=None)
ppm = np.loadtxt("./data/ppm.csv", delimiter=',')
Y = pds.read_csv("./data/worm_yvars.csv", delimiter=',', dtype=None, header=None)

# Use the pandas Categorical type to turn the first two columns of Y into
# integer category codes (the dummy encoding of each Y vector)
Y1, Y2 = (pds.Categorical(Y.iloc[:, column]).codes for column in (0, 1))

In [None]:
# Plot the spectra in the dataset
%matplotlib inline
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure()
plt.plot(ppm, X.T)
plt.title("X matrix of spectra")
plt.xlabel("$\delta$ppm")
plt.gca().invert_xaxis()
plt.ylabel("Intensity")
plt.show()

In [None]:
# Scaling options: each variant is a ChemometricsScaler configured through
# its `scale_power` argument.

# scale_power = 1   -> Unit-Variance (UV) scaling
scaling_object_uv = ChemometricsScaler(scale_power=1)

# scale_power = 0.5 -> Pareto scaling
scaling_object_par = ChemometricsScaler(scale_power=0.5)

# scale_power = 0   -> Mean Centring
scaling_object_mc = ChemometricsScaler(scale_power=0)

In [None]:
# Create and fit PLS-DA model: 2 components, the unit-variance scaler as the
# X scaler, fitted on the spectra X with Y1 as the response
pls_da = ChemometricsPLSDA(n_components=2, x_scaler=scaling_object_uv)
pls_da.fit(X, Y1)

In [None]:
# Plot the scores of the fitted model.
# NOTE(review): color=Y1 with discrete=True presumably colours each sample by
# its class code, and label_outliers=True presumably annotates outlying
# samples — confirm against ChemometricsPLSDA.plot_scores.
pls_da.plot_scores(color=Y1, discrete=True, label_outliers=True, plot_title=None)

In [None]:
# Plot one set of model parameters for the requested component.
# `parameter` selects what is shown: 'w' for weights, 'p' for loadings,
# 'ws' for X rotations (rotated version of w).
# This cell plots the loadings ('p').
pls_da.plot_model_parameters(parameter='p', component=1)

In [None]:
# Plot one set of model parameters for the requested component.
# `parameter` selects what is shown: 'w' for weights, 'p' for loadings,
# 'ws' for X rotations (rotated version of w).
# This cell plots the weights ('w').
pls_da.plot_model_parameters(parameter='w', component=1)

# 2) Model selection

In [None]:
# Scree plot for X and Y1 with total_comps=10, as part of model selection.
# NOTE(review): presumably evaluates models with 1..10 components — confirm
# against ChemometricsPLSDA.scree_plot.
pls_da.scree_plot(X, Y1, total_comps=10)

In [None]:
# Repeated cross_validation on X and Y1 (repeats=5, total_comps=10); the
# result is kept in rep_cv.
# NOTE(review): the structure of the returned object is not visible here.
rep_cv = pls_da.repeated_cv(X, Y1, repeats=5, total_comps=10)

## Outlier detection

In [None]:
# Score plot with outlier labelling enabled, followed by the model's outlier
# check on X. The outlier() call is the cell's last expression, so its return
# value (if any) is displayed as the cell output.
# NOTE(review): presumably this returns the indices of outlying samples —
# confirm against ChemometricsPLSDA.outlier.
pls_da.plot_scores(label_outliers=True)
pls_da.outlier(X)

In [None]:
# Row indices of the samples to exclude.
# NOTE(review): hard-coded; presumably taken from the outlier check above — confirm.
pca_outliers = np.array([36, 100, 106, 113, 117])

# Drop those rows from the spectra and from both label vectors.
# Caution: X, Y1 and Y2 are overwritten, so re-running this cell without
# reloading the data removes further rows.
X, Y1, Y2 = (np.delete(arr, pca_outliers, axis=0) for arr in (X, Y1, Y2))

In [None]:
# Repeat the scree plot (total_comps=10) on X and Y1 after the outlier rows
# have been removed
pls_da.scree_plot(X, Y1, total_comps=10)

In [None]:
# Repeated cross_validation (repeats=5, total_comps=10) on X and Y1 after the
# outlier rows have been removed; overwrites the earlier rep_cv result
rep_cv = pls_da.repeated_cv(X, Y1, repeats=5, total_comps=10)

## Refit the model

In [None]:
# Refit the model with the selected number of components (4), again using the
# unit-variance scaler, on the X and Y1 with the outlier rows removed.
# This replaces the earlier 2-component pls_da object.
pls_da = ChemometricsPLSDA(n_components=4, x_scaler=scaling_object_uv)
pls_da.fit(X, Y1)

In [None]:
# Score plot of the refitted model.
# NOTE(review): color=Y1 with discrete=True presumably colours each sample by
# its class code — confirm against ChemometricsPLSDA.plot_scores.
pls_da.plot_scores(color=Y1, discrete=True)

In [None]:
# Cross-validated ROC curve: run cross-validation on X and Y1, then plot.
# NOTE(review): plot_cv_ROC takes no arguments, so it presumably relies on
# results stored on the model by cross_validation — keep this call order.
pls_da.cross_validation(X, Y1)
pls_da.plot_cv_ROC()