In [None]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from natsort import natsorted

In [None]:
# Gather every CSV file in the current working directory; the cells below
# read each of these files as one spectrum.
csv_pattern = "*.csv"
spectra = glob.glob(csv_pattern)
spectra

In [None]:
# Preview the file names in natural sort order; this is the order every
# later cell iterates in. `spectra` itself is left unchanged.
natsorted(spectra)

In [None]:
# Overlay all raw spectra on a single axes, one line per CSV file.
fig, ax = plt.subplots()
for csv_name in natsorted(spectra):
    # The first row of each file is a header; replace whatever names it
    # carries with the two column names used throughout the notebook.
    frame = pd.read_csv(csv_name, header=0).set_axis(
        ["logM", "Intensity"], axis=1
    )
    ax.plot(frame["logM"], frame["Intensity"], label=csv_name)
ax.set_xlabel("logM")
ax.legend()
ax.set_ylabel("Intensity")

In [None]:
# Find the overall logM range covered by all spectra.
#
# Start from +/- infinity so the first file always sets both bounds. Fixed
# sentinels such as 0 and 10 silently produce a wrong range whenever every
# logM value lies on the same side of them (e.g. all values above 10).
x_min, x_max = np.inf, -np.inf
for spectrum in natsorted(spectra):
    data = pd.read_csv(spectrum, header=0)
    data.columns = ["logM", "Intensity"]
    # Series.min()/max() skip NaN. If a column is entirely NaN they return
    # NaN, which min()/max() below ignore because comparisons with NaN are
    # False and the current bound (first argument) is kept.
    x_min = min(x_min, data["logM"].min())
    x_max = max(x_max, data["logM"].max())

# Fail early with a clear message instead of passing an infinite range on
# to the resampling grid (happens when no file or no finite value is found).
if not (np.isfinite(x_min) and np.isfinite(x_max)):
    raise ValueError("No finite logM values found in the spectra files.")


In [None]:
# Show the combined logM range found across all spectra.
x_min, x_max

In [None]:
# Resample every spectrum onto one shared logM grid so that each spectrum
# becomes a fixed-length row, ready to be stacked into a single table.
sorted_spectra = natsorted(spectra)  # sort once so rows and labels share one order

# Common grid: 1000 evenly spaced points, padded by 1 logM unit on each side.
x_resample = np.linspace(x_min - 1, x_max + 1, 1000)

resampled_rows = []
for spectrum in sorted_spectra:
    data = pd.read_csv(spectrum, header=0)
    data.columns = ["logM", "Intensity"]
    # np.interp requires increasing x-coordinates and does NOT check for it;
    # unsorted or descending input silently yields meaningless values.
    data = data.sort_values("logM")
    # Grid points outside this spectrum's own logM range take the intensity
    # of its first/last sample (np.interp's default left/right behaviour).
    y_resample = np.interp(x_resample, data["logM"], data["Intensity"])
    resampled_rows.append(y_resample)

# One row per spectrum; columns "0".."999" index the grid points of x_resample.
resampled_spectra = pd.DataFrame(
    resampled_rows, columns=[str(i) for i in range(len(x_resample))]
)
resampled_spectra["label"] = sorted_spectra

In [None]:
# Inspect the resampled table: one row per spectrum, one column per grid
# point, plus the "label" column holding the source file name.
resampled_spectra

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature matrix: every resampled intensity column, without the file label.
spectra_data = resampled_spectra.drop(columns="label")

# Standardise each grid point (column) to zero mean and unit variance.
sc = StandardScaler()
spectra_data_scaled = sc.fit_transform(spectra_data)

# Ask for up to 5 components, but never more than the data supports:
# PCA raises if n_components exceeds min(n_samples, n_features), which
# happens as soon as fewer than 5 spectra are loaded.
n_components = min(5, *spectra_data_scaled.shape)
pca = PCA(n_components=n_components)
pca_components = pca.fit_transform(spectra_data_scaled)

In [None]:
# Show the PCA scores: one row per spectrum, one column per principal component.
pca_components

In [None]:
# Scatter the spectra in the plane of the first two principal components.
fig, ax = plt.subplots()
# Label the axes so the figure is readable on its own: columns 0 and 1 of
# the PCA scores are the first and second principal components.
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
# Rows of pca_components follow the natural-sorted file order used when the
# resampled table was built, so enumerate over the same ordering here.
for i, spectrum in enumerate(natsorted(spectra)):
    ax.scatter(pca_components[i, 0], pca_components[i, 1], label=spectrum)
    # Annotate each point with the first four characters of its file name.
    ax.text(pca_components[i, 0], pca_components[i, 1], spectrum[:4])


In [None]:
# Share of the total variance captured by each principal component, and the
# running total over components in order.
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = explained_variance_ratio.cumsum()

In [None]:
# Two side-by-side panels: per-component variance (bars) on the left and
# cumulative variance (line) on the right, both in percent.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 3))

# Component numbers start at 1 for display.
x_pos = np.arange(1, len(explained_variance_ratio) + 1)

ax1.bar(x_pos, explained_variance_ratio * 100)
ax2.plot(x_pos, cumulative_variance_ratio * 100, marker="o", color="blue")

# Both panels share the same x label, grid and tick positions; only the
# y label differs.
panel_labels = (
    (ax1, "Variance ratio (%)"),
    (ax2, "Cumulative variance ratio (%)"),
)
for axis, y_label in panel_labels:
    axis.set_xlabel("Num. PC")
    axis.set_ylabel(y_label)
    axis.grid(True, alpha=0.3)
    axis.set_xticks(x_pos)

fig.tight_layout()