In [1]:
# Load necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
from scipy.optimize import minimize
from scipy.optimize import curve_fit
from sklearn.decomposition import PCA

In [None]:
# Directory that holds the spectrum .txt files.
# It must end with a path separator: the glob pattern further down is built
# by plain string concatenation (data_dir + "*.txt").
data_dir = '/home/jwm/data/JASCO_FP_6500_Spectrofluorometer/2024april13_50mM_DAB2_418_630_150mM_NaCl_1DCVJ_slow_hysteresis/'

# First and last spectrum to use in the plot.
# NOTE(review): the original comment says "-1", so these are presumably
# zero-based (spectrum number minus one); neither name is referenced in the
# visible part of the notebook - confirm before relying on them.
first_spectrum = 0
last_spectrum = 30

# Range of titles to load.
# NOTE(review): not referenced in the visible part of the notebook.
start_title = 7
end_title = 30

# Position of the box.
# NOTE(review): presumably plot coordinates of an annotation box; not
# referenced in the visible part of the notebook - confirm.
box_x = 10
box_y = 300

# Placeholder; replaced by the titles parsed from the file names below.
titles = []

# Function to load (x, y) data from txt file starting from the 19th line
def load_data(file_path):
    """Read the numeric table of one spectrum file.

    The first 18 lines of the file are skipped, so parsing starts at the
    19th line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The array produced by ``np.loadtxt`` for the remaining lines.
    """
    header_line_count = 18
    table = np.loadtxt(file_path, skiprows=header_line_count)
    return table

# Get all txt files in the directory
data_files = glob.glob(data_dir + "*.txt")
if not data_files:
    # Fail early with a clear message instead of an IndexError further down
    raise FileNotFoundError(f"No .txt spectrum files found in {data_dir!r}")

# The title of a spectrum is the integer formed by the 7th- to 5th-last
# characters of the file name once the extension has been removed
titles = [int(file_path.split('/')[-1].split('.')[0][-7:-4]) for file_path in data_files]

print(titles)

# Order the files by ascending title and load them in that order, so that
# titles, data_files and loaded_data always stay index-aligned.
# A stable sort keeps the glob order for files that share a title.
sorted_indices = np.argsort(titles, kind="stable")
titles = [titles[i] for i in sorted_indices]
data_files = [data_files[i] for i in sorted_indices]
loaded_data = [load_data(file_path) for file_path in data_files]

# Extract x-values from the first spectrum in loaded_data
# (the first two rows are skipped)
wavelengths = loaded_data[0][2:, 0].astype(float)

# The first file whose *name* (not directory) contains "999" is the baseline.
# The baseline stays in loaded_data/titles so positional indexing elsewhere
# in the notebook is unchanged.
baseline_data = None
baseline_title = None
for baseline_index, file_path in enumerate(data_files):
    if "999" in file_path.split('/')[-1]:
        baseline_data = loaded_data[baseline_index]
        baseline_title = titles[baseline_index]
        break

# Assign a temperature to every entry by its position in the sorted list:
# starts at 10 and increases by 0.5 per entry (units are not stated here).
# NOTE(review): a baseline file, if present, also receives an entry.
temperatures = [10 + i * 0.5 for i in range(len(titles))]


# Sample loaded_data structure
# loaded_data = [
#    np.array([[470.0, 422.133], [470.1, 422.08], [470.2, 422.001], [650.0, 7.49391]]),
#    np.array([[470.0, 430.678], [470.1, 430.75], [470.2, 430.611], [650.0, 7.97855]])
#]

# Function to create DataFrame
def create_dataframe(loaded_data, titles):  # spectra in rows
    """Combine two-column spectra into one table keyed on wavelength.

    Args:
        loaded_data: Sequence of two-column arrays; column 0 is used as
            'wavelength', column 1 as the values of that spectrum.
        titles: Column label for each spectrum, in the same order as
            ``loaded_data``.

    Returns:
        A DataFrame with a 'wavelength' column followed by one column per
        spectrum, built by successive outer merges on 'wavelength'.
    """
    combined = pd.DataFrame(loaded_data[0], columns=['wavelength', titles[0]])

    # Fold every further spectrum into the running table
    for position, spectrum in enumerate(loaded_data[1:], start=1):
        column_frame = pd.DataFrame(spectrum, columns=['wavelength', titles[position]])
        combined = combined.merge(column_frame, on='wavelength', how='outer')

    return combined

# Example usage
# included_titles = [int(title) for title in input("Enter the titles to include (separated by space): ").split()]
# selected_data = [loaded_data[titles.index(title)] for title in included_titles]

# Ask which titles to analyse; a token is either a single number ("5") or an
# inclusive range written with a dash ("3-8").
title_inputs = input("Enter the titles to include (separated by space), and ranges separated by '-': ").split()

# Expand every token into the individual integer titles it stands for
included_titles = []
for token in title_inputs:
    if '-' in token:
        bounds = token.split('-')
        included_titles.extend(range(int(bounds[0]), int(bounds[-1]) + 1))
    else:
        included_titles.append(int(token))

def get_selected_data(loaded_data, titles_input, temperature_values=None):
    """Pick spectra and their temperatures by 1-based title.

    A title ``t`` addresses position ``t - 1`` of ``loaded_data`` and of the
    temperature list.

    Args:
        loaded_data: Sequence of spectra, ordered by title.
        titles_input: Iterable of selections. Each item is either a single
            title (int, or a string holding an int) or an inclusive range
            string such as ``"3-8"``.
        temperature_values: Temperatures aligned with ``loaded_data``.
            Defaults to the module-level ``temperatures`` list.

    Returns:
        A ``(selected_data, selected_temperatures)`` tuple of two lists in
        selection order.

    Raises:
        IndexError: If a title is smaller than 1 or larger than
            ``len(loaded_data)``. Previously title 0 and negative titles
            silently wrapped around to the end of the list.
        ValueError: If a selection cannot be parsed as an int or as a
            ``start-end`` range.
    """
    if temperature_values is None:
        # Backward-compatible default: the notebook's global temperature list
        temperature_values = temperatures

    spectrum_count = len(loaded_data)

    def _validated(title):
        # Reject titles that would wrap around or fall off the end
        if not 1 <= title <= spectrum_count:
            raise IndexError(
                f"Spectrum title {title} is outside the valid range 1-{spectrum_count}"
            )
        return title

    selected_data = []
    selected_temperatures = []
    for title_input in titles_input:
        # Only strings can describe a range; a negative int must not be
        # mistaken for one just because its text contains '-'
        if isinstance(title_input, str) and '-' in title_input:
            start, end = map(int, title_input.split('-'))
            _validated(start)
            _validated(end)
            selected_data.extend(loaded_data[start - 1:end])
            selected_temperatures.extend(temperature_values[start - 1:end])
        else:
            position = _validated(int(title_input)) - 1
            selected_data.append(loaded_data[position])
            selected_temperatures.append(temperature_values[position])
    return selected_data, selected_temperatures
    
# Pick the spectra (and their temperatures) named by the user
selected_data, selected_temperatures = get_selected_data(loaded_data, included_titles)

# Build the wavelength-keyed table, then transpose it so that every spectrum
# is one row; row 0 of the transposed frame is the 'wavelength' row.
df = create_dataframe(selected_data, included_titles)
df = df.transpose()
print(df)


# Number of principal components to extract
n_components=4

# Extract spectra and wavelengths
# Rows from 1 on are the spectra (row 0 holds the wavelengths). Columns from
# 2 on skip the first two wavelength points, the same offset used when
# `wavelengths` is taken from loaded_data[0][2:, 0].
# NOTE(review): this assumes all files share one wavelength grid - confirm.
spectra = df.iloc[1:, 2:].values.astype(float)  # Convert to numpy array
# wavelengths = df.iloc[1, 2:].values.astype(float)
# wavelengths = df.columns[2:].astype(float)

def perform_pca(df, n_components):
    """Fit a PCA model to the spectra held in a transposed spectra table.

    Args:
        df: DataFrame whose row 0 is the wavelength axis and whose remaining
            rows are one spectrum each. The first two columns (wavelength
            points) are excluded from the fit.
        n_components: Number of principal components to keep.

    Returns:
        A tuple ``(explained_variance_ratio, singular_values, components)``
        taken from the fitted ``sklearn.decomposition.PCA`` object.
    """
    # Only row 0 is the wavelength row. The previous slice started at row 2
    # and so silently left the first spectrum out of the fit, unlike the
    # module-level `spectra` matrix, which starts at row 1.
    spectra = df.values[1:, 2:].astype(float)

    # NOTE: per the scikit-learn documentation, PCA centres the input on its
    # per-feature mean before the decomposition.
    pca = PCA(n_components=n_components)
    pca.fit(spectra)

    return pca.explained_variance_ratio_, pca.singular_values_, pca.components_

# Run the PCA on the transposed spectra table and keep all three results
# for the plots and the scaling-factor fit below
explained_variance_ratio, singular_values, components = perform_pca(df, n_components)

print(explained_variance_ratio)

def plot_singular_values(explained_variance_ratio, singular_values):
    """Plot the singular value of every component against its number.

    Args:
        explained_variance_ratio: Not used by the plot; kept so existing
            calls keep working.
        singular_values: Sequence of singular values, one per component.
            Components are numbered from 1 on the x-axis.
    """
    component_numbers = range(1, len(singular_values) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(component_numbers, singular_values, marker='o', linestyle='-')
    plt.xlabel('Component', fontsize=12)
    plt.ylabel('Singular Value', fontsize=12)
    plt.title('Singular Values for Each Component', fontsize=14)
    plt.grid(True)
    plt.show()
    
def calculate_scaling_factors(df, n_components, components):
    """Find the component weights that best reconstruct every spectrum.

    For each spectrum ``s`` the weights ``x`` minimise
    ``sum((s - components.T @ x) ** 2)``. This is a linear least-squares
    problem, so it is solved exactly and for all spectra at once rather
    than with an iterative optimiser per spectrum.

    Args:
        df: Transposed spectra table: row 0 is the wavelength axis, every
            further row is one spectrum; the first two columns are skipped.
            The spectra are read from this argument (the function no longer
            depends on a global ``spectra`` variable).
        n_components: Number of components; must equal
            ``components.shape[0]``.
        components: Array of shape ``(n_components, n_points)``.

    Returns:
        Array of shape ``(n_spectra, n_components)``; row ``k`` holds the
        weights for spectrum ``k``.

    Raises:
        ValueError: If ``components`` is not 2-D, its row count differs from
            ``n_components``, or its width does not match the spectra.
    """
    components = np.asarray(components, dtype=float)
    if components.ndim != 2 or components.shape[0] != n_components:
        raise ValueError(
            f"components must have shape ({n_components}, n_points), "
            f"got {components.shape}"
        )

    # Same slice as the module-level `spectra` matrix
    spectra = df.iloc[1:, 2:].values.astype(float)
    if spectra.shape[0] == 0:
        return np.empty((0, n_components))

    # NOTE(review): the spectra are not mean-centred here, whereas PCA
    # components come from centred data - confirm this is intended.
    # Each column of spectra.T is one right-hand side of the same system.
    weights, *_ = np.linalg.lstsq(components.T, spectra.T, rcond=None)
    return weights.T


# Fit the component weights for every selected spectrum and show them;
# the result is plotted against temperature further down
scaling_factors = calculate_scaling_factors(df, n_components, components)
print(scaling_factors)


def plot_components_with_explained_variances(components, explained_variance, wavelengths):
    """Draw every principal component as a curve over the wavelength axis.

    Args:
        components: Iterable of component curves, one per row.
        explained_variance: Per-component value shown in the legend,
            formatted with two decimals.
        wavelengths: X-values shared by all component curves.
    """
    _, axes = plt.subplots(figsize=(12, 8))

    # One curve per component; the legend entry carries its variance value
    for i, component in enumerate(components):
        axes.plot(
            wavelengths,
            component,
            label=f'Component {i+1} (Explained Variance: {explained_variance[i]:.2f})',
        )

    axes.set_xlabel('Wavelength', fontsize=12)
    axes.set_ylabel('Intensity', fontsize=12)
    axes.set_title('Principal Components with Explained Variances', fontsize=14)
    axes.legend()
    axes.grid(True)
    plt.show()

# Plot the components. The legend is labelled "Explained Variance", so pass
# the explained-variance ratios rather than the singular values.
plot_components_with_explained_variances(components, explained_variance_ratio, wavelengths)

print(components)
print(wavelengths)


def plot_scaling_factors(scaling_factors, selected_temperatures, n_components):
    """Plot the weight of each component against temperature.

    Args:
        scaling_factors: One sequence of component weights per spectrum.
        selected_temperatures: X-values, one per spectrum.
        n_components: Number of component curves to draw.
    """
    _, axes = plt.subplots(figsize=(10, 6))

    for i in range(n_components):
        # Gather the weight of component i across all spectra
        weights_across_spectra = [factor[i] for factor in scaling_factors]
        axes.plot(selected_temperatures, weights_across_spectra, label=f'Component {i+1}')

    axes.set_xlabel('Temperature', fontsize=12)
    axes.set_ylabel('Scaling Factor', fontsize=12)
    axes.set_title('Scaling Factors for Each Spectrum', fontsize=14)
    axes.legend()
    axes.grid(True)
    plt.show()

# Show how the weight of each component changes with temperature
plot_scaling_factors(scaling_factors, selected_temperatures, n_components)




[16, 11, 29, 30, 17, 23, 22, 3, 12, 19, 6, 15, 28, 18, 5, 8, 25, 14, 7, 24, 4, 10, 2, 9, 27, 31, 26, 999, 20, 1, 21, 13]
