In [1]:
import os
import linecache

import random
import math
import copy
import sys
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from tqdm import tqdm
from tqdm import trange

from itertools import chain, cycle, combinations

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score, train_test_split
from sklearn.preprocessing import label_binarize, StandardScaler, OrdinalEncoder, LabelEncoder, OneHotEncoder
from joblib import parallel_backend

## Step 1: get 1x1 dataset

In [2]:
WORKING_PATH = r"C:\Users\zhao-\Jiaheng\BSA protein\data"
df = pd.read_csv(WORKING_PATH + r'\BSA-thiol_dataset_corrected.csv')
df

  df = pd.read_csv(WORKING_PATH + r'\BSA-thiol_dataset_corrected.csv')


Unnamed: 0,401,402,403,404,405,406,407,408,409,410,...,1795,1796,1797,1798,1799,label,conc,batch,original_filename,spectrum_id
0,0.070472,0.019269,0.049263,0.061706,0.092834,0.047193,0.043878,0.064576,0.047192,0.045215,...,0.576888,0.530230,0.537368,0.515348,0.554981,Cysteamine-BSA,0.0,,,
1,0.052520,0.015020,0.073222,0.129306,0.074067,0.051379,0.087747,0.126177,0.070515,0.067214,...,0.721957,0.759064,0.741285,0.725900,0.732966,Cysteamine-BSA,0.0,,,
2,0.058144,0.061932,0.067105,0.079980,0.103823,0.079320,0.041960,0.079235,0.040840,0.004985,...,0.682711,0.688279,0.659778,0.666903,0.655693,Cysteamine-BSA,0.0,,,
3,0.041245,0.078781,0.064747,0.100347,0.141697,0.052391,0.059124,0.002798,0.032093,0.088595,...,0.550651,0.579228,0.591954,0.533540,0.564195,Cysteamine-BSA,0.0,,,
4,0.044159,0.025640,0.080267,0.119340,0.137719,0.097517,0.151911,0.150072,0.083216,0.125389,...,0.646457,0.653475,0.597139,0.597022,0.600918,Cysteamine-BSA,0.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489,0.295345,0.458984,0.336862,0.368042,0.370117,0.278304,0.328628,0.192342,0.201891,0.291367,...,0.860684,1.088699,0.905192,0.788690,0.743406,Bare-BSA,0.0,blank,blank.csv-Corrected-Normalized_by_Mean.csv,blank_56
1490,0.039019,0.012957,0.080528,0.208564,0.243892,0.091088,0.076445,0.177045,0.339447,0.376123,...,1.068873,0.373567,1.000725,0.902617,0.594679,Bare-BSA,0.0,blank,blank.csv-Corrected-Normalized_by_Mean.csv,blank_57
1491,0.421393,0.543113,0.370042,0.364926,0.254257,0.251780,0.498346,0.613888,0.519034,0.187689,...,0.971034,1.093031,0.890048,0.900299,0.867759,Bare-BSA,0.0,blank,blank.csv-Corrected-Normalized_by_Mean.csv,blank_58
1492,0.297975,0.191096,0.285573,0.328852,0.507006,0.384780,0.196296,0.159697,0.270925,0.279471,...,0.881148,1.126206,1.056573,0.852025,1.067942,Bare-BSA,0.0,blank,blank.csv-Corrected-Normalized_by_Mean.csv,blank_59


#### Read the whole dataset.
We assume the dataset is **one csv file** containing all the spectra.

This csv file must satisfy:
1. Every row is a spectrum.
2. Every column except for the last two represents a wavenumber.
3. The header of the last two columns are ```Label``` (virus type) and ```Conc``` (concentration).

If you have multiple txt files or csv files, please use corresponding code **(to be done)** to process it first.

Please remember to change the path in ```r' '```, and the path should begin with ```\```.

In [None]:
wavenumbers = np.array([int(x) for x in df.columns.tolist()[:-2]])
wavenumbers

In [None]:
labels = np.unique(df['label'])
labels

### Dataset split

In [5]:
# set the number of total iterations (groups)
iteration_number = 10
START = 1 # Included

# Adjust the proportions of test set (default = 1/3).
proportion_test = 1/3

In [7]:
for i in range(START, iteration_number+START):
    # split (train & val) and test.
    train = df.groupby(['label','conc'], group_keys=False).apply(lambda x: x.sample(frac=1-proportion_test))
    test = df.loc[df.index.difference(train.index)]
    # train.shape, test.shape
    
    # create subfolder if not exists
    subfolder_path = os.path.join(WORKING_PATH, f"BSAML101_test_train_dataset", f"train_test_set{i}")
    os.makedirs(subfolder_path, exist_ok=True)

    # save train & test to csv
    train.to_csv(os.path.join(subfolder_path, f"train_dataset{i}.csv"), index=False)
    test.to_csv(os.path.join(subfolder_path, f"test_dataset{i}.csv"), index=False)

### Save the train and test to 4 files for 4 substrates

In [8]:
for i in range(START, iteration_number+START):
    # train
    original_file_path = f"{WORKING_PATH}\\BSAML101_test_train_dataset\\train_test_set{i}\\train_dataset{i}.csv"

    df = pd.read_csv(original_file_path)

    # Get the name of the second to last column
    label_column = df.columns[-2]

    # Determine the path of the original file
    directory = os.path.dirname(original_file_path)

    # Create a new folder named 'split' in the same directory as the original file
    split_folder = os.path.join(directory, 'train_split')
    os.makedirs(split_folder, exist_ok=True)

    # Group the data by the label column
    grouped = df.groupby(label_column)

    # Iterate over each group and save to a separate CSV file in the 'split' folder
    for label, group in grouped:
        # Create a valid file name by replacing invalid characters
        label_name = str(label).replace('/', '_').replace('\\', '_')

        # Define the new file path
        new_file_path = os.path.join(split_folder, f'{label_name}.csv')

        # Save the group to a CSV file
        group.to_csv(new_file_path, index=False)

    print(f"The {i}-th TRAINING CSV files have been created for each label and saved in the 'train_split' directory.")
    
    
    # test
    original_file_path = f"{WORKING_PATH}\\BSAML101_test_train_dataset\\train_test_set{i}\\test_dataset{i}.csv"
    df = pd.read_csv(original_file_path)

    # Get the name of the second to last column
    label_column = df.columns[-2]

    # Determine the path of the original file
    directory = os.path.dirname(original_file_path)

    # Create a new folder named 'split' in the same directory as the original file
    split_folder = os.path.join(directory, 'test_split')
    os.makedirs(split_folder, exist_ok=True)

    # Group the data by the label column
    grouped = df.groupby(label_column)

    # Iterate over each group and save to a separate CSV file in the 'split' folder
    for label, group in grouped:
        # Create a valid file name by replacing invalid characters
        label_name = str(label).replace('/', '_').replace('\\', '_')

        # Define the new file path
        new_file_path = os.path.join(split_folder, f'{label_name}.csv')

        # Save the group to a CSV file
        group.to_csv(new_file_path, index=False)

    print(f"The {i}-th TESTING CSV files have been created for each label and saved in the 'test_split' directory.")

The 1-th TRAINING CSV files have been created for each label and saved in the 'train_split' directory.
The 1-th TESTING CSV files have been created for each label and saved in the 'test_split' directory.
The 2-th TRAINING CSV files have been created for each label and saved in the 'train_split' directory.
The 2-th TESTING CSV files have been created for each label and saved in the 'test_split' directory.
The 3-th TRAINING CSV files have been created for each label and saved in the 'train_split' directory.
The 3-th TESTING CSV files have been created for each label and saved in the 'test_split' directory.
The 4-th TRAINING CSV files have been created for each label and saved in the 'train_split' directory.
The 4-th TESTING CSV files have been created for each label and saved in the 'test_split' directory.
The 5-th TRAINING CSV files have been created for each label and saved in the 'train_split' directory.
The 5-th TESTING CSV files have been created for each label and saved in the 'tes

## Step 2: Generate super spectra

In [9]:
# draw from the spectra with replacement, but without duplicate
# note: without replacement = every spectra will only appear at most once, this will result in at most 40 spectra for training, 20 spectra for testing
# with replacement = spectra may appear more than once. If without duplicate, this will result in at most 40*40 spectra for training, 20*20 spectra for testing
def draw_with_replacement(*dataframes, conc_levels, num_super_spectra):
    super_spectra_list = []

    # Iterate over each concentration level
    for conc in conc_levels:
        generated_spectra = set()  # Set to keep track of generated spectra to avoid duplicates
        count = 0

        while count < num_super_spectra:
            current_spectra = []
            all_spectra_present = True

            for df in dataframes:
                # Filter the dataframe for the current concentration
                df_conc = df[df['conc'] == conc].reset_index(drop=True)

                # Check if any dataframe does not contain the concentration
                if df_conc.empty:
                    all_spectra_present = False
                    print(f"No spectra found for concentration {conc} in one of the dataframes.")
                    break  # Break out of the inner loop

                # Draw a random index with replacement
                chosen_index = np.random.choice(df_conc.index, 1)[0]

                # Select the spectrum without the last two columns ('Label', 'Conc')
                spectrum = df_conc.iloc[chosen_index, :-2].values
                current_spectra.extend(spectrum)

            # Convert list to tuple for immutability and set operations
            current_spectra_tuple = tuple(current_spectra)

            # If a spectrum was found in each dataframe for the current concentration
            if all_spectra_present:
                if current_spectra_tuple not in generated_spectra:
                    generated_spectra.add(current_spectra_tuple)
                    # Assume all dataframes have the same Label for the same concentration
                    label = df_conc.iloc[0]['label']
                    super_spectra_list.append(list(current_spectra_tuple) + [label, conc])
                    count += 1

    # Determine the column names for the super spectrum dataframe
    col_names = []
    for i, df in enumerate(dataframes, 1):
        col_names.extend([f'wavenumber_{i}_{col}' for col in range(len(df.columns) - 2)])
    col_names.extend(['label', 'conc'])

    # Combine all the super spectra into a single DataFrame
    super_spectra_df = pd.DataFrame(super_spectra_list, columns=col_names)
    
    return super_spectra_df

### Output all possible combinations

In [10]:
def save_all_combinations(START, iteration_number, WORKING_PATH, conc_levels):
    for i in range(START, iteration_number + START):
        # Initialize dictionaries to store training and testing DataFrames
        train_dfs = {}
        test_dfs = {}

        # List of modification types
        modifications = ["Bare-BSA", "Cysteamine-BSA", "Cysteine-BSA", "MCH-BSA"]
        
        # Loop through each modification and load its CSV into a DataFrame for both training and testing
        for modification in modifications:
            # Process train DataFrames
            train_path = os.path.join(WORKING_PATH, f"BSAML101_test_train_dataset\\train_test_set{i}\\train_split", f"{modification}.csv")
            train_dfs[modification] = pd.read_csv(train_path)
            
            # Process test DataFrames
            test_path = os.path.join(WORKING_PATH, f"BSAML101_test_train_dataset\\train_test_set{i}\\test_split", f"{modification}.csv")
            test_dfs[modification] = pd.read_csv(test_path)

        # Process and save combinations of 2, 3, and 4 modifications
        for num_mods in [2, 3, 4]:
            for phase, dfs in [('train', train_dfs), ('test', test_dfs)]:
                # Iterate over combinations of modifications
                for combo in combinations(dfs.items(), num_mods):
                    # Concatenate the DataFrames along the columns
                    dataframes = [df.iloc[:, :-2] for mod, df in combo[:-1]] + [dfs[combo[-1][0]]]  # Include all columns for the last DataFrame
                    combined_df = pd.concat(dataframes, axis=1)
                    
                    # Clean and sort modification names for the filename
                    mods_clean = sorted([mod.replace('-BSA', '') for mod, _ in combo])
                    mods_filename = '_'.join(mods_clean)
                    
                    # Define file path for saving the CSV
                    filename = os.path.join(WORKING_PATH, f"Superspectra20250705/{num_mods}x{num_mods}_{i}", f"{mods_filename}_{phase}{i}.csv")
                    os.makedirs(os.path.dirname(filename), exist_ok=True)
                    combined_df.to_csv(filename, index=False)
                    print(f"Saved: {filename}")


In [11]:
# Example usage
START = 1
iteration_number = 10
conc_levels = [0, 0.05, 0.001, 0.00002, 4e-7, 8e-9]

save_all_combinations(START, iteration_number, WORKING_PATH, conc_levels)

Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Bare_Cysteamine_train1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Bare_Cysteine_train1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Bare_MCH_train1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Cysteamine_Cysteine_train1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Cysteamine_MCH_train1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Cysteine_MCH_train1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Bare_Cysteamine_test1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Bare_Cysteine_test1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Bare_MCH_test1.csv
Saved: C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107/2x2_1\Cysteamine_Cysteine_test1.csv
Saved: C

## Step 3: regression models

### 1x1 single spectra

In [12]:
# set the number of total iterations (groups)
iteration_number = 5
START = 1 # Included

In [13]:
for i in range(START, START+iteration_number):
    modification_combination_list = ['Bare-BSA', 'Cysteamine-BSA', 'Cysteine-BSA', 'MCH-BSA']
    
    for modification in modification_combination_list:
        # Step 1: reformat the train and test sets
        # train
        file_path = f"{WORKING_PATH}\\BSAML101_test_train_dataset\\train_test_set{i}\\train_dataset{i}.csv"

        # Read the CSV file into a DataFrame
        train = pd.read_csv(file_path)

#         # Display the first few rows of the data to verify
#         print(train.head())

        # Select training data with a specific label
        selected_label = train['label'].iloc[0] # always choose the first label. This label has no use, but just to ensure that the code can run.
        selected_data_train = train[train['label'] == selected_label] # split training data
        # selected_data_train

        # Get the indices of the selected data
        selected_indices_train = selected_data_train.index
        # selected_indices_train

        # Get new X_train and y_train_reg
        X_train_split = selected_data_train.values[:,:-2]
        y_train_reg_split = np.log10(selected_data_train.values[:,-1].astype(float)+1e-20) # we only need the regression part.

        # test
        tfile_path = f"{WORKING_PATH}\\BSAML101_test_train_dataset\\train_test_set{i}\\test_dataset{i}.csv"

        # Read the CSV file into a DataFrame
        test = pd.read_csv(tfile_path)

#         # Display the first few rows of the data to verify
#         print(test.head())

        # Similarly, select testing data with the specific label
        selected_data_test = test[test['label'] == selected_label] # split testing data
        # selected_data_test

        # Get the indices of the selected data
        selected_indices_test = selected_data_test.index
        # selected_indices_test

        # Get new X_test and y_test_reg
        X_test_split = selected_data_test.values[:,:-2]
        y_test_reg_split = np.log10(selected_data_test.values[:,-1].astype(float)+1e-20) # we only need the regression part.
     
    
        # Step 2: SVR calculation
        # Create a svr
        svr = svm.SVR(kernel = 'rbf', C = 1000, epsilon = 0.1)
        # sklearn.svr only accept label vectors as input, not one-hot vectors.
        y_train_reg_split_label = y_train_reg_split
        with parallel_backend('threading', n_jobs=-1):
            # Fit the SVR model
            svr.fit(X_train_split, y_train_reg_split_label)
        svr_pred = svr.predict(X_test_split)
        
        # Comparing results
        y_test_reg_split_label = y_test_reg_split

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, svr_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, svr_pred))
        R_squared = r2_score(y_test_reg_split_label, svr_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)
        
        # New directory where you want to save the file
        save_directory = f"{WORKING_PATH}\\Superspectra20250705\\Results\\1x1_{i}_FigureData"
        os.makedirs(save_directory, exist_ok=True)
#         # Extract the base filename from the original file_path
#         base_filename = os.path.basename(file_path)

        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False)
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, svr_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)
        
        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': svr_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"SVR plot data saved to {save_file_path}")

        
        # Step 3: RF calculation
        # Create an rf
        rf = RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=None, n_jobs=-1, verbose=0)
        rf.fit(X_train_split, y_train_reg_split_label)
        rf_pred = rf.predict(X_test_split)

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, rf_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, rf_pred))
        R_squared = r2_score(y_test_reg_split_label, rf_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)
       
        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False)
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, rf_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)

        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': rf_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"RF plot data saved to {save_file_path}")


MAE: 1.2609676979141937
RMSE: 1.942515770299907
R^2: 0.8536016429220932
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\1x1_1_FigureData\Bare-BSA_svr1.csv
MAE: 1.4281223364611553
RMSE: 2.152962050300328
R^2: 0.8201626701292901
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\1x1_1_FigureData\Bare-BSA_rf1.csv
MAE: 1.2609676979141937
RMSE: 1.942515770299907
R^2: 0.8536016429220932
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\1x1_1_FigureData\Cysteamine-BSA_svr1.csv
MAE: 1.3896189031422765
RMSE: 2.0745481293218053
R^2: 0.833023971920293
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\1x1_1_FigureData\Cysteamine-BSA_rf1.csv
MAE: 1.2609676979141937
RMSE: 1.942515770299907
R^2: 0.8536016429220932
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\1x1_1_FigureData\Cysteine-BSA_sv

### 2x2 superspectra

In [14]:
# set the number of total iterations (groups)
iteration_number = 10
START = 1 # Included

In [15]:
from sklearn.impute import SimpleImputer

def preprocess_data(df):
    # Check if any NaNs are in the DataFrame
    if df.isna().any().any():
        print("NaN values found in the DataFrame. Imputing with median values...")
        # Impute NaN values using median, which is robust to outliers
        imputer = SimpleImputer(strategy='median')
        df.iloc[:, :-2] = imputer.fit_transform(df.iloc[:, :-2])  # Assume last two columns are 'label' and 'conc'
    return df

for i in range(START, START+iteration_number):
    modification_combination_list = ['Bare_Cysteamine', 'Bare_Cysteine', 'Bare_MCH', 'Cysteamine_Cysteine', 'Cysteamine_MCH', 'Cysteine_MCH']
    
    for modification in modification_combination_list:
        # Step 1: reformat the train and test sets
        # train
        file_path = f"{WORKING_PATH}\\Superspectra20250705\\2x2_{i}\\{modification}_train{i}.csv"

        # Read the CSV file into a DataFrame
        train = pd.read_csv(file_path)
        train = preprocess_data(train)

#         # Display the first few rows of the data to verify
#         print(train.head())

        # Select training data with a specific label
        selected_label = train['label'].iloc[0] # always choose the first label. This label has no use, but just to ensure that the code can run.
        selected_data_train = train[train['label'] == selected_label] # split training data
        # selected_data_train

        # Get the indices of the selected data
        selected_indices_train = selected_data_train.index
        # selected_indices_train

        # Get new X_train and y_train_reg
        X_train_split = selected_data_train.values[:,:-2]
        y_train_reg_split = np.log10(selected_data_train.values[:,-1].astype(float)+1e-20) # we only need the regression part.

        # test
        tfile_path = f"{WORKING_PATH}\\Superspectra20250705\\2x2_{i}\\{modification}_test{i}.csv"  # Replace with the path to your CSV file

        # Read the CSV file into a DataFrame
        test = pd.read_csv(tfile_path)
        test = preprocess_data(test)

#         # Display the first few rows of the data to verify
#         print(test.head())

        # Similarly, select testing data with the specific label
        selected_data_test = test[test['label'] == selected_label] # split testing data
        # selected_data_test

        # Get the indices of the selected data
        selected_indices_test = selected_data_test.index
        # selected_indices_test

        # Get new X_test and y_test_reg
        X_test_split = selected_data_test.values[:,:-2]
        y_test_reg_split = np.log10(selected_data_test.values[:,-1].astype(float)+1e-20) # we only need the regression part.
     
    
        # Step 2: SVR calculation
        # Create a svr
        svr = svm.SVR(kernel = 'rbf', C = 1000, epsilon = 0.1)
        # sklearn.svr only accept label vectors as input, not one-hot vectors.
        y_train_reg_split_label = y_train_reg_split
        with parallel_backend('threading', n_jobs=-1):
            # Fit the SVR model
            svr.fit(X_train_split, y_train_reg_split_label)
        svr_pred = svr.predict(X_test_split)
        
        # Comparing results
        y_test_reg_split_label = y_test_reg_split

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, svr_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, svr_pred))
        R_squared = r2_score(y_test_reg_split_label, svr_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)
        
        # New directory where you want to save the file
        save_directory = f"{WORKING_PATH}\\Superspectra20250705\\Results\\2x2_{i}_FigureData"
        os.makedirs(save_directory, exist_ok=True)
        
#         # Extract the base filename from the original file_path
#         base_filename = os.path.basename(file_path)

        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False) 
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, svr_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)
        
        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': svr_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"SVR plot data saved to {save_file_path}")

        
        # Step 3: RF calculation
        # Create an rf
        rf = RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=None, n_jobs=-1, verbose=0)
        rf.fit(X_train_split, y_train_reg_split_label)
        rf_pred = rf.predict(X_test_split)

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, rf_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, rf_pred))
        R_squared = r2_score(y_test_reg_split_label, rf_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)
        
        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False)
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, rf_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)

        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': rf_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"RF plot data saved to {save_file_path}")


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.2091488778761783
RMSE: 4.698540119177747
R^2: 0.1625249136157637
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Bare_Cysteamine_svr1.csv
MAE: 2.1136249553807556
RMSE: 3.8974566299995854
R^2: 0.4237531032253342
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Bare_Cysteamine_rf1.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.131583159431404
RMSE: 3.4404062800580495
R^2: 0.5719569547010224
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Bare_Cysteine_svr1.csv
MAE: 0.26346159624534776
RMSE: 0.48621122015580137
R^2: 0.9914509484647269
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Bare_Cysteine_rf1.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 1.7573360047063409
RMSE: 2.4491600292669395
R^2: 0.7659060967110038
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Bare_MCH_svr1.csv
MAE: 0.9170010580127841
RMSE: 1.6270387991853386
R^2: 0.8966878032956691
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Bare_MCH_rf1.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.7237195774136683
RMSE: 1.1126384118605248
R^2: 0.9552311354337956
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_1_FigureData\Cysteamine_Cysteine_svr1.csv
MAE: 0.28448919299389835
RMSE: 0.507929051843465
R^2: 0.9906701622951576
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA prote

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.9926292485727473
RMSE: 4.634458869806313
R^2: 0.18521301773482235
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Bare_Cysteamine_svr2.csv
MAE: 1.9297284860726425
RMSE: 3.4315585957825836
R^2: 0.5532867281710134
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Bare_Cysteamine_rf2.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.785080786114189
RMSE: 4.115909262272806
R^2: 0.3873681584871288
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Bare_Cysteine_svr2.csv
MAE: 0.22014975856571897
RMSE: 0.36959009165346995
R^2: 0.9950602084707517
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Bare_Cysteine_rf2.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.092849153641896
RMSE: 3.6472036303052335
R^2: 0.48087039096181017
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Bare_MCH_svr2.csv
MAE: 1.289994766817996
RMSE: 3.0507508945870487
R^2: 0.6367804035165032
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Bare_MCH_rf2.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.5909456610088448
RMSE: 0.7973410709736206
R^2: 0.977009085203676
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_2_FigureData\Cysteamine_Cysteine_svr2.csv
MAE: 0.17995796276888598
RMSE: 0.313495460403723
R^2: 0.9964458930336816
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.451194890769983
RMSE: 4.115719817131831
R^2: 0.3574045868914083
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Bare_Cysteamine_svr3.csv
MAE: 1.6375087663641077
RMSE: 2.6862716762020145
R^2: 0.7262551968226505
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Bare_Cysteamine_rf3.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.2568158450352165
RMSE: 3.498195969315563
R^2: 0.557456209098417
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Bare_Cysteine_svr3.csv
MAE: 0.2798988113597196
RMSE: 0.47359827146311545
R^2: 0.99188874230941
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Bare_Cysteine_rf3.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.098739672594697
RMSE: 3.523606407183968
R^2: 0.5154589690020623
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Bare_MCH_svr3.csv
MAE: 1.0233099200746187
RMSE: 2.063739569023016
R^2: 0.8337867814030829
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Bare_MCH_rf3.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.5817349534190344
RMSE: 0.8682933001579952
R^2: 0.9727352900671086
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_3_FigureData\Cysteamine_Cysteine_svr3.csv
MAE: 0.2444612542660358
RMSE: 0.44604941753068683
R^2: 0.9928049480358597
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.052978992698543
RMSE: 4.720914625478685
R^2: 0.1545297889034003
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Bare_Cysteamine_svr4.csv
MAE: 1.5449543088920792
RMSE: 2.3528704760563306
R^2: 0.7899889928242533
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Bare_Cysteamine_rf4.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.290694910480159
RMSE: 3.453646360962349
R^2: 0.568656048254769
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Bare_Cysteine_svr4.csv
MAE: 0.25558953070383683
RMSE: 0.4648752358674596
R^2: 0.9921847872501172
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Bare_Cysteine_rf4.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 1.9461783052675719
RMSE: 3.212794859922565
R^2: 0.5971700340409246
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Bare_MCH_svr4.csv
MAE: 0.970673334579205
RMSE: 2.1024708687293714
R^2: 0.8274894137286068
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Bare_MCH_rf4.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6822536636281177
RMSE: 0.8520262733965134
R^2: 0.9737473012079886
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_4_FigureData\Cysteamine_Cysteine_svr4.csv
MAE: 0.24846260273798684
RMSE: 0.4660354853121496
R^2: 0.9921457276868582
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protei

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.42764062294453
RMSE: 5.041327624138663
R^2: 0.03586941180205061
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Bare_Cysteamine_svr5.csv
MAE: 1.7527939146263805
RMSE: 2.6823305738702743
R^2: 0.7270578445086151
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Bare_Cysteamine_rf5.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.1290923593435243
RMSE: 3.228677616354889
R^2: 0.6230208356664346
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Bare_Cysteine_svr5.csv
MAE: 0.2495686801181841
RMSE: 0.457121533488163
R^2: 0.9924433146080542
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Bare_Cysteine_rf5.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.161415573620431
RMSE: 3.8781005229310224
R^2: 0.4130597385179662
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Bare_MCH_svr5.csv
MAE: 1.220528429844393
RMSE: 2.367057255877205
R^2: 0.7813380065279852
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Bare_MCH_rf5.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6375614581783585
RMSE: 0.8643004488557632
R^2: 0.9729854673444361
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_5_FigureData\Cysteamine_Cysteine_svr5.csv
MAE: 0.23807503378697986
RMSE: 0.3922007846059938
R^2: 0.9944373094888537
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.184391448637248
RMSE: 4.731855716456666
R^2: 0.15060636030556795
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Bare_Cysteamine_svr6.csv
MAE: 2.0192222943049356
RMSE: 2.867460811027193
R^2: 0.6880815962185854
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Bare_Cysteamine_rf6.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.1888177287698825
RMSE: 3.1484828654502053
R^2: 0.6415152792828673
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Bare_Cysteine_svr6.csv
MAE: 0.22782584889389376
RMSE: 0.3786512027591967
R^2: 0.9948150250443251
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Bare_Cysteine_rf6.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.321358602390296
RMSE: 3.4708210373396158
R^2: 0.5298675600542533
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Bare_MCH_svr6.csv
MAE: 1.17113881545009
RMSE: 2.3975776936108435
R^2: 0.7756628719173189
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Bare_MCH_rf6.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.511425970629577
RMSE: 0.685950566158311
R^2: 0.9829841511370891
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_6_FigureData\Cysteamine_Cysteine_svr6.csv
MAE: 0.18599099609895356
RMSE: 0.3742958619527901
R^2: 0.9949336168196626
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\d

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.293537259933572
RMSE: 4.737882487888053
R^2: 0.14844130641388953
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Bare_Cysteamine_svr7.csv
MAE: 1.6444023481870231
RMSE: 2.2338413522662495
R^2: 0.8106999756861075
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Bare_Cysteamine_rf7.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.137798378131083
RMSE: 2.916318273180853
R^2: 0.6924343574299167
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Bare_Cysteine_svr7.csv
MAE: 0.2789909242373047
RMSE: 0.4902262257006743
R^2: 0.9913091738234496
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Bare_Cysteine_rf7.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.4187817277604373
RMSE: 3.611552642134751
R^2: 0.4909696512038627
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Bare_MCH_svr7.csv
MAE: 1.0641391593423266
RMSE: 1.8706413114759552
R^2: 0.8634358143575083
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Bare_MCH_rf7.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.73890349124987
RMSE: 1.0445442412955017
R^2: 0.9605432182262453
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_7_FigureData\Cysteamine_Cysteine_svr7.csv
MAE: 0.22600232199352052
RMSE: 0.4011744193761947
R^2: 0.9941798463887395
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.0906145432991763
RMSE: 4.504706758929403
R^2: 0.23019794237978608
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Bare_Cysteamine_svr8.csv
MAE: 1.8204690471865266
RMSE: 2.7136388612031452
R^2: 0.7206490726868557
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Bare_Cysteamine_rf8.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.3002807505390783
RMSE: 3.293533289074354
R^2: 0.6077236763999396
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Bare_Cysteine_svr8.csv
MAE: 0.23530817905317078
RMSE: 0.4025577083958936
R^2: 0.9941396402617424
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Bare_Cysteine_rf8.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.0795137612103995
RMSE: 3.4647159872782165
R^2: 0.5315199977781782
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Bare_MCH_svr8.csv
MAE: 1.1963289428968324
RMSE: 2.561702848785278
R^2: 0.7438978175202215
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Bare_MCH_rf8.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.616578430239674
RMSE: 0.7969510870139477
R^2: 0.9770315696727027
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_8_FigureData\Cysteamine_Cysteine_svr8.csv
MAE: 0.22781698871613631
RMSE: 0.4022584231930303
R^2: 0.994148350898419
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.0216449599212747
RMSE: 4.488131860571227
R^2: 0.23585243564391523
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Bare_Cysteamine_svr9.csv
MAE: 1.719772774361029
RMSE: 2.692061384209798
R^2: 0.7250739235662458
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Bare_Cysteamine_rf9.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.491124872520748
RMSE: 3.643911030087969
R^2: 0.5198206150239393
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Bare_Cysteine_svr9.csv
MAE: 0.24169439953223298
RMSE: 0.46972770992978113
R^2: 0.9920207817737853
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Bare_Cysteine_rf9.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.0653614347014746
RMSE: 3.5269386785307018
R^2: 0.5145420756373655
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Bare_MCH_svr9.csv
MAE: 1.287603487545659
RMSE: 3.059677541572897
R^2: 0.6346516969512438
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Bare_MCH_rf9.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6650502750743384
RMSE: 0.8919093688873752
R^2: 0.9712320156698183
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_9_FigureData\Cysteamine_Cysteine_svr9.csv
MAE: 0.24718713067773032
RMSE: 0.48202885633408116
R^2: 0.9915973928904999
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA prote

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 3.158129762546185
RMSE: 4.677286233038254
R^2: 0.17008442842632931
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Bare_Cysteamine_svr10.csv
MAE: 2.0135853541404503
RMSE: 2.6042302617723094
R^2: 0.7427207389459461
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Bare_Cysteamine_rf10.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.7293801132268336
RMSE: 3.9872261033803076
R^2: 0.4250769651619757
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Bare_Cysteine_svr10.csv
MAE: 0.2879280888368856
RMSE: 0.5096620410123128
R^2: 0.9906063892541225
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Bare_Cysteine_rf10.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 2.2946860868584924
RMSE: 3.662563862347412
R^2: 0.4764885448957631
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Bare_MCH_svr10.csv
MAE: 1.309697592494323
RMSE: 3.284484146063358
R^2: 0.5789922244696688
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Bare_MCH_rf10.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.5895248102963022
RMSE: 0.7923815631198247
R^2: 0.9772942053607913
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\2x2_10_FigureData\Cysteamine_Cysteine_svr10.csv
MAE: 0.23202981771249162
RMSE: 0.42790554000377334
R^2: 0.9933783869187732
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA 

### 3x3 superspectra

In [16]:
# set the number of total iterations (groups)
iteration_number = 10
START = 1 # Included

In [17]:
for i in range(START, START+iteration_number):
    modification_combination_list = ['Bare_Cysteamine_Cysteine', 'Bare_Cysteamine_MCH', 'Bare_Cysteine_MCH', 'Cysteamine_Cysteine_MCH']
    
    for modification in modification_combination_list:
        # Step 1: reformat the train and test sets
        # train
        file_path = f"{WORKING_PATH}\\Superspectra20250705\\3x3_{i}\\{modification}_train{i}.csv"

        # Read the CSV file into a DataFrame
        train = pd.read_csv(file_path)
        train = preprocess_data(train)

#         # Display the first few rows of the data to verify
#         print(train.head())

        # Select training data with a specific label
        selected_label = train['label'].iloc[0] # always choose the first label. This label has no use, but just to ensure that the code can run.
        selected_data_train = train[train['label'] == selected_label] # split training data
        # selected_data_train

        # Get the indices of the selected data
        selected_indices_train = selected_data_train.index
        # selected_indices_train

        # Get new X_train and y_train_reg
        X_train_split = selected_data_train.values[:,:-2]
        y_train_reg_split = np.log10(selected_data_train.values[:,-1].astype(float)+1e-20) # we only need the regression part.

        # test
        tfile_path = f"{WORKING_PATH}\\Superspectra20250705\\3x3_{i}\\{modification}_test{i}.csv"  # Replace with the path to your CSV file

        # Read the CSV file into a DataFrame
        test = pd.read_csv(tfile_path)
        test = preprocess_data(test)

#         # Display the first few rows of the data to verify
#         print(test.head())

        # Similarly, select testing data with the specific label
        selected_data_test = test[test['label'] == selected_label] # split testing data
        # selected_data_test

        # Get the indices of the selected data
        selected_indices_test = selected_data_test.index
        # selected_indices_test

        # Get new X_test and y_test_reg
        X_test_split = selected_data_test.values[:,:-2]
        y_test_reg_split = np.log10(selected_data_test.values[:,-1].astype(float)+1e-20) # we only need the regression part.
     
    
        # Step 2: SVR calculation
        # Create a svr
        svr = svm.SVR(kernel = 'rbf', C = 1000, epsilon = 0.1)
        # sklearn.svr only accept label vectors as input, not one-hot vectors.
        y_train_reg_split_label = y_train_reg_split
        with parallel_backend('threading', n_jobs=-1):
            # Fit the SVR model
            svr.fit(X_train_split, y_train_reg_split_label)
        svr_pred = svr.predict(X_test_split)
        
        # Comparing results
        y_test_reg_split_label = y_test_reg_split

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, svr_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, svr_pred))
        R_squared = r2_score(y_test_reg_split_label, svr_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)
        
        # New directory where you want to save the file
        save_directory = f"{WORKING_PATH}\\Superspectra20250705\\Results\\3x3_{i}_FigureData"
        os.makedirs(save_directory, exist_ok=True)
        
#         # Extract the base filename from the original file_path
#         base_filename = os.path.basename(file_path)

        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False) 
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, svr_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)
        
        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': svr_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"SVR plot data saved to {save_file_path}")

        
        # Step 3: RF calculation
        # Create an rf
        rf = RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=None, n_jobs=-1, verbose=0)
        rf.fit(X_train_split, y_train_reg_split_label)
        rf_pred = rf.predict(X_test_split)

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, rf_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, rf_pred))
        R_squared = r2_score(y_test_reg_split_label, rf_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)
        
        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False)
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, rf_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)

        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': rf_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"RF plot data saved to {save_file_path}")


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.103009271684485
RMSE: 3.1399892603184694
R^2: 0.6434468258754898
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Bare_Cysteamine_Cysteine_svr1.csv
MAE: 0.2899642037838865
RMSE: 0.5040048229979135
R^2: 0.9908137689089521
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Bare_Cysteamine_Cysteine_rf1.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.69618576295662
RMSE: 2.3910175133474145
R^2: 0.7768888414482639
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Bare_Cysteamine_MCH_svr1.csv
MAE: 1.2072195467912712
RMSE: 3.0633698678182877
R^2: 0.6337693823498349
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Bare_Cysteamine_MCH_rf1.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.7357221821981454
RMSE: 2.4412516021913526
R^2: 0.7674154513957
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Bare_Cysteine_MCH_svr1.csv
MAE: 0.5950113984290158
RMSE: 0.8978321551549006
R^2: 0.968540914949394
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Bare_Cysteine_MCH_rf1.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.7294801263801544
RMSE: 1.0549184102637996
R^2: 0.9565696309016287
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureData\Cysteamine_Cysteine_MCH_svr1.csv
MAE: 0.5315282088539408
RMSE: 0.8664301576469
R^2: 0.970703017100568
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_1_FigureDat

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.661482400304473
RMSE: 3.681582192572316
R^2: 0.509840998010173
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Bare_Cysteamine_Cysteine_svr2.csv
MAE: 0.2086450370123225
RMSE: 0.35147218582667156
R^2: 0.995532650749486
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Bare_Cysteamine_Cysteine_rf2.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8602266534959195
RMSE: 3.19716827950802
R^2: 0.6010791203584617
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Bare_Cysteamine_MCH_svr2.csv
MAE: 1.1164908475600708
RMSE: 2.4039376177113567
R^2: 0.7744711195372905
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Bare_Cysteamine_MCH_rf2.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.907229078918932
RMSE: 2.8734238081847034
R^2: 0.6777780298400995
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Bare_Cysteine_MCH_svr2.csv
MAE: 0.47805688068008556
RMSE: 0.7124104179542997
R^2: 0.9801931095050598
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Bare_Cysteine_MCH_rf2.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6679097749037741
RMSE: 0.9283513216263146
R^2: 0.9663658468735894
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_FigureData\Cysteamine_Cysteine_MCH_svr2.csv
MAE: 0.42428265530433634
RMSE: 0.6605597574213765
R^2: 0.9829713585492542
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_2_

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.9167484988183765
RMSE: 2.9641679640565797
R^2: 0.6822587497131165
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Bare_Cysteamine_Cysteine_svr3.csv
MAE: 0.27131040382024907
RMSE: 0.45930577380632076
R^2: 0.9923709266339032
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Bare_Cysteamine_Cysteine_rf3.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.7407727885850985
RMSE: 2.8319127622586056
R^2: 0.687020769547359
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Bare_Cysteamine_MCH_svr3.csv
MAE: 0.829026484241953
RMSE: 1.5402235158144941
R^2: 0.9074186995862245
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Bare_Cysteamine_MCH_rf3.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8482362369633198
RMSE: 2.8900473501733948
R^2: 0.6740389609578654
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Bare_Cysteine_MCH_svr3.csv
MAE: 0.5891072506979113
RMSE: 0.9148996593395095
R^2: 0.967333492206393
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Bare_Cysteine_MCH_rf3.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.5526220281629198
RMSE: 0.828467801788106
R^2: 0.9732140505025767
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_FigureData\Cysteamine_Cysteine_MCH_svr3.csv
MAE: 0.5274044952511818
RMSE: 0.8865746580220333
R^2: 0.9693248706001042
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_3_Fig

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.5522327688701267
RMSE: 3.5275573976859826
R^2: 0.549996222262306
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Bare_Cysteamine_Cysteine_svr4.csv
MAE: 0.2740418178430358
RMSE: 0.5098707476164924
R^2: 0.9905986943117642
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Bare_Cysteamine_Cysteine_rf4.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.9793542982964916
RMSE: 3.022570634962301
R^2: 0.6434596433919151
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Bare_Cysteamine_MCH_svr4.csv
MAE: 0.8603567076650023
RMSE: 1.6517118600079232
R^2: 0.8935307115898341
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Bare_Cysteamine_MCH_rf4.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8247833328745955
RMSE: 2.611115815895156
R^2: 0.733922563492349
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Bare_Cysteine_MCH_svr4.csv
MAE: 0.45641762153213117
RMSE: 0.6495564999162216
R^2: 0.9835339419184179
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Bare_Cysteine_MCH_rf4.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.8247616324325411
RMSE: 1.1670058608730636
R^2: 0.946850173483743
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_FigureData\Cysteamine_Cysteine_MCH_svr4.csv
MAE: 0.43862217868823583
RMSE: 0.6649633038081827
R^2: 0.982743562787019
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_4_Fig

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.104692764800888
RMSE: 3.05045589442854
R^2: 0.6634903818600364
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Bare_Cysteamine_Cysteine_svr5.csv
MAE: 0.23861312092711368
RMSE: 0.43481868361513737
R^2: 0.9931627040989179
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Bare_Cysteamine_Cysteine_rf5.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.168417491515877
RMSE: 3.375334721537968
R^2: 0.5553795120909579
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Bare_Cysteamine_MCH_svr5.csv
MAE: 0.9549452231328913
RMSE: 1.9805244695008148
R^2: 0.8469207936114226
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Bare_Cysteamine_MCH_rf5.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8930930347900436
RMSE: 2.916671912173301
R^2: 0.6680054642701652
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Bare_Cysteine_MCH_svr5.csv
MAE: 0.5577430274179943
RMSE: 0.7798893447845644
R^2: 0.9762632229470086
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Bare_Cysteine_MCH_rf5.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6997162936812378
RMSE: 1.0436821008888768
R^2: 0.9574898881348124
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_FigureData\Cysteamine_Cysteine_MCH_svr5.csv
MAE: 0.47312242429782964
RMSE: 0.7173063946394428
R^2: 0.9799199318691685
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_5_F

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.2725800589304566
RMSE: 3.1065452593957326
R^2: 0.6510016672961096
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Bare_Cysteamine_Cysteine_svr6.csv
MAE: 0.20973893164809773
RMSE: 0.36889657779772206
R^2: 0.9950787295284272
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Bare_Cysteamine_Cysteine_rf6.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.292300446623379
RMSE: 3.1466204038111147
R^2: 0.6135934443267037
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Bare_Cysteamine_MCH_svr6.csv
MAE: 1.0261050453623737
RMSE: 1.8946637477543729
R^2: 0.8599058282604766
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Bare_Cysteamine_MCH_rf6.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.6507153115393611
RMSE: 2.18003362573689
R^2: 0.814526374289369
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Bare_Cysteine_MCH_svr6.csv
MAE: 0.500309934198108
RMSE: 0.7365961262780154
R^2: 0.9788254281231439
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Bare_Cysteine_MCH_rf6.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.5521449844885933
RMSE: 0.7306165137545502
R^2: 0.979167818759851
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_FigureData\Cysteamine_Cysteine_MCH_svr6.csv
MAE: 0.4503335253200612
RMSE: 0.7130388601951736
R^2: 0.9801581493851176
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_6_Figure

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.3430740879619822
RMSE: 3.064011786299426
R^2: 0.6604929126448469
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Bare_Cysteamine_Cysteine_svr7.csv
MAE: 0.2775394123636554
RMSE: 0.4788560324702041
R^2: 0.9917076445940306
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Bare_Cysteamine_Cysteine_rf7.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.923317758066436
RMSE: 2.832457737769137
R^2: 0.6869002980200549
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Bare_Cysteamine_MCH_svr7.csv
MAE: 0.7757121046458768
RMSE: 1.3202628937220595
R^2: 0.9319737372738172
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Bare_Cysteamine_MCH_rf7.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.927250906052135
RMSE: 2.5060472304229067
R^2: 0.7549050966179165
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Bare_Cysteine_MCH_svr7.csv
MAE: 0.5066009787774792
RMSE: 0.7425174577972197
R^2: 0.9784836244863065
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Bare_Cysteine_MCH_rf7.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6822297096363513
RMSE: 0.9346803299140688
R^2: 0.965905683895868
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_FigureData\Cysteamine_Cysteine_MCH_svr7.csv
MAE: 0.5133928689639166
RMSE: 0.7807113598823583
R^2: 0.9762131587367304
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_7_Fig

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.3601540980893634
RMSE: 3.147353533951389
R^2: 0.6417724034259088
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Bare_Cysteamine_Cysteine_svr8.csv
MAE: 0.23604224140664087
RMSE: 0.41012110984553773
R^2: 0.9939173583749585
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Bare_Cysteamine_Cysteine_rf8.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8379459704955208
RMSE: 2.809465423887332
R^2: 0.6919628048692494
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Bare_Cysteamine_MCH_svr8.csv
MAE: 0.8913810338988027
RMSE: 1.6581503454750772
R^2: 0.892699044702269
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Bare_Cysteamine_MCH_rf8.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.873190910829678
RMSE: 2.6374468845845063
R^2: 0.7285291383819339
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Bare_Cysteine_MCH_svr8.csv
MAE: 0.5752075731778655
RMSE: 0.8098566420570495
R^2: 0.9744040016291319
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Bare_Cysteine_MCH_rf8.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.637090776440324
RMSE: 0.8483548306028035
R^2: 0.9719126444952636
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_FigureData\Cysteamine_Cysteine_MCH_svr8.csv
MAE: 0.5012986254669697
RMSE: 0.795192497502545
R^2: 0.9753225474873938
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_8_Figu

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.387365065906686
RMSE: 3.2916493764053123
R^2: 0.6081723149358496
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Bare_Cysteamine_Cysteine_svr9.csv
MAE: 0.2656360952502729
RMSE: 0.4845175272733742
R^2: 0.9915104051283355
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Bare_Cysteamine_Cysteine_rf9.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8960776312155645
RMSE: 2.9550089752901965
R^2: 0.6592205594294958
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Bare_Cysteamine_MCH_svr9.csv
MAE: 1.0524348126810252
RMSE: 2.1909614692135735
R^2: 0.812662268471085
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Bare_Cysteamine_MCH_rf9.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.0972121545573468
RMSE: 3.070278678650599
R^2: 0.6321156014712257
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Bare_Cysteine_MCH_svr9.csv
MAE: 0.5648248148610459
RMSE: 0.8413829851958258
R^2: 0.9723723956716058
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Bare_Cysteine_MCH_rf9.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.6808967910612427
RMSE: 0.9831710533311045
R^2: 0.9622763293953407
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_FigureData\Cysteamine_Cysteine_MCH_svr9.csv
MAE: 0.4539653932778659
RMSE: 0.7231977862275221
R^2: 0.979588733499395
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_9_Fig

  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.703025420425835
RMSE: 3.738562972458092
R^2: 0.49455095269720817
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Bare_Cysteamine_Cysteine_svr10.csv
MAE: 0.26966402425548613
RMSE: 0.46111879326618643
R^2: 0.9923105792220336
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Bare_Cysteamine_Cysteine_rf10.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.0327553782058967
RMSE: 3.0838644873925047
R^2: 0.6288526631190338
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Bare_Cysteamine_MCH_svr10.csv
MAE: 1.1479720195861767
RMSE: 2.7693195142143514
R^2: 0.7007033133762302
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Bare_Cysteamine_MCH_rf10.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.1613510842468027
RMSE: 2.9894893850407915
R^2: 0.651221417422801
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Bare_Cysteine_MCH_svr10.csv
MAE: 0.43400261945735424
RMSE: 0.6736427943125962
R^2: 0.9822901405570359
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Bare_Cysteine_MCH_rf10.csv
NaN values found in the DataFrame. Imputing with median values...
NaN values found in the DataFrame. Imputing with median values...
MAE: 0.7317920231140762
RMSE: 0.9591374530322574
R^2: 0.9640980963638202
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\3x3_10_FigureData\Cysteamine_Cysteine_MCH_svr10.csv
MAE: 0.45561687871054596
RMSE: 0.6984061222084024
R^2: 0.9809641684742179
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\

### 4x4 superspectra

In [18]:
# set the number of total iterations (groups)
iteration_number = 10
START = 1 # Included

In [19]:
for i in range(START, START+iteration_number):
    modification_combination_list = ['Bare_Cysteamine_Cysteine_MCH']
    
    for modification in modification_combination_list:
        # Step 1: reformat the train and test sets
        # train
        file_path = f"{WORKING_PATH}\\Superspectra20250705\\4x4_{i}\\{modification}_train{i}.csv"

        # Read the CSV file into a DataFrame
        train = pd.read_csv(file_path)
        train = preprocess_data(train)

#         # Display the first few rows of the data to verify
#         print(train.head())

        # Select training data with a specific label
        selected_label = train['label'].iloc[0] # always choose the first label. This label has no use, but just to ensure that the code can run.
        selected_data_train = train[train['label'] == selected_label] # split training data
        # selected_data_train

        # Get the indices of the selected data
        selected_indices_train = selected_data_train.index
        # selected_indices_train

        # Get new X_train and y_train_reg
        X_train_split = selected_data_train.values[:,:-2]
        y_train_reg_split = np.log10(selected_data_train.values[:,-1].astype(float)+1e-20) # we only need the regression part.

        # test
        tfile_path = f"{WORKING_PATH}\\Superspectra20250705\\4x4_{i}\\{modification}_test{i}.csv"  # Replace with the path to your CSV file

        # Read the CSV file into a DataFrame
        test = pd.read_csv(tfile_path)
        test = preprocess_data(test)

#         # Display the first few rows of the data to verify
#         print(test.head())

        # Similarly, select testing data with the specific label
        selected_data_test = test[test['label'] == selected_label] # split testing data
        # selected_data_test

        # Get the indices of the selected data
        selected_indices_test = selected_data_test.index
        # selected_indices_test

        # Get new X_test and y_test_reg
        X_test_split = selected_data_test.values[:,:-2]
        y_test_reg_split = np.log10(selected_data_test.values[:,-1].astype(float)+1e-20) # we only need the regression part.
     
    
        # Step 2: SVR calculation
        # Create a svr
        svr = svm.SVR(kernel = 'rbf', C = 1000, epsilon = 0.1)
        # sklearn.svr only accept label vectors as input, not one-hot vectors.
        y_train_reg_split_label = y_train_reg_split
        with parallel_backend('threading', n_jobs=-1):
            # Fit the SVR model
            svr.fit(X_train_split, y_train_reg_split_label)
        svr_pred = svr.predict(X_test_split)
        
        # Comparing results
        y_test_reg_split_label = y_test_reg_split

        
        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, svr_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, svr_pred))
        R_squared = r2_score(y_test_reg_split_label, svr_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)

        save_directory = f"{WORKING_PATH}\\Superspectra20250705\\Results\\4x4_{i}_FigureData"
        os.makedirs(save_directory, exist_ok=True)
        
        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False)
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, svr_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)
        
        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': svr_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_svr{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"SVR plot data saved to {save_file_path}")

        
        # Step 3: RF calculation
        # Create an rf
        rf = RandomForestRegressor(n_estimators=100, criterion="squared_error", max_depth=None, n_jobs=-1, verbose=0)
        rf.fit(X_train_split, y_train_reg_split_label)
        rf_pred = rf.predict(X_test_split)

        # Saving performances and pred vs real curve
        MAE = mean_absolute_error(y_test_reg_split_label, rf_pred)
        RMSE = np.sqrt(mean_squared_error(y_test_reg_split_label, rf_pred))
        R_squared = r2_score(y_test_reg_split_label, rf_pred)

        print("MAE:", MAE)
        print("RMSE:", RMSE)
        print("R^2:", R_squared)

        # Create a DataFrame with these metrics
        performance_metrics_df = pd.DataFrame({
            'MAE': [MAE],
            'RMSE': [RMSE],
            'R^2': [R_squared]
        })
        
        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_performance_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        performance_metrics_df.to_csv(save_file_path, index=False)
        
        # Actual and predicted values
#         plt.figure(figsize=(6, 6))
#         plt.scatter(y_test_reg_split, rf_pred, edgecolors="grey", linewidths=0.2, alpha = 0.3)
#         plt.axline([plt.xlim()[0], plt.xlim()[0]], [plt.xlim()[1], plt.xlim()[1]], linestyle='--', color="grey")
#         plt.xlabel("Actual log10(C)")
#         plt.ylabel("Predicted log10(C)")
#         plt.title(modification)

        plot_data_df = pd.DataFrame({
            'Actual_log10(C)': y_test_reg_split,
            'Predicted_log10(C)': rf_pred
        })

        # Construct the new file path with the same filename in the new directory
        save_file_path = os.path.join(save_directory, f"{modification}_rf{i}.csv")
        # Save the DataFrame to a CSV file in the new location with the same name
        plot_data_df.to_csv(save_file_path, index=False)

        print(f"RF plot data saved to {save_file_path}")


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.6115564998852203
RMSE: 2.2061109746861254
R^2: 0.8100626006753281
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_1_FigureData\Bare_Cysteamine_Cysteine_MCH_svr1.csv
MAE: 0.5932159643638169
RMSE: 0.8792681360253783
R^2: 0.9698283926696337
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_1_FigureData\Bare_Cysteamine_Cysteine_MCH_rf1.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.8111780710100756
RMSE: 2.6648899677908893
R^2: 0.722850345961717
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_2_FigureData\Bare_Cysteamine_Cysteine_MCH_svr2.csv
MAE: 0.4079347495131935
RMSE: 0.6417870811213342
R^2: 0.9839254909164699
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_2_FigureData\Bare_Cysteamine_Cysteine_MCH_rf2.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.5231561315719613
RMSE: 2.2963072295813465
R^2: 0.7942140295220824
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_3_FigureData\Bare_Cysteamine_Cysteine_MCH_svr3.csv
MAE: 0.5501503944099647
RMSE: 0.835961349389882
R^2: 0.972727297587638
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_3_FigureData\Bare_Cysteamine_Cysteine_MCH_rf3.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.857413558150811
RMSE: 2.541608402044091
R^2: 0.7478998802078431
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_4_FigureData\Bare_Cysteamine_Cysteine_MCH_svr4.csv
MAE: 0.48036216031445955
RMSE: 0.6767151915762235
R^2: 0.9821282274287327
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_4_FigureData\Bare_Cysteamine_Cysteine_MCH_rf4.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.7648783426670949
RMSE: 2.60194973691239
R^2: 0.7357873647709554
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_5_FigureData\Bare_Cysteamine_Cysteine_MCH_svr5.csv
MAE: 0.4833472340410921
RMSE: 0.7186815566564374
R^2: 0.9798428662909425
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_5_FigureData\Bare_Cysteamine_Cysteine_MCH_rf5.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.5447856250882839
RMSE: 1.9775543360557357
R^2: 0.8473795859700812
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_6_FigureData\Bare_Cysteamine_Cysteine_MCH_svr6.csv
MAE: 0.48068105744898454
RMSE: 0.7411423556231639
R^2: 0.9785632450190644
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_6_FigureData\Bare_Cysteamine_Cysteine_MCH_rf6.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.6294911632166258
RMSE: 2.125322908918064
R^2: 0.8237189518741322
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_7_FigureData\Bare_Cysteamine_Cysteine_MCH_svr7.csv
MAE: 0.46717427675195733
RMSE: 0.7440292535764513
R^2: 0.9783959188576171
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_7_FigureData\Bare_Cysteamine_Cysteine_MCH_rf7.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.6077790590211707
RMSE: 2.2266319492839375
R^2: 0.8065126174164158
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_8_FigureData\Bare_Cysteamine_Cysteine_MCH_svr8.csv
MAE: 0.5140026633508329
RMSE: 0.8218558177053303
R^2: 0.9736399005419976
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_8_FigureData\Bare_Cysteamine_Cysteine_MCH_rf8.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 1.755025881522555
RMSE: 2.4763544450614985
R^2: 0.760678680141083
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_9_FigureData\Bare_Cysteamine_Cysteine_MCH_svr9.csv
MAE: 0.5125361263780763
RMSE: 0.7625605918341166
R^2: 0.9773063426873821
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_9_FigureData\Bare_Cysteamine_Cysteine_MCH_rf9.csv


  train = pd.read_csv(file_path)


NaN values found in the DataFrame. Imputing with median values...


  test = pd.read_csv(tfile_path)


NaN values found in the DataFrame. Imputing with median values...
MAE: 2.028620025788081
RMSE: 2.731548009602658
R^2: 0.7088120136598731
SVR plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_10_FigureData\Bare_Cysteamine_Cysteine_MCH_svr10.csv
MAE: 0.44785629717103836
RMSE: 0.6613502698593456
R^2: 0.9829305767500988
RF plot data saved to C:\Users\zhao-\Jiaheng\BSA protein\data\Superspectra20231107\Results\4x4_10_FigureData\Bare_Cysteamine_Cysteine_MCH_rf10.csv
