# Introduction

The original dataset has 3406 features (wavenumbers). Not all of these features are important, and many are highly correlated with each other. Hence, feature extraction is used to reduce them to a smaller set of informative features. This notebook performs the feature extraction.

## Loading Libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler, scale

In [None]:
# Use Times New Roman as the font family for all figure text.
plt.rc('font', family='Times New Roman')

## Loading Dataset

To run this notebook on another dataset, change the file name in the next cell to the corresponding dataset (Dataset P1, P2, ..., OFE, PFE).

In [None]:
# Workbook to process; its first column is used as the row index.
filename = 'Dataset O.xlsx'
df = pd.read_excel(io=filename, index_col=0)

# Preview the first rows.
df.head()

In [None]:
# The last column is the label; every other column is a feature.
X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

# One stratified shuffle split (20% held out) with a fixed seed.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
train_index, test_index = next(stratified_split.split(X, Y))

X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

# Report the shapes of the four resulting pieces.
split_shapes = (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
print('X_train: {}     Y_train: {} \nX_test: {}     Y_test: {}'.format(*split_shapes))

## Feature Extraction

### 1. PCA

Principal Component Analysis (PCA) is a dimensionality reduction technique used in machine learning and data analysis to transform a high-dimensional dataset into a lower-dimensional space while preserving as much variance (information) as possible.

In [None]:
# Cast every column label to str.
# NOTE(review): presumably so scikit-learn sees uniform string feature names — confirm.
df.columns=df.columns.astype(str)

In [None]:
# Standardize the predictors only. The label column(s) are dropped first so the
# target does not leak into the features that PCA and LDA are fitted on.
# errors='ignore' keeps this cell re-runnable whether or not 'Binary_Target'
# has already been added to df by a later cell.
feature_frame = df.drop(columns=['Target', 'Binary_Target'], errors='ignore')

scaler = StandardScaler()
standardized_data = scaler.fit_transform(feature_frame)

In [None]:
# n_components=None puts no explicit limit on the number of components here;
# how many to retain is decided in a later cell from the cumulative explained variance.
pca = PCA(n_components=None)
principal_components = pca.fit_transform(standardized_data)

In [None]:
# Scree Plot
explained_variance_ratio_cumsum = np.cumsum(pca.explained_variance_ratio_)

# Plot against 1-based component counts so the x-axis really is the
# "number of principal components" (the implicit x would start at 0 and be
# off by one when reading the curve).
component_counts = np.arange(1, len(explained_variance_ratio_cumsum) + 1)
plt.plot(component_counts, explained_variance_ratio_cumsum, label='Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.legend()

# Save the figure with 600 DPI as a JPEG image
plt.savefig('F3.jpg', dpi=600, format='jpg')

plt.show()

In [None]:
# Fraction of the total variance the retained components must explain (e.g., 95%).
cumulative_variance_threshold = 0.95

# argmax on the boolean mask returns the index of the first component at which
# the cumulative ratio reaches the threshold; +1 turns that index into a count.
threshold_reached = explained_variance_ratio_cumsum >= cumulative_variance_threshold
n_optimal_components = np.argmax(threshold_reached) + 1

In [None]:
# Show the number of components selected by the variance threshold.
print(n_optimal_components)

In [None]:
# Report the selected count together with the threshold expressed as a percentage.
print(f'Number of Principal Components to Explain {cumulative_variance_threshold * 100}% Variance: {n_optimal_components}')

In [None]:
# Combine principal components and target variable into a single DataFrame
pc_columns = [f'PC{i}' for i in range(1, n_optimal_components + 1)]
pc_df = pd.DataFrame(data=principal_components[:, :n_optimal_components], columns=pc_columns)

# pc_df gets a fresh 0..n-1 index while Y keeps the index read from the
# workbook. Assign positionally so pandas does not align on labels, which
# would produce NaNs or pair rows with the wrong target when the indexes differ.
pc_df['Target'] = Y.to_numpy()

# Create regression plots for each principal component, side by side in one row
plt.figure(figsize=(10, 6))
for position, pc_name in enumerate(pc_columns, start=1):
    plt.subplot(1, n_optimal_components, position)
    sns.regplot(x=pc_name, y='Target', data=pc_df, scatter_kws={'alpha': 0.5})
    plt.xlabel(pc_name)
    plt.ylabel('Target')

plt.tight_layout()

# Save the figure with 600 DPI as a JPEG image
plt.savefig('F4.jpg', dpi=600, format='jpg')

plt.show()

In [None]:
# Variance Explained Bar Plot
# Restrict both series to the retained components and number them from 1.
component_numbers = range(1, n_optimal_components + 1)
individual_ratio = pca.explained_variance_ratio_[:n_optimal_components]
cumulative_ratio = explained_variance_ratio_cumsum[:n_optimal_components]

# Bars show each component's own share; the red step line shows the running total.
plt.bar(component_numbers, individual_ratio,
        alpha=0.8, align='center', label='Individual Explained Variance')
plt.step(component_numbers, cumulative_ratio,
         where='mid', label='Cumulative Explained Variance', color='red')
plt.xlabel('Number of Principal Components')
plt.ylabel('Explained Variance Ratio')
plt.legend(loc='center right')

# Save the figure with 600 DPI as a JPEG image
plt.savefig('F5.jpg', dpi=600, format='jpg')

plt.show()

In [None]:
# Create a DataFrame with all optimal principal components
ofe_columns = [f'PC{i}' for i in range(1, n_optimal_components + 1)]
df_ofe = pd.DataFrame(principal_components[:, :n_optimal_components], columns=ofe_columns)

# Include the 'Target' column. df_ofe has a fresh 0..n-1 index while df keeps
# the index read from the workbook, so assign positionally; a label-aligned
# assignment would yield NaNs or mispaired targets when the indexes differ.
df_ofe['Target'] = df['Target'].to_numpy()

# Display the updated dataframe
df_ofe.head()

In [None]:
# Write the PCA-reduced dataset to an Excel workbook.
df_ofe.to_excel('Dataset OFE.xlsx')

### 2. LDA

Linear Discriminant Analysis (LDA) is a supervised dimensionality reduction and classification technique used in machine learning to find linear combinations of features that best separate classes in a dataset.

In [None]:
# Cut-off used to turn 'Target' into two classes (e.g., 0.5).
threshold = 0.5

# Overwrite 'Target' in place with 0/1 labels: 1 where the value exceeds the cut-off.
above_threshold = df['Target'] > threshold
df['Target'] = above_threshold.astype(int)

# Sanity check: list the distinct labels left after the conversion.
print("Unique values in 'Target' variable:", df['Target'].unique())

# Fit LDA on the standardized data with the binary labels and project the data.
lda = LDA(n_components=None)
lda_components = lda.fit_transform(standardized_data, df['Target'])

In [None]:
# LDA Scatter Plot
plt.figure(figsize=(8, 6))

# One scatter series per class. Every point is drawn at y = 0, so only the
# position along the discriminant axis separates the classes visually.
for class_value, class_label in ((0, 'Class 0'), (1, 'Class 1')):
    class_points = lda_components[df['Target'] == class_value]
    plt.scatter(class_points, np.zeros_like(class_points), label=class_label, alpha=0.8)

plt.xlabel('Linear Discriminant 1')
plt.title('LDA Scatter Plot')
plt.legend()
plt.show()

In [None]:
# Cut-off used for the binary labels (e.g., 0.5).
threshold = 0.5

# Store the 0/1 labels in a separate 'Binary_Target' column:
# 1 where 'Target' is strictly greater than the cut-off, otherwise 0.
df['Binary_Target'] = df['Target'].gt(threshold).astype(int)

# Sanity check: list the distinct labels in the new column.
print("Unique values in 'Binary_Target' variable:", df['Binary_Target'].unique())

In [None]:
# Fit LDA on the binary labels computed in the previous cell. Using
# 'Binary_Target' explicitly (rather than 'Target') means this cell no longer
# depends on an earlier cell having overwritten 'Target' in place.
lda = LDA(n_components=None)
lda_components = lda.fit_transform(standardized_data, df['Binary_Target'])

# Number of LDA components (columns of the projected data)
n_lda_components = lda_components.shape[1]

# Create a DataFrame with all LDA components, named LDA1, LDA2, ...
lda_columns = [f'LDA{i}' for i in range(1, n_lda_components + 1)]
df_lda = pd.DataFrame(lda_components, columns=lda_columns)

In [None]:
# Include the 'Target' column. df_lda has a fresh 0..n-1 index while df keeps
# the index read from the workbook, so assign positionally; a label-aligned
# assignment would yield NaNs or mispaired targets when the indexes differ.
df_lda['Target'] = df['Target'].to_numpy()

In [None]:
# Print a heading (preceded by a blank line) followed by the first rows of the LDA features.
print("\nLDA Features:", df_lda.head(), sep="\n")