# In [7]:
import pandas as pd
import numpy as np
import mne
import os
from pygsp import graphs, utils
from scipy.spatial import distance_matrix
from scipy.stats import entropy
import networkx as nx
from sklearn.cluster import spectral_clustering
def compute_total_variation(W, data_values):
    """Return the graph total variation of a signal.

    Computes ``sqrt(sum_{i, j} W[i, j] * ||x_j - x_i||**2)``, where ``x_i`` is
    the entry of ``data_values`` at index ``i`` along the first axis.

    Args:
        W: Weight matrix indexed as ``W[i, j]``; a dense array-like or an
            object exposing ``toarray()`` (e.g. a SciPy sparse matrix). Only
            the leading ``N x N`` corner is used, ``N = len(data_values)``.
        data_values: Array with one node signal per leading index. A 1-D
            array is treated as one scalar per node.

    Returns:
        The total variation as a NumPy float (``0.0`` for an empty signal).
    """
    X = np.asarray(data_values)
    if X.dtype.kind in "biu":
        # Avoid integer overflow when squaring differences.
        X = X.astype(float)
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_nodes = X.shape[0]
    feature_axes = tuple(range(1, X.ndim))

    if hasattr(W, "toarray"):
        W = W.toarray()
    W = np.asarray(W)

    total = 0.0
    for i in range(n_nodes):
        # Squared distance from node i to every node, computed in one
        # vectorised pass instead of an inner Python loop over j.
        sq_dist = np.sum(np.abs(X - X[i]) ** 2, axis=feature_axes)
        total += W[i, :n_nodes] @ sq_dist
    return np.sqrt(total)

# Location that is scanned for EEGLAB ``.set`` recordings.
# NOTE(review): this path ends in ``features_tv.csv``, which looks like a file
# name rather than a directory; os.scandir() raises if it is not a directory.
# Confirm the intended folder.
dir_path = r'C:\Users\xmoot\Desktop\VSCode\gsp-eeg-alz\features_tv.csv'

# os.scandir() yields entries in arbitrary order. The feature table built from
# this list is later joined to participants.tsv by row position, so sort the
# paths to make that order deterministic. The ``with`` block closes the
# directory iterator explicitly.
with os.scandir(dir_path) as entries:
    file_list = sorted(
        entry.path
        for entry in entries
        if entry.is_file() and entry.name.endswith(".set")
    )

n_files = len(file_list)
if n_files == 0:
    raise ValueError(f"No .set files found in directory {dir_path}.")

print(f'Found {n_files} .set files.')

# Channels read from every recording; also the column order of each table.
channel_names = ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T3', 'T4', 'T5', 'T6', 'Fz', 'Cz', 'Pz']

data_list = []  # reduced table of every recording, in file_list order
features = {}   # file name -> {feature name: value}

# Graph-construction constants (loop-invariant): Gaussian kernel width and
# the distance above which an edge weight is zeroed.
theta, k = 1.0, 1.0

for file in file_list:
    raw = mne.io.read_raw_eeglab(file)
    data = raw.get_data(picks=channel_names)
    # Transpose so that each channel becomes one column of the table.
    transposed_data = np.transpose(data)
    data = pd.DataFrame(transposed_data, columns=channel_names)
    # Collapse every run of 50 consecutive rows into its per-channel median.
    data = data.groupby(data.index // 50).median()
    data_list.append(data)

    # GSP analysis: rows of the reduced table are the graph nodes, connected
    # by a Gaussian kernel on their pairwise Euclidean distances.
    distances = distance_matrix(data.values, data.values)
    W = np.exp(-distances**2 / theta**2)
    W[distances > k] = 0
    np.fill_diagonal(W, 0)  # no self-loops
    G = graphs.Graph(W)
    L = G.L.toarray()
    eigenvalues, eigenvectors = np.linalg.eigh(L)
    # Project the signal onto the Laplacian eigenvectors.
    X_GdataT = eigenvectors.T @ data.values
    C = np.cov(X_GdataT)
    T = eigenvectors.T.conj() @ C @ eigenvectors
    # Share of T's Frobenius norm that sits on its diagonal.
    r = np.linalg.norm(np.diag(T)) / np.linalg.norm(T, 'fro')
    P = L @ data.values
    # Square of sum_ij X[i, j] * (L X)[i, j].
    Y = np.sum(data.values * P)**2
    TV = compute_total_variation(W, data.values)

    # Spectral Graph Features
    graph_energy = np.sum(np.abs(eigenvalues))
    # entropy() works column-wise and returns one value per eigenvector.
    # Storing that array in the feature table serialises it to CSV as a
    # string, and the plotting cells further down parse the string back and
    # keep only its first entry. Store that first entry directly as a float.
    # NOTE(review): confirm the first eigenvector's entropy is the intended
    # scalar summary.
    spectral_entropy = float(entropy(np.square(eigenvectors))[0])

    # Graph Signal Features
    signal_energy = np.sum(np.square(data.values))
    signal_power = np.var(data.values)

    # Graph Modularity and Community Structure
    labels = spectral_clustering(W)
    unique_labels = len(np.unique(labels))

    # Graph Degree Distribution
    degree_distribution = np.sum(W, axis=0)

    # Graph Diffusion Characteristics
    # NOTE(review): np.exp is element-wise, so these are the trace and the
    # total of exp(-L[i, j]) entry by entry, not of a matrix exponential.
    # Confirm that this is intended.
    heat_trace = np.trace(np.exp(-L))
    diffusion_distance = np.sum(np.exp(-L))

    # Aggregating Features
    features[os.path.basename(file)] = {
        'stationary_ratio': r,
        'Tik-norm': Y,
        'Total_Variation': TV,
        'graph_energy': graph_energy,
        'spectral_entropy': spectral_entropy,
        'signal_energy': signal_energy,
        'signal_power': signal_power,
        'unique_clusters': unique_labels,
        'avg_degree': np.mean(degree_distribution),
        'heat_trace': heat_trace,
        'diffusion_distance': diffusion_distance
    }

# One row per recording, one column per feature; the file name is the row label.
features_data = pd.DataFrame(features).T
features_data.to_csv('features_tv.csv', index_label='participant_id')

import pandas as pd
import numpy as np

# Load the per-recording feature table and the participant metadata table.
features = pd.read_csv("features_tv.csv")
participants = pd.read_csv("participants.tsv", sep='\t')
# Inner join on the row index of both tables (row k with row k).
# NOTE(review): this presumably relies on both files listing participants in
# the same order -- confirm, or join on an explicit identifier column.
data = pd.merge(features, participants, left_index=True, right_index=True)

from sklearn.preprocessing import RobustScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Robust-scale the two plotted features in place (fit on the whole table).
scaler = RobustScaler()
data[['stationary_ratio', 'Tik-norm']] = scaler.fit_transform(data[['stationary_ratio', 'Tik-norm']])

# Set the aesthetic style of the plot
sns.set(style="whitegrid", font_scale=1.2, rc={"grid.linewidth": 0.5})

# Create a color palette with three colors
palette = sns.color_palette("Set2", n_colors=3)

# Legend labels, listed in the same order as the hue levels below.
new_labels = ['AD', 'HC', 'FTD']

# Scatter plot only (fit_reg=False). hue_order pins the drawing order of the
# groups; without it the order follows first appearance in the data and the
# hard-coded legend labels could end up on the wrong group.
plot = sns.lmplot(data=data, x='stationary_ratio', y='Tik-norm', hue='Group',
                  hue_order=['A', 'C', 'F'], palette=palette,
                  fit_reg=False, legend=False, scatter_kws={'s': 50, 'alpha': 0.7}, height=5, aspect=1.15)

plot.set_axis_labels('Stationary Ratio', 'Tik-norm', fontsize=16)  # Set new axis labels and increase font size

# Draw the legend outside the axes with the readable group names
legend = plt.legend(title='Group', labels=new_labels, title_fontsize='16', fontsize='14', loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
legend.get_title().set_fontsize('16')  # Set the fontsize of the legend title

# Remove title
plt.title('')

# Adjust layout so legend does not get cut off
plt.tight_layout()

# Save the figure to disk (it is not displayed here)
plt.savefig('enhanced_plot.png', dpi=300, bbox_inches='tight')
from sklearn.preprocessing import RobustScaler
import seaborn as sns
import matplotlib.pyplot as plt

# Create a new column that maps 'A' and 'F' to the same value
data['Group_color'] = data['Group'].map({'A': 'Group1', 'F': 'Group1', 'C': 'Group2'})

# Robust-scale the two plotted features in place (fit on the whole table).
scaler = RobustScaler()
data[['stationary_ratio', 'Tik-norm']] = scaler.fit_transform(data[['stationary_ratio', 'Tik-norm']])

# Set the aesthetic style of the plot
sns.set(style="whitegrid", font_scale=1.2, rc={"grid.linewidth": 0.5})

# Create a color palette with two colors since there are two groups now
palette = sns.color_palette("Set2", n_colors=2)

# Legend labels, listed in the same order as the hue levels below.
new_labels = ['Dementia', 'HC']

# Scatter plot only (fit_reg=False). hue_order pins the drawing order so the
# hard-coded legend labels cannot be attached to the wrong group when the
# first row of the table happens to be a control.
plot = sns.lmplot(data=data, x='stationary_ratio', y='Tik-norm', hue='Group_color',
                  hue_order=['Group1', 'Group2'], palette=palette,
                  fit_reg=False, legend=False, scatter_kws={'s': 50, 'alpha': 0.7},
                  height=5, aspect=1.15)
plot.set_axis_labels('Stationary Ratio', 'Tik-norm', fontsize=16)  # Set new axis labels and increase font size

# Draw the legend outside the axes with the readable group names
legend = plt.legend(title='Group', labels=new_labels, title_fontsize='16', fontsize='14', loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
legend.get_title().set_fontsize('16')  # Set the fontsize of the legend title

# Remove title
plt.title('')

# Adjust layout so legend does not get cut off
plt.tight_layout()

# Save the plot to a file, then display it
plt.savefig('enhanced_plot.png', dpi=300, bbox_inches='tight')
plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler

# Columns of `data` that are robust-scaled before plotting.
columns_to_scale = ['stationary_ratio', 'Tik-norm', 'Total_Variation', 'graph_energy', 'spectral_entropy', 'signal_energy', 'signal_power', 'unique_clusters', 'avg_degree', 'heat_trace', 'diffusion_distance', 'MMSE', 'Age']

# Detect problematic columns (i.e., columns that contain strings which look like lists)
problematic_columns = [col for col in columns_to_scale if isinstance(data[col].iloc[0], str)]

# Convert those columns: keep the first number of each bracketed,
# whitespace-separated string. Parsing with float() instead of eval() avoids
# executing text that was read from a file; non-string cells pass through.
for col in problematic_columns:
    data[col] = data[col].apply(lambda x: float(x.strip('[]').split()[0]) if isinstance(x, str) else x)

# Create a robust scaler object
scaler = RobustScaler()

# Scale all columns
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Column name -> title fragment used in the figure heading.
features_to_plot = {
    'stationary_ratio': 'Stationary Ratio',
    'Tik-norm': 'Tik-norm',
    'Total_Variation': 'Total Variation',
    'graph_energy': 'Graph Energy',
    'spectral_entropy': 'Spectral Entropy',
    'signal_energy': 'Signal Energy',
    'signal_power': 'Signal Power',
    'unique_clusters': 'Unique Clusters',
    'avg_degree': 'Average Degree',
    'heat_trace': 'Heat Trace',
    'diffusion_distance': 'Diffusion Distance',
    'MMSE': 'MMSE',
    'Age': 'Age'
}

# Left-to-right order of the boxes on the x axis.
group_order = ['C', 'F', 'A']

# One figure per feature, with one box per group.
for feature, title in features_to_plot.items():
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Group", y=feature, data=data, order=group_order)
    plt.title(f'Box Plot of {title} by Group')
    plt.show()
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler

# Columns of `data` that are robust-scaled before plotting.
columns_to_scale = ['stationary_ratio', 'Tik-norm', 'Total_Variation', 'graph_energy', 'spectral_entropy', 'signal_energy', 'signal_power', 'unique_clusters', 'avg_degree', 'heat_trace', 'diffusion_distance', 'MMSE', 'Age']

# Detect problematic columns (i.e., columns that contain strings which look like lists)
problematic_columns = [col for col in columns_to_scale if isinstance(data[col].iloc[0], str)]

# Convert those columns: keep the first number of each bracketed,
# whitespace-separated string. Parsing with float() instead of eval() avoids
# executing text that was read from a file; non-string cells pass through.
for col in problematic_columns:
    data[col] = data[col].apply(lambda x: float(x.strip('[]').split()[0]) if isinstance(x, str) else x)

# Create a robust scaler object
scaler = RobustScaler()

# Scale all columns
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

# Column name -> y-axis label.
features_to_plot = {
    'stationary_ratio': 'Stationary Ratio',
    'Tik-norm': 'Tik-norm',
    'Total_Variation': 'Total Variation',
    'graph_energy': 'Graph Energy',
    'spectral_entropy': 'Spectral Entropy',
    'signal_energy': 'Signal Energy',
    'signal_power': 'Signal Power',
    'unique_clusters': 'Unique Clusters',
    'avg_degree': 'Average Degree',
    'heat_trace': 'Heat Trace',
    'diffusion_distance': 'Diffusion Distance',
    'MMSE': 'MMSE',
    'Age': 'Age'
}


# Replace the single-letter group codes with readable names. replace() leaves
# values that are already readable untouched, so re-running is harmless.
data['Group'] = data['Group'].replace({'C': 'HC', 'F': 'FTD', 'A': 'AD'})
group_order = ['HC', 'FTD', 'AD']

# One compact, title-less figure per feature with enlarged fonts.
for feature, title in features_to_plot.items():
    plt.figure(figsize=(6, 5))
    sns.boxplot(x="Group", y=feature, data=data, order=group_order)
    plt.ylabel(title, fontsize=20)
    plt.xlabel("Group", fontsize=20)
    plt.xticks(fontsize=18, rotation=0)
    plt.yticks(fontsize=18)
    plt.title("")
    plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
    plt.show()
import umap
import matplotlib.pyplot as plt
import seaborn as sns

# Feature columns fed to UMAP, mapped to display names.
features = {
    'stationary_ratio': 'Stationary Ratio',
    'Tik-norm': 'Tik-norm',
    'Total_Variation': 'Total Variation',
    'graph_energy': 'Graph Energy',
    'spectral_entropy': 'Spectral Entropy',
    'signal_energy': 'Signal Energy',
    'signal_power': 'Signal Power',
    'avg_degree': 'Average Degree',
    'diffusion_distance': 'Diffusion Distance',
}

selected_data = data[list(features.keys())]

# Create the UMAP object and fit_transform the data to get a 2D representation.
# UMAP is stochastic; the seed is pinned (42, as elsewhere in this file) so
# the embedding and the figure are reproducible between runs.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(selected_data)

# Fixed colour per group name.
color_palette = {"HC": "blue", "FTD": "red", "AD": "green"}

# Plot the UMAP representation
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=embedding[:, 0],
    y=embedding[:, 1],
    hue=data['Group'],
    palette=color_palette,
    s=60,       # marker size
    alpha=0.8,  # marker opacity
)
plt.gca().set_aspect('equal', 'datalim')
plt.title("")
plt.legend(loc="upper right", fontsize=16)
plt.xlabel('UMAP 1', fontsize=18)
plt.ylabel('UMAP 2', fontsize=18)
plt.gca().tick_params(axis='both', which='major', labelsize=16)
plt.show()
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import RobustScaler

# Feature columns fed to UMAP, mapped to display names.
features = {
    'stationary_ratio': 'Stationary Ratio',
    'Tik-norm': 'Tik-norm',
    'Total_Variation': 'Total Variation',
    'graph_energy': 'Graph Energy',
    'spectral_entropy': 'Spectral Entropy',
    'signal_energy': 'Signal Energy',
    'signal_power': 'Signal Power',
    'avg_degree': 'Average Degree',
    'diffusion_distance': 'Diffusion Distance',
}

color_mapping = {"AD": "#3498db", "HC": "#2ecc71", "FTD": "#e74c3c"}  # Blue, Green, Red


# Replace the single-letter group codes with readable names. replace() is
# used instead of map() because the column may already hold the readable
# names (an earlier cell relabels it); map() would turn every such value
# into NaN.
data['Group'] = data['Group'].replace({'A': 'AD', 'C': 'HC', 'F': 'FTD'})


selected_data = data[list(features.keys())]

# Create a robust scaler object and fit-transform the data
scaler = RobustScaler()
scaled_data = scaler.fit_transform(selected_data)

# Create the UMAP object and fit_transform the data to get a 2D representation.
# UMAP is stochastic; the seed is pinned so the saved figure is reproducible.
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(scaled_data)

# Set the aesthetic style of the plot
sns.set(style="whitegrid", font_scale=1.2, rc={"grid.linewidth": 0.5})

# Create a color palette with three colors
palette = sns.color_palette(["#3498db", "#2ecc71", "#e74c3c"])  # Blue, Green, Red

# Legend labels, which are also the hue levels in drawing order.
new_labels = ['AD', 'HC', 'FTD']

# Plot the UMAP representation. The dict palette binds each colour to a group
# name and hue_order pins the level order, so colours and legend labels do
# not depend on which group appears first in the data.
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=embedding[:, 0],
    y=embedding[:, 1],
    hue=data['Group'],
    hue_order=new_labels,
    palette=color_mapping,
    s=50,  # marker size
    alpha=0.7  # marker opacity
)
plt.gca().set_aspect('equal', 'datalim')

# Set new axis labels and increase font size
plt.xlabel('UMAP 1', fontsize=16)
plt.ylabel('UMAP 2', fontsize=16)

# Draw the legend outside the axes
legend = plt.legend(title='Group', labels=new_labels, title_fontsize='16', fontsize='14', loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
legend.get_title().set_fontsize('16')  # Set the fontsize of the legend title

# Remove title
plt.title('')

# Adjust layout so legend does not get cut off
plt.tight_layout()

# Save the plot
plt.savefig('umap_plot.png', dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

# Feature columns used by the classifiers below, mapped to display names.
features = {
    'stationary_ratio': 'Stationary Ratio',
    'Tik-norm': 'Tik-norm',
    'Total_Variation': 'Total Variation',
    'graph_energy': 'Graph Energy',
    'spectral_entropy': 'Spectral Entropy',
    'signal_energy': 'Signal Energy',
    'signal_power': 'Signal Power',
    'avg_degree': 'Average Degree',
    'diffusion_distance': 'Diffusion Distance',
}
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# The inputs are every column listed in `features`
X = data[list(features.keys())]

# The target is 'Group'
y = data['Group']

# Hold out 30% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining rows evenly into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Fit the scaler on the train set only and reuse it for the other splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Encode labels
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_val_int = le.transform(y_val)
y_test_int = le.transform(y_test)

# Hyperparameters
params = {
    'n_neighbors': [3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'canberra']
}

# 5-fold grid search on the train set
clf = KNeighborsClassifier(n_jobs=-1)
grid_search = GridSearchCV(clf, params, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_int)

best_clf = grid_search.best_estimator_

# Class probabilities do not depend on the class being scored, so compute
# them once per split instead of once per class inside the loop.
val_proba = best_clf.predict_proba(X_val_scaled)
test_proba = best_clf.predict_proba(X_test_scaled)

# One-vs-rest ROC AUC for every class on the validation and test sets
for label, group_name in enumerate(le.classes_):
    y_val_label = (y_val_int == label).astype(int)
    y_test_label = (y_test_int == label).astype(int)
    y_val_pred_prob = val_proba[:, label]
    y_test_pred_prob = test_proba[:, label]

    fpr_val, tpr_val, _ = roc_curve(y_val_label, y_val_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_label, y_test_pred_prob)

    print(f"Validation ROC AUC for class {group_name} (Class {label}): {auc(fpr_val, tpr_val)}")
    print(f"Test ROC AUC for class {group_name} (Class {label}): {auc(fpr_test, tpr_test)}")

# Decode predictions back to group names for the reports
y_val_pred = le.inverse_transform(best_clf.predict(X_val_scaled))
y_test_pred = le.inverse_transform(best_clf.predict(X_test_scaled))

print("\nValidation Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("\nTest Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# The inputs are every column listed in `features`
X = data[list(features.keys())]

# The target is 'Group'
y = data['Group']

# Hold out 30% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining rows evenly into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Fit the scaler on the train set only and reuse it for the other splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Encode labels
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_val_int = le.transform(y_val)
y_test_int = le.transform(y_test)

# Hyperparameters
params = {
    'n_estimators': [50, 100, 150, 200, 250, 300, 350],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold grid search on the train set
clf = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(clf, params, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_int)

best_clf = grid_search.best_estimator_

# Class probabilities do not depend on the class being scored, so compute
# them once per split instead of once per class inside the loop.
val_proba = best_clf.predict_proba(X_val_scaled)
test_proba = best_clf.predict_proba(X_test_scaled)

# One-vs-rest ROC AUC for every class on the validation and test sets
for label, group_name in enumerate(le.classes_):
    y_val_label = (y_val_int == label).astype(int)
    y_test_label = (y_test_int == label).astype(int)
    y_val_pred_prob = val_proba[:, label]
    y_test_pred_prob = test_proba[:, label]

    fpr_val, tpr_val, _ = roc_curve(y_val_label, y_val_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_label, y_test_pred_prob)

    print(f"Validation ROC AUC for class {group_name} (Class {label}): {auc(fpr_val, tpr_val)}")
    print(f"Test ROC AUC for class {group_name} (Class {label}): {auc(fpr_test, tpr_test)}")

# Decode predictions back to group names for the reports
y_val_pred = le.inverse_transform(best_clf.predict(X_val_scaled))
y_test_pred = le.inverse_transform(best_clf.predict(X_test_scaled))

print("\nValidation Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("\nTest Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))

import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# The inputs are every column listed in `features`
X = data[list(features.keys())]

# The target is 'Group'
y = data['Group']

# Hold out 30% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining rows evenly into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Fit the scaler on the train set only and reuse it for the other splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Encode labels
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_val_int = le.transform(y_val)
y_test_int = le.transform(y_test)

# Hyperparameters
params = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 10],
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# 5-fold grid search on the train set
clf = xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')
grid_search = GridSearchCV(clf, params, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train_int)

best_clf = grid_search.best_estimator_

# Class probabilities do not depend on the class being scored, so compute
# them once per split instead of once per class inside the loop.
val_proba = best_clf.predict_proba(X_val_scaled)
test_proba = best_clf.predict_proba(X_test_scaled)

# One-vs-rest ROC AUC for every class on the validation and test sets
for label, group_name in enumerate(le.classes_):
    y_val_label = (y_val_int == label).astype(int)
    y_test_label = (y_test_int == label).astype(int)
    y_val_pred_prob = val_proba[:, label]
    y_test_pred_prob = test_proba[:, label]

    fpr_val, tpr_val, _ = roc_curve(y_val_label, y_val_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_label, y_test_pred_prob)

    print(f"Validation ROC AUC for class {group_name} (Class {label}): {auc(fpr_val, tpr_val)}")
    print(f"Test ROC AUC for class {group_name} (Class {label}): {auc(fpr_test, tpr_test)}")

# Decode predictions back to group names for the reports
y_val_pred = le.inverse_transform(best_clf.predict(X_val_scaled))
y_test_pred = le.inverse_transform(best_clf.predict(X_test_scaled))

print("\nValidation Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("\nTest Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))

from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

# The inputs are every column listed in `features`
X = data[list(features.keys())]

# The target is 'Group'
y = data['Group']

# Hold out 30% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining rows evenly into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Fit the scaler on the train set only and reuse it for the other splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Encode labels
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_val_int = le.transform(y_val)
y_test_int = le.transform(y_test)

# n_jobs is not passed: the liblinear solver ignores it and scikit-learn only
# emits a warning when it is set, so the fitted model is unchanged.
clf = LogisticRegression(random_state=42, multi_class='auto', solver='liblinear')
clf.fit(X_train_scaled, y_train_int)

# Class probabilities do not depend on the class being scored, so compute
# them once per split instead of once per class inside the loop.
val_proba = clf.predict_proba(X_val_scaled)
test_proba = clf.predict_proba(X_test_scaled)

# One-vs-rest ROC AUC for every class on the validation and test sets
for label, group_name in enumerate(le.classes_):
    y_val_label = (y_val_int == label).astype(int)
    y_test_label = (y_test_int == label).astype(int)
    y_val_pred_prob = val_proba[:, label]
    y_test_pred_prob = test_proba[:, label]

    fpr_val, tpr_val, _ = roc_curve(y_val_label, y_val_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_label, y_test_pred_prob)

    print(f"Validation ROC AUC for class {group_name} (Class {label}): {auc(fpr_val, tpr_val)}")
    print(f"Test ROC AUC for class {group_name} (Class {label}): {auc(fpr_test, tpr_test)}")

# Decode predictions back to group names for the reports
y_val_pred = le.inverse_transform(clf.predict(X_val_scaled))
y_test_pred = le.inverse_transform(clf.predict(X_test_scaled))

print("\nValidation Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("\nTest Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

# The inputs are every column listed in `features`
X = data[list(features.keys())]

# The target is 'Group'
y = data['Group']

# Hold out 30% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining rows evenly into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Fit the scaler on the train set only and reuse it for the other splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Encode labels
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_val_int = le.transform(y_val)
y_test_int = le.transform(y_test)

# probability=True is required for predict_proba below
clf = SVC(probability=True, kernel='linear', random_state=42) # You can change the kernel as needed
clf.fit(X_train_scaled, y_train_int)

# Class probabilities do not depend on the class being scored, so compute
# them once per split instead of once per class inside the loop.
val_proba = clf.predict_proba(X_val_scaled)
test_proba = clf.predict_proba(X_test_scaled)

# One-vs-rest ROC AUC for every class on the validation and test sets
for label, group_name in enumerate(le.classes_):
    y_val_label = (y_val_int == label).astype(int)
    y_test_label = (y_test_int == label).astype(int)
    y_val_pred_prob = val_proba[:, label]
    y_test_pred_prob = test_proba[:, label]

    fpr_val, tpr_val, _ = roc_curve(y_val_label, y_val_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_label, y_test_pred_prob)

    print(f"Validation ROC AUC for class {group_name} (Class {label}): {auc(fpr_val, tpr_val)}")
    print(f"Test ROC AUC for class {group_name} (Class {label}): {auc(fpr_test, tpr_test)}")

# Decode predictions back to group names for the reports
y_val_pred = le.inverse_transform(clf.predict(X_val_scaled))
y_test_pred = le.inverse_transform(clf.predict(X_test_scaled))

print("\nValidation Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("\nTest Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))

from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

# The inputs are every column listed in `features`
X = data[list(features.keys())]

# The target is 'Group'
y = data['Group']

# Hold out 30% of the rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Split the remaining rows evenly into a train set and a validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Fit the scaler on the train set only and reuse it for the other splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Encode labels
le = LabelEncoder()
y_train_int = le.fit_transform(y_train)
y_val_int = le.transform(y_val)
y_test_int = le.transform(y_test)

clf = GaussianNB()
clf.fit(X_train_scaled, y_train_int)

# Class probabilities do not depend on the class being scored, so compute
# them once per split instead of once per class inside the loop.
val_proba = clf.predict_proba(X_val_scaled)
test_proba = clf.predict_proba(X_test_scaled)

# One-vs-rest ROC AUC for every class on the validation and test sets
for label, group_name in enumerate(le.classes_):
    y_val_label = (y_val_int == label).astype(int)
    y_test_label = (y_test_int == label).astype(int)
    y_val_pred_prob = val_proba[:, label]
    y_test_pred_prob = test_proba[:, label]

    fpr_val, tpr_val, _ = roc_curve(y_val_label, y_val_pred_prob)
    fpr_test, tpr_test, _ = roc_curve(y_test_label, y_test_pred_prob)

    print(f"Validation ROC AUC for class {group_name} (Class {label}): {auc(fpr_val, tpr_val)}")
    print(f"Test ROC AUC for class {group_name} (Class {label}): {auc(fpr_test, tpr_test)}")

# Decode predictions back to group names for the reports
y_val_pred = le.inverse_transform(clf.predict(X_val_scaled))
y_test_pred = le.inverse_transform(clf.predict(X_test_scaled))

print("\nValidation Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("\nTest Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))

import time
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import label_binarize
import pytorch_tabular
from pytorch_tabular.models.tab_transformer import TabTransformerConfig
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig

# Uses the `data` table and the `features` mapping prepared earlier.

# Model inputs and target
X = data[list(features.keys())]
y = data['Group']

# 70% train; the remaining 30% is halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Integer-encode the group labels (encoder fitted on the train labels)
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)
y_test_encoded = le.transform(y_test)

# Feature frames with the encoded target appended as a 'Group' column
train_df = X_train.assign(Group=y_train_encoded)
val_df = X_val.assign(Group=y_val_encoded)
test_df = X_test.assign(Group=y_test_encoded)

# Data description: every feature is treated as continuous
data_config = DataConfig(
    target=['Group'],
    continuous_cols=list(features.keys()),
    categorical_cols=[],
)

# Training loop settings
trainer_config = TrainerConfig(
    auto_lr_find=True,            # let the trainer search for a learning rate
    batch_size=1024,
    max_epochs=1000,
    gpus=1,
    gradient_clip_val=1,          # gradient clipping
    early_stopping_patience=200,  # early-stopping patience, in epochs
)
optimizer_config = OptimizerConfig()

# Empty dict: default settings for the LinearHead
head_config = {}

# TabTransformer architecture and dropout settings
model_config = TabTransformerConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    num_heads=16,
    num_attn_blocks=16,
    transformer_head_dim=256,
    share_embedding=True,
    share_embedding_strategy='fraction',
    shared_embedding_fraction=0.5,
    attn_dropout=0.2,
    add_norm_dropout=0.2,
    ff_dropout=0.2,
    embedding_dropout=0.2,
    batch_norm_continuous_input=True,
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

# Fit on the train frame, monitoring the validation frame
tabular_model.fit(train=train_df, validation=val_df)

# Short pause between training and evaluation
time.sleep(3)

# Library-computed metrics on the test frame
test_metrics = tabular_model.evaluate(test_df)
print("Test Metrics for TabTransformer Model on Test Set:", test_metrics)

# Library-computed metrics on the validation frame
val_metrics = tabular_model.evaluate(val_df)
print("Test Metrics for TabTransformer Model on Validation Set:", val_metrics)

# Predictions for both held-out frames
test_predictions = tabular_model.predict(test_df)
val_predictions = tabular_model.predict(val_df)

# Predicted class labels as plain arrays
test_predicted_values = test_predictions['prediction'].values
val_predicted_values = val_predictions['prediction'].values

# Calculate metrics for both test and validation
def calculate_metrics(y_true, predicted_values):
    """Compute summary classification metrics from hard label predictions.

    Args:
        y_true: Ground-truth class labels.
        predicted_values: Predicted class labels (not probabilities), in the
            same order as ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, fscore, roc_auc)``. Precision,
        recall and F-score use ``average='weighted'``.

    Note:
        The ROC-AUC is computed from predicted labels rather than from scores
        or probabilities, so it reflects a single operating point only.
    """
    accuracy = accuracy_score(y_true, predicted_values)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, predicted_values, average='weighted')

    classes = np.unique(y_true)
    if len(classes) > 2:  # Multi-class case
        # Binarize truth and predictions against the SAME class list. Using the
        # classes found in the predictions would misalign (or drop) columns
        # whenever the model never predicts one of the true classes.
        y_true_binarized = label_binarize(y_true, classes=classes)
        predicted_values_binarized = label_binarize(predicted_values, classes=classes)
        roc_auc = roc_auc_score(y_true_binarized, predicted_values_binarized, average="weighted", multi_class="ovr")
    else:  # Binary case
        roc_auc = roc_auc_score(y_true, predicted_values)

    return accuracy, precision, recall, fscore, roc_auc

# Summary metrics on the test split, emitted as one header line plus five value lines.
(
    test_accuracy,
    test_precision,
    test_recall,
    test_fscore,
    test_roc_auc,
) = calculate_metrics(y_test_encoded, test_predicted_values)
print(
    "\nTest Metrics:",
    f"Accuracy: {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall: {test_recall:.4f}",
    f"F1-score: {test_fscore:.4f}",
    f"ROC-AUC: {test_roc_auc:.4f}",
    sep="\n",
)

# Same summary for the validation split.
(
    val_accuracy,
    val_precision,
    val_recall,
    val_fscore,
    val_roc_auc,
) = calculate_metrics(y_val_encoded, val_predicted_values)
print(
    "\nValidation Metrics:",
    f"Accuracy: {val_accuracy:.4f}",
    f"Precision: {val_precision:.4f}",
    f"Recall: {val_recall:.4f}",
    f"F1-score: {val_fscore:.4f}",
    f"ROC-AUC: {val_roc_auc:.4f}",
    sep="\n",
)

# Per-class classification reports: test split first, then validation split.
for report_title, true_labels, predicted_labels in (
    ("\nClassification Report for Test Set:", y_test_encoded, test_predicted_values),
    ("\nClassification Report for Validation Set:", y_val_encoded, val_predicted_values),
):
    print(report_title)
    print(classification_report(true_labels, predicted_labels))

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder, RobustScaler
import pandas as pd
import numpy as np

# Collapse the group codes into two classes: 'A' and 'F' -> 'Group1', 'C' -> 'Group2'
data['Group_color'] = data['Group'].map({'A': 'Group1', 'F': 'Group1', 'C': 'Group2'})

# Extracting features: one column per key of `features`
X = data[list(features.keys())]

# Target variable
y = data['Group_color']

# Splitting data: 30% is held out as the test set, then the remaining 70% is
# split in half into training and validation sets.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Scaling the data (the scaler is fitted on the training split only)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Grid search for KNeighborsClassifier
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski', 'canberra']
}
grid_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=param_grid_knn, scoring='roc_auc', n_jobs=-1, cv=5, verbose=2)
grid_knn.fit(X_train_scaled, y_train)

# Using the best model to predict (column 1 of predict_proba is used as the score)
y_val_pred = grid_knn.best_estimator_.predict(X_val_scaled)
y_val_pred_proba = grid_knn.best_estimator_.predict_proba(X_val_scaled)[:, 1]

# Encoding labels. The encoder is fitted on the TRAINING labels, not on the
# validation labels, so the integer codes do not depend on which classes happen
# to land in an evaluation split. Both the encoder and the classifier order the
# labels seen during fit, so code 1 matches predict_proba column 1.
le = LabelEncoder()
le.fit(y_train)
y_val_encoded = le.transform(y_val)

# Validation set metrics
print("Validation Metrics for Best KNeighborsClassifier Model:")
print("Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("ROC AUC Score:", roc_auc_score(y_val_encoded, y_val_pred_proba))

# Test set metrics
y_test_pred = grid_knn.best_estimator_.predict(X_test_scaled)
y_test_pred_proba = grid_knn.best_estimator_.predict_proba(X_test_scaled)[:, 1]
y_test_encoded = le.transform(y_test)

print("\nTest Metrics for Best KNeighborsClassifier Model:")
print("Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test_encoded, y_test_pred_proba))

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder, RobustScaler
import pandas as pd
import numpy as np

# Collapse the group codes into two classes: 'A' and 'F' -> 'Group1', 'C' -> 'Group2'
data['Group_color'] = data['Group'].map({'A': 'Group1', 'F': 'Group1', 'C': 'Group2'})

# Extracting features: one column per key of `features`
X = data[list(features.keys())]

# Target variable
y = data['Group_color']

# Splitting data: 30% is held out as the test set, then the remaining 70% is
# split in half into training and validation sets.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Scaling the data (the scaler is fitted on the training split only)
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Grid search for RandomForestClassifier
param_grid_rf = {
    'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=42), param_grid=param_grid_rf, scoring='roc_auc', n_jobs=-1, cv=5, verbose=2)
grid_rf.fit(X_train_scaled, y_train)

# Using the best model to predict (column 1 of predict_proba is used as the score)
y_val_pred = grid_rf.best_estimator_.predict(X_val_scaled)
y_val_pred_proba = grid_rf.best_estimator_.predict_proba(X_val_scaled)[:, 1]

# Encoding labels. The encoder is fitted on the TRAINING labels, not on the
# validation labels, so the integer codes do not depend on which classes happen
# to land in an evaluation split. Both the encoder and the classifier order the
# labels seen during fit, so code 1 matches predict_proba column 1.
le = LabelEncoder()
le.fit(y_train)
y_val_encoded = le.transform(y_val)

# Validation set metrics
print("Validation Metrics for Best RandomForestClassifier Model:")
print("Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("ROC AUC Score:", roc_auc_score(y_val_encoded, y_val_pred_proba))

# Test set metrics
y_test_pred = grid_rf.best_estimator_.predict(X_test_scaled)
y_test_pred_proba = grid_rf.best_estimator_.predict_proba(X_test_scaled)[:, 1]
y_test_encoded = le.transform(y_test)

print("\nTest Metrics for Best RandomForestClassifier Model:")
print("Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test_encoded, y_test_pred_proba))

from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder, RobustScaler
import pandas as pd
import numpy as np
from xgboost import XGBClassifier

# Two-class target: groups 'A' and 'F' are merged into 'Group1'; 'C' becomes 'Group2'
data['Group_color'] = data['Group'].map({'A': 'Group1', 'F': 'Group1', 'C': 'Group2'})

# Feature matrix (columns named by the keys of `features`) and string target
X = data[[*features.keys()]]
y = data['Group_color']

# Integer-encode the target before splitting
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Hold out 30% for testing, then halve the remainder into train / validation
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.5, random_state=42
)

# Fit the scaler on the training split, then apply it to the other two splits
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, X_test_scaled = (scaler.transform(split) for split in (X_val, X_test))

# Hyper-parameter grid explored for XGBClassifier
param_grid_xgb = dict(
    learning_rate=[0.0001, 0.001, 0.01, 0.05, 0.1],
    n_estimators=[50, 100, 150, 200, 250, 300, 350, 500, 1000],
    max_depth=[3, 5, 7, 10, 15],
    gamma=[0, 0.1, 0.2, 0.3],
    subsample=[0.7, 0.8, 0.9, 1],
    colsample_bytree=[0.7, 0.8, 0.9, 1],
)

# 5-fold grid search scored by ROC AUC
grid_xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42),
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_xgb.fit(X_train_scaled, y_train)

# Validation split: hard labels plus column 1 of predict_proba as the score
y_val_pred = grid_xgb.best_estimator_.predict(X_val_scaled)
y_val_pred_proba = grid_xgb.best_estimator_.predict_proba(X_val_scaled)[:, 1]

print("Validation Metrics for Best XGBClassifier Model:")
print("Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("ROC AUC Score:", roc_auc_score(y_val, y_val_pred_proba))

# Test split: same two outputs from the same best estimator
y_test_pred = grid_xgb.best_estimator_.predict(X_test_scaled)
y_test_pred_proba = grid_xgb.best_estimator_.predict_proba(X_test_scaled)[:, 1]

print("\nTest Metrics for Best XGBClassifier Model:")
print("Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_test_pred_proba))

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

# Collapse the group codes into two classes: 'A' and 'F' -> 'Group1', 'C' -> 'Group2'
data['Group_color'] = data['Group'].map({'A': 'Group1', 'F': 'Group1', 'C': 'Group2'})

# The features are every column named by a key of `features`
X = data[list(features.keys())] # Make sure the column names are correct

# The target is 'Group_color'
y = data['Group_color']

# Split the data into train+validation set (70%) and test set (30%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the train+validation data in half into train set and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Scale all splits with statistics fitted on the training split only
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(random_state=42) # You can add hyperparameters as needed

clf.fit(X_train_scaled, y_train)

# Predict the labels of the validation set
y_val_pred = clf.predict(X_val_scaled)

# Predict the probabilities of the validation set (column 1 is used as the score)
y_val_pred_proba = clf.predict_proba(X_val_scaled)[:, 1]

# Encode labels to 0 and 1. The encoder is fitted on the TRAINING labels, not
# on the validation labels, so the integer codes do not depend on which classes
# happen to land in an evaluation split. Both the encoder and the classifier
# order the labels seen during fit, so code 1 matches predict_proba column 1.
le = LabelEncoder()
le.fit(y_train)
y_val_encoded = le.transform(y_val)

# Print the classification report and ROC AUC of the classifier on the validation set
print("Validation Metrics:")
print("Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("ROC AUC Score:", roc_auc_score(y_val_encoded, y_val_pred_proba))

# Predict the labels of the test set
y_test_pred = clf.predict(X_test_scaled)

# Predict the probabilities of the test set
y_test_pred_proba = clf.predict_proba(X_test_scaled)[:, 1]

# Encode labels to 0 and 1 with the same encoder
y_test_encoded = le.transform(y_test)

# Print the classification report and ROC AUC of the classifier on the test set
print("\nTest Metrics:")
print("Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test_encoded, y_test_pred_proba))

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler

# This cell's own import list brings in classification_report but not
# roc_auc_score, which is used below; import it here so the cell does not
# depend on an earlier cell having been run.
from sklearn.metrics import roc_auc_score

# Collapse the group codes into two classes: 'A' and 'F' -> 'Group1', 'C' -> 'Group2'
data['Group_color'] = data['Group'].map({'A': 'Group1', 'F': 'Group1', 'C': 'Group2'})

# The features are every column named by a key of `features`
X = data[list(features.keys())] # Make sure the column names are correct

# The target is 'Group_color'
y = data['Group_color']

# Split the data into train+validation set (70%) and test set (30%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the train+validation data in half into train set and validation set
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.5, random_state=42)

# Scale all splits with statistics fitted on the training split only
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# probability=True is required for the predict_proba calls below
clf = SVC(random_state=42, probability=True) # You can add hyperparameters as needed

clf.fit(X_train_scaled, y_train)

# Predict the labels of the validation set
y_val_pred = clf.predict(X_val_scaled)

# Predict the probabilities of the validation set (column 1 is used as the score)
y_val_pred_proba = clf.predict_proba(X_val_scaled)[:, 1]

# Encode labels to 0 and 1. The encoder is fitted on the TRAINING labels, not
# on the validation labels, so the integer codes do not depend on which classes
# happen to land in an evaluation split. Both the encoder and the classifier
# order the labels seen during fit, so code 1 matches predict_proba column 1.
le = LabelEncoder()
le.fit(y_train)
y_val_encoded = le.transform(y_val)

# Print the classification report and ROC AUC of the classifier on the validation set
print("Validation Metrics:")
print("Precision, Recall, F1-score:\n", classification_report(y_val, y_val_pred))
print("ROC AUC Score:", roc_auc_score(y_val_encoded, y_val_pred_proba))

# Predict the labels of the test set
y_test_pred = clf.predict(X_test_scaled)

# Predict the probabilities of the test set
y_test_pred_proba = clf.predict_proba(X_test_scaled)[:, 1]

# Encode labels to 0 and 1 with the same encoder
y_test_encoded = le.transform(y_test)

# Print the classification report and ROC AUC of the classifier on the test set
print("\nTest Metrics:")
print("Precision, Recall, F1-score:\n", classification_report(y_test, y_test_pred))
print("ROC AUC Score:", roc_auc_score(y_test_encoded, y_test_pred_proba))

import time
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import label_binarize
import pytorch_tabular
from pytorch_tabular.models.tab_transformer import TabTransformerConfig
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig

# Precondition: `data` is loaded and already carries the 'Group_color' column.

# Feature matrix (columns named by the keys of `features`) and target
X = data[[*features.keys()]]
y = data['Group_color']

# 70% train; the remaining 30% is halved into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Integer-encode the labels; the encoder is fitted on the training labels only
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded, y_test_encoded = (le.transform(labels) for labels in (y_val, y_test))

# One DataFrame per split: the feature columns plus the encoded 'Group_color'
# column, aligned on each split's own index
train_df, val_df, test_df = (
    pd.concat([split_X, pd.Series(split_y, name='Group_color', index=split_X.index)], axis=1)
    for split_X, split_y in (
        (X_train, y_train_encoded),
        (X_val, y_val_encoded),
        (X_test, y_test_encoded),
    )
)

# Define the configurations for TabTransformer
data_config = DataConfig(
    target=['Group_color'],
    continuous_cols=list(features.keys()),  # Assuming all features are continuous
    categorical_cols=[]
)
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Automatically find a suitable learning rate
    batch_size=1024,
    max_epochs=100_000,
    gpus=1,
    gradient_clip_val=1,  # Gradient clipping
    early_stopping_patience=100  # Early stopping patience: 100 epochs without improvement
)
optimizer_config = OptimizerConfig()

# Define the head configuration for TabTransformer
head_config = {}  # Use default configurations for the LinearHead

# TabTransformer configuration (8 heads, 8 attention blocks, 0.2 dropout throughout)
model_config = TabTransformerConfig(
    task="classification",
    head="LinearHead",
    head_config=head_config,
    num_heads=8,
    num_attn_blocks=8,
    transformer_head_dim=256,
    share_embedding=True,
    share_embedding_strategy='fraction',
    shared_embedding_fraction=0.5,
    attn_dropout=0.2,
    add_norm_dropout=0.2,
    ff_dropout=0.2,
    embedding_dropout=0.2,
    batch_norm_continuous_input=True
)

tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config
)

# Train the model using train_df and val_df
tabular_model.fit(train=train_df, validation=val_df)

# NOTE(review): the reason for this fixed pause is not evident from the code;
# presumably it lets the training output settle before the prints below.
# Confirm it is still needed or remove it.
time.sleep(3)

# Evaluate on test set using test_df
test_metrics = tabular_model.evaluate(test_df)
print("Test Metrics for TabTransformer Model on Test Set:", test_metrics)

# Evaluate on validation set using val_df
val_metrics = tabular_model.evaluate(val_df)
# These are validation-split metrics, so they are labelled as such
# (the label previously read "Test Metrics").
print("Validation Metrics for TabTransformer Model on Validation Set:", val_metrics)

# Getting predictions on the test dataset
test_predictions = tabular_model.predict(test_df)

# Getting predictions on the validation dataset
val_predictions = tabular_model.predict(val_df)

# Extracting the predicted values for test
test_predicted_values = test_predictions['prediction'].values

# Extracting the predicted values for validation
val_predicted_values = val_predictions['prediction'].values

# Calculate metrics for both test and validation
def calculate_metrics(y_true, predicted_values):
    """Compute summary classification metrics from hard label predictions.

    Args:
        y_true: Ground-truth class labels.
        predicted_values: Predicted class labels (not probabilities), in the
            same order as ``y_true``.

    Returns:
        Tuple ``(accuracy, precision, recall, fscore, roc_auc)``. Precision,
        recall and F-score use ``average='weighted'``.

    Note:
        The ROC-AUC is computed from predicted labels rather than from scores
        or probabilities, so it reflects a single operating point only.
    """
    accuracy = accuracy_score(y_true, predicted_values)
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, predicted_values, average='weighted')

    classes = np.unique(y_true)
    if len(classes) > 2:  # Multi-class case
        # Binarize truth and predictions against the SAME class list. Using the
        # classes found in the predictions would misalign (or drop) columns
        # whenever the model never predicts one of the true classes.
        y_true_binarized = label_binarize(y_true, classes=classes)
        predicted_values_binarized = label_binarize(predicted_values, classes=classes)
        roc_auc = roc_auc_score(y_true_binarized, predicted_values_binarized, average="weighted", multi_class="ovr")
    else:  # Binary case
        roc_auc = roc_auc_score(y_true, predicted_values)

    return accuracy, precision, recall, fscore, roc_auc

# Summary metrics on the test split, emitted as one header line plus five value lines.
(
    test_accuracy,
    test_precision,
    test_recall,
    test_fscore,
    test_roc_auc,
) = calculate_metrics(y_test_encoded, test_predicted_values)
print(
    "\nTest Metrics:",
    f"Accuracy: {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall: {test_recall:.4f}",
    f"F1-score: {test_fscore:.4f}",
    f"ROC-AUC: {test_roc_auc:.4f}",
    sep="\n",
)

# Same summary for the validation split.
(
    val_accuracy,
    val_precision,
    val_recall,
    val_fscore,
    val_roc_auc,
) = calculate_metrics(y_val_encoded, val_predicted_values)
print(
    "\nValidation Metrics:",
    f"Accuracy: {val_accuracy:.4f}",
    f"Precision: {val_precision:.4f}",
    f"Recall: {val_recall:.4f}",
    f"F1-score: {val_fscore:.4f}",
    f"ROC-AUC: {val_roc_auc:.4f}",
    sep="\n",
)

# Per-class classification reports: test split first, then validation split.
for report_title, true_labels, predicted_labels in (
    ("\nClassification Report for Test Set:", y_test_encoded, test_predicted_values),
    ("\nClassification Report for Validation Set:", y_val_encoded, val_predicted_values),
):
    print(report_title)
    print(classification_report(true_labels, predicted_labels))

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\xmoot\\Desktop\\VSCode\\gsp-eeg-alz\\features_tv.csv'