In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import ConfusionMatrixDisplay
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics, preprocessing
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.cross_decomposition import PLSRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2, mutual_info_classif
from sklearn.decomposition import PCA, FactorAnalysis
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import regularizers
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

In [None]:
# Load the survey data and show a quick preview.
df = pd.read_csv('survey_PE.csv')

print(df.head())
print(df.shape)

#-------------------------------------------------------CORRELATION MATRIX------------------------------------------------------

# Correlation is only defined for numeric data. Selecting numeric/bool columns
# explicitly keeps this cell working on pandas >= 2.0, where DataFrame.corr()
# no longer drops non-numeric columns silently and raises instead.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, 
            annot=False, 
            fmt=".2f", 
            cmap='coolwarm', 
            center=0,
            linewidths=0.5)
plt.title("Correlation Matrix of Features")
plt.show()

# Mask of the upper triangle INCLUDING the diagonal; negating it keeps the
# strict lower triangle so every unordered pair appears exactly once.
upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool))

high_corr_pairs = (corr_matrix.where(~upper_triangle)
                  .stack()
                  .reset_index()
                  .rename(columns={'level_0':'Feature1', 'level_1':'Feature2', 0:'Correlation'}))

# Drop perfectly (anti-)correlated pairs, then order by absolute correlation.
high_corr_pairs = (high_corr_pairs[abs(high_corr_pairs['Correlation']) < 1]
                  .sort_values(by='Correlation', key=abs, ascending=False))

top_n = 20
print(f"Top {top_n} correlating feature pairs (|r| < 1):")
print(high_corr_pairs.head(top_n).to_string(index=False))

#------------------------------------------------CONVERT TO A CLASSIFICATION PROBLEM----------------------------------------------

# Cut-offs applied to Price_elasticity when building the binary label.
threshold = 0.0331
treshold2 = 0.0328


def _label_price_response(elasticity):
    """Return 1 below -threshold, 0 above treshold2, and None for anything else."""
    if elasticity < -threshold:
        return 1
    if elasticity > treshold2:
        return 0
    return None


df['is_price_responsive'] = df['Price_elasticity'].apply(_label_price_response)

# Rows that received no label (None -> missing) are removed.
df = df.dropna(subset=['is_price_responsive'])

counts = df['is_price_responsive'].value_counts()
print("Counts of each class:")
print(counts)

# Target first, then everything that is neither the target, its source column nor the ID.
y = df['is_price_responsive']
X = df.drop(columns=["is_price_responsive", "Price_elasticity", 'ID'])

print("Features (X):", list(X.columns))
print("Target (y):", y.name)

# 90/10 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=40)

In [None]:
#----------------------------------------------------------FEATURE SCALING -----------------------------------------------------

# The scaler is fitted on the training split only and then re-used for the test split.
scaler = StandardScaler()
train_scaled_values = scaler.fit_transform(X_train)
test_scaled_values = scaler.transform(X_test)

# Wrap the arrays back into DataFrames so column names and row indices are kept.
X_train_scaled = pd.DataFrame(train_scaled_values, index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(test_scaled_values, index=X_test.index, columns=X_test.columns)

#----------------------------------------------------------DOMAIN FEATURES-----------------------------------------------------

# Hand-picked subset of survey questions.
selected_features = [
    'Q4', 'Q7', 'Q13_5', 'Q16', 'Q17',
    'Q20', 'Q21', 'Q22', 'Q27_5', 'Q29',
]

X_train_domain = X_train_scaled[selected_features]
X_test_domain = X_test_scaled[selected_features]


#---------------------------------------------------------FORWARD SELECTION-----------------------------------------------------

# Base estimator scored by the sequential selector.
model = LogisticRegression(max_iter=10000, solver='saga', C=0.1, class_weight='balanced')

# Plain forward selection (no floating step) up to 8 features, 5-fold CV accuracy.
sfs = SFS(model, k_features=8, forward=True, floating=False, scoring="accuracy", cv=5)
sfs.fit(X_train_scaled, y_train)

accuracy = sfs.k_score_
selected_features = list(sfs.k_feature_names_)

X_train_FS = X_train_scaled[selected_features]
X_test_FS = X_test_scaled[selected_features]

print("Accuracy:", accuracy)
print("Selected Features:", sfs.k_feature_names_)

# One row per subset size; avg_score is cast to float so it can be plotted.
sfs_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
sfs_df["avg_score"] = sfs_df["avg_score"].astype(float)

fig, ax = plt.subplots()
sfs_df.plot(kind="line", y="avg_score", ax=ax)
ax.set_xlabel("Number of Features")
ax.set_ylabel("Accuracy")
ax.set_title("Forward Selection Performance")
plt.show()

#-----------------------------------------------------------FISHER SCORE-------------------------------------------------------

def manual_fisher_score(X, y):
    """Compute a Fisher score for every column of X with respect to the labels y.

    For each feature the score is the size-weighted squared distance of the
    class means from the overall mean, divided by the size-weighted sum of the
    per-class variances (plus 1e-9 to avoid division by zero).

    Parameters
    ----------
    X : array-like, 2-D; columns are features.
    y : array-like of class labels, one per row of X.

    Returns
    -------
    numpy.ndarray with one score per feature, in column order.
    """
    data = np.array(X)
    labels = np.array(y)

    # Row masks and class sizes are the same for every feature: build them once.
    masks = [labels == cls for cls in np.unique(labels)]
    sizes = [np.sum(mask) for mask in masks]

    def _score(column):
        grand_mean = np.mean(column)
        between = sum(
            [size * (np.mean(column[mask]) - grand_mean)**2
             for mask, size in zip(masks, sizes)])
        within = sum(
            [np.var(column[mask]) * size for mask, size in zip(masks, sizes)])
        return between / (within + 1e-9)

    return np.array([_score(column) for column in data.T])

# Fisher score of every scaled training feature against the target.
ranks = manual_fisher_score(X_train_scaled, y_train)

feature_importance = pd.DataFrame(
    {'Feature': X_train_scaled.columns, 'Fisher_Score': ranks}
)
feature_importance = feature_importance.sort_values('Fisher_Score', ascending=False)

# Percentile rank of each score, rounded to two decimals and expressed on a 0-100 scale.
fisher_pct_rank = feature_importance['Fisher_Score'].rank(pct=True)
feature_importance['Percentile'] = fisher_pct_rank.round(2) * 100

print("Top 10 Features Fisher Scores:")
print(feature_importance.head(10))

#-----------------------------------------------------------CHI SQUARED-------------------------------------------------------

# Work on a copy of the unscaled training features (the scaled ones contain
# negative values, which sklearn's chi2 does not accept).
X_train_chi2 = X_train.copy()

# Q_Age is binned into 5 quantile-based ordinal categories before the test.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
X_train_chi2['Q_Age'] = discretizer.fit_transform(X_train_chi2[['Q_Age']])

X_train_chi2 = X_train_chi2.astype(int)
chi_scores, p_values = chi2(X_train_chi2, y_train)

chi2_results = pd.DataFrame({
    'Feature': X_train.columns,
    'Chi2_Score': chi_scores,
    'p_value': p_values
}).sort_values('Chi2_Score', ascending=False)

# Lenient p-value cut-off actually applied below. The old comment and printed
# header claimed 0.05 while the filter used 0.4; the message now reports the
# value that is really used.
chi2_p_threshold = 0.4
significant_features = chi2_results[chi2_results['p_value'] < chi2_p_threshold]
print(f"Significant Features by Chi-Squared Test (p < {chi2_p_threshold}):")
print(significant_features)

#---------------------------------------------------------INFORMATION GAIN-----------------------------------------------------

def compute_stable_mi(X, y, n_runs=15, n_neighbors=5, random_state=42):
    """Average mutual_info_classif over several seeds and keep informative features.

    The estimator is run ``n_runs`` times with seeds ``random_state``,
    ``random_state + 1``, ... and the per-feature scores are averaged.
    Features whose mean score is not above 0.001 are filtered out.

    Parameters
    ----------
    X : DataFrame of features (``X.columns`` is used to label the result).
    y : target labels.
    n_runs : number of repetitions.
    n_neighbors : forwarded to ``mutual_info_classif``.
    random_state : seed of the first run.

    Returns
    -------
    tuple of (Series of mean scores for the kept features,
              array of per-feature standard deviations for the kept features,
              boolean mask over all columns marking the kept features).
    """
    per_run = np.zeros((n_runs, X.shape[1]))

    for run, seed in enumerate(range(random_state, random_state + n_runs)):
        per_run[run] = mutual_info_classif(
            X, y, n_neighbors=n_neighbors, random_state=seed
        )

    mean_scores = per_run.mean(axis=0)
    valid_idx = mean_scores > 0.001

    kept_means = pd.Series(mean_scores[valid_idx], index=X.columns[valid_idx])
    kept_stds = per_run.std(axis=0)[valid_idx]
    return (kept_means, kept_stds, valid_idx)

# Seed-averaged mutual information on the unscaled training features.
stable_mi, mi_std, valid_idx = compute_stable_mi(X_train, y_train)

# Note: the 'IG_Variance' column holds the standard deviation returned above.
ig_results = pd.DataFrame({
    'Feature': X_train.columns[valid_idx],
    'IG_Score': stable_mi.values,
    'IG_Variance': mi_std
})
ig_results = ig_results.sort_values('IG_Score', ascending=False)

# Percentile rank of each score on a 0-100 scale.
ig_pct_rank = ig_results['IG_Score'].rank(pct=True)
ig_results['IG_Percentile'] = (ig_pct_rank.round(3) * 100)

print("\nTop 20 Features by Stable Mutual Information:")
print(ig_results.head(20).to_string(float_format="%.4f"))


#-----------------------------------------------------COMBINE RESULTS-----------------------------------------------------------

# Left-join the chi-squared and mutual-information tables onto the Fisher table.
# Features filtered out of the MI table get 0 for their MI columns. fillna is
# chained (not inplace) so that re-running the cell always rebuilds the frame.
combined_results = (
    feature_importance
    .merge(chi2_results[['Feature', 'Chi2_Score', 'p_value']], on='Feature', how='left')
    .merge(ig_results, on='Feature', how='left')
    .fillna({'IG_Score': 0, 'IG_Percentile': 0, 'IG_Variance': 0})
)

top_n = min(15, len(combined_results))
top_features = combined_results.nlargest(top_n, 'Fisher_Score').copy()

# Scale each method's score by its maximum within the plotted subset so the
# three methods share one axis.
for score_col, norm_col in [('Fisher_Score', 'Fisher_norm'),
                            ('Chi2_Score', 'Chi2_norm'),
                            ('IG_Score', 'IG_norm')]:
    top_features[norm_col] = top_features[score_col] / top_features[score_col].max()

plot_data = top_features.set_index('Feature')[['Fisher_norm', 'Chi2_norm', 'IG_norm']]

# Draw on an explicit Axes. Previously plt.figure() was called first and then
# DataFrame.plot(figsize=...) opened a second figure, leaving an empty one behind.
fig, ax = plt.subplots(figsize=(12, 8))
plot_data.plot(
    kind='barh',
    color=['#647D65', '#A5C3CF', '#D9D9D9'],
    width=0.8,
    ax=ax
)

ax.set_title('Feature Importance Comparison Across Methods')
ax.set_xlabel('Normalized Importance Score')
ax.set_ylabel('Feature')
ax.legend(['Fisher Score', 'Chi-Squared', 'Mutual Info'])
fig.tight_layout()
plt.show()

#--------------------------------------------------Combined top features of the three methods---------------------------------------------------

# A feature is kept only if it clears all three criteria: Fisher percentile
# above 60, chi-squared p-value below 0.4 and MI percentile above 60.
print("\nConsensus Features (Important in All Methods):")
consensus = combined_results[
    (combined_results['Percentile'] > 60) & 
    (combined_results['p_value'] < 0.4) &
    (combined_results['IG_Percentile'] > 60)
]
print(consensus[['Feature', 'Fisher_Score', 'Chi2_Score', 'IG_Score']].to_string(float_format="%.4f"))

selected_features = consensus['Feature'].tolist()

# NOTE(review): test_features is not defined anywhere in the visible notebook,
# so an unconditional call raises NameError on a fresh kernel. It is called only
# when it exists; restore or define the helper if this check is still wanted.
if callable(globals().get('test_features')):
    test_features(selected_features)
else:
    print("Skipping test_features(): helper is not defined in this notebook.")

print('Selected features:')
print(selected_features)

X_train_selected = X_train_scaled[selected_features].copy()
X_test_selected  = X_test_scaled[selected_features].copy()
selected_features = list(selected_features)


In [None]:
#-----------------------------------------------------------PCA------------------------------------------------------------------

# Unrestricted PCA first, used only to inspect the variance spectrum.
pca = PCA()
pca.fit(X_train_scaled)

explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

# Labels PC1..PCk for every fitted component.
all_pc_labels = [f'PC{k}' for k in range(1, len(explained_variance) + 1)]

pca_variance_df = pd.DataFrame({
    'Component': all_pc_labels,
    'Individual_Variance': explained_variance,
    'Cumulative_Variance': cumulative_variance
})

# Position of the first cumulative ratio >= 0.95, converted to a component count.
n_components = np.argmax(cumulative_variance >= 0.95) + 1

detailed_variance = pd.DataFrame({
    'Component': all_pc_labels,
    'Eigenvalue': pca.explained_variance_,
    'Variance_Ratio': pca.explained_variance_ratio_,
    'Cumulative_Variance': cumulative_variance,
    'Variance_Percent': pca.explained_variance_ratio_ * 100,
    'Cumulative_Percent': cumulative_variance * 100
})

print("\nDetailed PCA Component Analysis:")
print(detailed_variance.head(15).to_string(float_format="{:.4f}".format))

# Refit with the chosen number of components and project both splits.
pca_optimal = PCA(n_components=n_components)
train_projection = pca_optimal.fit_transform(X_train_scaled)
test_projection = pca_optimal.transform(X_test_scaled)

kept_pc_labels = [f"PC{k}" for k in range(1, n_components + 1)]

X_train_pca = pd.DataFrame(train_projection, columns=kept_pc_labels, index=X_train.index)
X_test_pca = pd.DataFrame(test_projection, columns=kept_pc_labels, index=X_test.index)

print(f"\nCreated PCA transformed datasets with {n_components} components")
print(f"X_train_pca shape: {X_train_pca.shape}")
print(f"X_test_pca shape: {X_test_pca.shape}")

# Rows are original features, columns are the retained components.
loadings = pd.DataFrame(
    pca_optimal.components_.T,
    columns=kept_pc_labels,
    index=X_train.columns
)

print("\nPCA Component Loadings (showing 5 features per component):")
for pc, pc_loadings in loadings.items():
    print(f"\nTop features for {pc}:")
    print(pc_loadings.abs().sort_values(ascending=False).head(5))

#-----------------------------------------------------------PLS------------------------------------------------------------------

# LabelEncoder is reached through sklearn.preprocessing, which the import cell
# already provides; the bare name LabelEncoder was never imported and raised
# NameError on a fresh kernel.
y_train_encoded = preprocessing.LabelEncoder().fit_transform(y_train)
n_components = 10

# fit_transform returns (x_scores, y_scores) when y is given; only the X scores are kept.
pls = PLSRegression(n_components=n_components)
X_train_pls = pls.fit_transform(X_train_scaled, y_train_encoded)[0]
X_test_pls = pls.transform(X_test_scaled)

X_train_pls = pd.DataFrame(X_train_pls, columns=[f'PLS{i+1}' for i in range(n_components)], index=X_train.index)
X_test_pls = pd.DataFrame(X_test_pls, columns=[f'PLS{i+1}' for i in range(n_components)], index=X_test.index)

# Rows are original features, columns are PLS components.
pls_loadings = pd.DataFrame(
    pls.x_weights_,
    columns=[f'PLS{i+1}' for i in range(n_components)],
    index=X_train.columns
)

print("PLS Component Loadings (X-weights):")
print(pls_loadings)

for component in pls_loadings.columns:
    print(f"\nTop features for {component}:")
    print(pls_loadings[component].abs().sort_values(ascending=False).head(5))

# Share of the total X-score variance carried by each component (sums to 1
# across the fitted components).
pls_explained_variance = pls.x_scores_.var(axis=0) / pls.x_scores_.var(axis=0).sum()
print("\nExplained Variance per PLS Component (X-space):")
print(pd.Series(pls_explained_variance, index=[f'PLS{i+1}' for i in range(n_components)]))

print("\nPLS Model Coefficients (for original features):")
print(pd.Series(pls.coef_.flatten(), index=X_train.columns))

pls_variance_df = pd.DataFrame({
    'Component': [f'PLS{i+1}' for i in range(len(pls_explained_variance))],
    'Variance_Ratio': pls_explained_variance,
    'Cumulative_PLS': np.cumsum(pls_explained_variance)
})

#-----------------------------------------------------------PLS and PCA COMPARISON------------------------------------------------------------------

print("\n=== PCA vs PLS Explained Variance Comparison ===")

# Number of components both methods have in common, and the PLS count used to slice PCA.
n_shared = min(len(explained_variance), len(pls_explained_variance))
n_pls = len(pls_explained_variance)
shared_numbers = range(1, n_shared + 1)
pls_cumulative = np.cumsum(pls_explained_variance)

comparison_df = pd.DataFrame({
    'Component_Num': shared_numbers,
    'PCA_Component': [f'PC{i}' for i in shared_numbers],
    'PCA_Variance': explained_variance[:n_pls],
    'PCA_Cumulative': cumulative_variance[:n_pls],
    'PLS_Component': [f'PLS{i}' for i in shared_numbers],
    'PLS_Variance': pls_explained_variance,
    'PLS_Cumulative': pls_cumulative
})

print(comparison_df.head(10).to_string(float_format="{:.4f}".format))

# Side-by-side panels for the first ten components of each method.
first_ten = range(1, 11)
fig, (ax_individual, ax_cumulative) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: per-component variance, bars offset so the two methods sit next to each other.
ax_individual.bar([i - 0.2 for i in first_ten], explained_variance[:10], width=0.4, label='PCA', alpha=0.7)
ax_individual.bar([i + 0.2 for i in first_ten], pls_explained_variance[:10], width=0.4, label='PLS', alpha=0.7)
ax_individual.set_xlabel('Component Number')
ax_individual.set_ylabel('Explained Variance Ratio')
ax_individual.set_title('PCA vs PLS: Individual Explained Variance')
ax_individual.legend()
ax_individual.set_xticks(first_ten)

# Right panel: cumulative variance with the 95% reference line.
ax_cumulative.plot(first_ten, cumulative_variance[:10], 'o-', label='PCA', markersize=4)
ax_cumulative.plot(first_ten, pls_cumulative[:10], 's-', label='PLS', markersize=4)
ax_cumulative.axhline(y=0.95, color='r', linestyle='--', label='95% Threshold')
ax_cumulative.set_xlabel('Number of Components')
ax_cumulative.set_ylabel('Cumulative Explained Variance')
ax_cumulative.set_title('PCA vs PLS: Cumulative Explained Variance')
ax_cumulative.legend()
ax_cumulative.set_xticks(first_ten)

fig.tight_layout()
plt.show()

In [None]:
#----------------------------------------------------------------MODELS FUNCTIONS------------------------------------------------

def run_logistic_regression(X_train, y_train, X_test, y_test, 
                          max_iter=100000, solver='saga', C=0.01, 
                          class_weight='balanced', dataset_name=''):
    """Fit a logistic regression, report test metrics and plot diagnostics.

    Prints accuracy, the classification report and the confusion matrix for
    the test split, draws the confusion-matrix heatmap and, when ``X_train``
    has column names, a bar chart of absolute coefficients.

    Returns
    -------
    tuple of (fitted model, test predictions, test accuracy).
    """
    clf = LogisticRegression(C=C, solver=solver, max_iter=max_iter,
                             class_weight=class_weight)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print('Logistic Regression Results:')
    print(f'Accuracy: {test_accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, 'Logistic Regression', dataset_name)

    # The coefficient plot needs both a fitted coef_ and named feature columns.
    can_plot_coefficients = hasattr(clf, 'coef_') and hasattr(X_train, 'columns')
    if can_plot_coefficients:
        plot_feature_importance(np.abs(clf.coef_[0]), X_train, "Logistic Regression")

    return clf, y_pred, test_accuracy

def run_svm(X_train, y_train, X_test, y_test, kernel='rbf', 
           class_weight='balanced', probability=True, dataset_name=''):
    """Fit an SVC, report test metrics and plot diagnostics.

    Prints accuracy, the classification report and the confusion matrix for
    the test split, draws the confusion-matrix heatmap and, when ``X_train``
    has column names, a permutation-importance chart computed on the test split.

    Returns
    -------
    tuple of (fitted model, test predictions, test accuracy).
    """
    clf = SVC(kernel=kernel, class_weight=class_weight, probability=probability)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print('SVM Results:')
    print(f'Accuracy: {test_accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, 'SVM', dataset_name)

    # SVC exposes no importances of its own here, so permutation importance is used.
    if hasattr(X_train, 'columns'):
        permutation_result = permutation_importance(
            clf, X_test, y_test, n_repeats=10, random_state=42
        )
        plot_feature_importance(permutation_result.importances_mean, X_train, "SVM")

    return clf, y_pred, test_accuracy

def run_random_forest(X_train, y_train, X_test, y_test, n_estimators=100, 
                     max_depth=12, max_features='sqrt', min_samples_leaf=3, 
                     min_samples_split=2, random_state=42, dataset_name=''):
    """Fit a random forest classifier, report test metrics and plot diagnostics.

    Prints accuracy, the classification report and the confusion matrix for
    the test split, then draws the confusion-matrix heatmap and the model's
    feature-importance chart.

    Returns
    -------
    tuple of (fitted model, test predictions, test accuracy).
    """
    forest_params = dict(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=random_state,
    )
    clf = RandomForestClassifier(**forest_params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print('Random Forest Results:')
    print(f'Accuracy: {test_accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, "Random Forest", dataset_name)
    plot_feature_importance(clf.feature_importances_, X_train, "Random Forest")

    return clf, y_pred, test_accuracy

def run_xgboost(X_train, y_train, X_test, y_test, learning_rate=0.1, 
               max_depth=3, min_child_weight=1, subsample=0.8, 
               colsample_bytree=0.8, n_estimators=100, gamma=0, 
               random_state=42, dataset_name=''):
    """Fit an XGBoost classifier, report test metrics and plot diagnostics.

    Prints accuracy, the classification report and the confusion matrix for
    the test split, then draws the confusion-matrix heatmap and the model's
    feature-importance chart.

    Returns
    -------
    tuple of (fitted model, test predictions, test accuracy).
    """
    booster_params = dict(
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        n_estimators=n_estimators,
        gamma=gamma,
        random_state=random_state,
        eval_metric='logloss',
        use_label_encoder=False,
    )
    clf = XGBClassifier(**booster_params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print('XGBoost Results:')
    print(f'Accuracy: {test_accuracy:.4f}')
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, "XGBoost", dataset_name)
    plot_feature_importance(clf.feature_importances_, X_train, "XGBoost")

    return clf, y_pred, test_accuracy

def run_lightgbm(X_train, y_train, X_test, y_test, n_estimators=100, 
                learning_rate=0.1, max_depth=5, num_leaves=31, 
                class_weight='balanced', random_state=42, dataset_name=''):
    """Fit a LightGBM classifier, report test metrics and plot diagnostics.

    Prints accuracy, the classification report and the confusion matrix for
    the test split, then draws the confusion-matrix heatmap and the model's
    feature-importance chart.

    Returns
    -------
    tuple of (fitted model, test predictions, test accuracy).
    """
    lgbm_params = dict(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        num_leaves=num_leaves,
        class_weight=class_weight,
        random_state=random_state,
    )
    clf = LGBMClassifier(**lgbm_params)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)

    print("LightGBM Results:")
    print(f"Accuracy: {test_accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred, "LightGBM", dataset_name)
    plot_feature_importance(clf.feature_importances_, X_train, "LightGBM")

    return clf, y_pred, test_accuracy

#------------------------------------------------------------PLOTS-------------------------------------------------------------

def plot_confusion_matrix(y_true, y_pred, model_name, dataset_name):
    """Draw an annotated confusion-matrix heatmap for one model/dataset pair.

    The colour map and optional background colour are picked from the first
    keyword found in ``model_name``; models matching no keyword use 'Reds'
    with no background override. The title shows the model name with the
    dataset name on a second line.
    """
    # (keyword looked up in model_name, colour map, axes background)
    palette_by_keyword = (
        ('Random Forest', 'Oranges', '#f5f5f5'),
        ('XGBoost', 'Oranges', '#f0f8ff'),
        ('LightGBM', 'Oranges', '#fffaf0'),
    )
    cmap, bg_color = 'Reds', None  # fallback, e.g. Logistic Regression and SVM
    for keyword, keyword_cmap, keyword_bg in palette_by_keyword:
        if keyword in model_name:
            cmap, bg_color = keyword_cmap, keyword_bg
            break

    plt.figure(figsize=(8, 6))
    counts = confusion_matrix(y_true, y_pred)

    ax = sns.heatmap(
        counts,
        annot=True,
        fmt='d',
        cmap=cmap,
        annot_kws={'size': 16, 'weight': 'bold'},
        cbar=False,
        linewidths=1,
        linecolor='lightgray'
    )

    heading = f'{model_name} Confusion Matrix\n{dataset_name}'
    plt.title(heading, pad=20, fontsize=16, weight='bold')
    plt.xlabel('Predicted Label', fontsize=14, weight='bold', labelpad=10)
    plt.ylabel('True Label', fontsize=14, weight='bold', labelpad=10)

    # Re-apply the existing tick labels with a larger, bold font.
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=14, weight='bold')
    ax.set_yticklabels(ax.get_yticklabels(), fontsize=14, weight='bold')

    if bg_color:
        plt.gca().set_facecolor(bg_color)

    plt.tight_layout()
    plt.show()

def plot_feature_importance(importances, X_train, model_name, top_n=8):
    """Draw a horizontal bar chart of the ``top_n`` largest importances.

    Parameters
    ----------
    importances : array of one importance value per feature.
    X_train : training features; ``X_train.columns`` supplies the labels when
        present, otherwise generic 'Feature i' labels are generated.
    model_name : used in the title and to choose bar colour and x-axis label.
    top_n : maximum number of features shown.
    """
    if hasattr(X_train, 'columns'):
        feature_names = X_train.columns
    else:
        feature_names = [f'Feature {i}' for i in range(X_train.shape[1])]

    # Indices of the largest importances, in descending order.
    n_features = min(top_n, len(feature_names))
    descending = np.argsort(importances)[::-1]
    top_indices = descending[:n_features]

    is_tree_model = any(
        keyword in model_name for keyword in ['Random Forest', 'XGBoost', 'LightGBM']
    )
    color = '#A5C3CF' if is_tree_model else '#AC3E3E'

    # The SVM check comes first, as its bars show permutation importances.
    if 'SVM' in model_name:
        x_label = "Mean Accuracy Decrease After Permutation"
    elif is_tree_model:
        x_label = "Importance Score"
    else:
        x_label = "Absolute Coefficient Value"

    bar_positions = range(n_features)

    plt.figure(figsize=(10, 6))
    plt.title(f"Top Feature Importances - {model_name}")
    plt.barh(bar_positions, importances[top_indices],
             align='center', color=color, height=0.7)
    plt.yticks(bar_positions, np.array(feature_names)[top_indices])
    plt.xlabel(x_label)

    # Largest importance at the top of the chart.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

    
#----------------------------------------------------------------RUN FUNCTIONS--------------------------------------------------

# Each entry: (dataset name, X_train, y_train, X_test, y_test).
datasets = [
    ('Original', X_train, y_train, X_test, y_test),
    ('Scaled', X_train_scaled, y_train, X_test_scaled, y_test),
    ('Selected', X_train_selected, y_train, X_test_selected, y_test),
    ('Forward selection', X_train_FS, y_train, X_test_FS, y_test),
    ('Domain', X_train_domain, y_train, X_test_domain, y_test),
    ('PCA', X_train_pca, y_train, X_test_pca, y_test),
    ('PLS', X_train_pls, y_train, X_test_pls, y_test)
]

# NOTE(review): X_train_combined / X_test_combined are not created anywhere in
# the visible notebook, so referencing them unconditionally raises NameError on
# a fresh kernel. The 'Combined' dataset is added, at its original position,
# only when both frames exist.
if 'X_train_combined' in globals() and 'X_test_combined' in globals():
    datasets.insert(2, ('Combined', X_train_combined, y_train, X_test_combined, y_test))
else:
    print("Skipping 'Combined' dataset: X_train_combined / X_test_combined are not defined.")

model_functions = {
    'Logistic Regression': run_logistic_regression,
    'SVM': run_svm,
    'Random Forest': run_random_forest,
    'XGBoost': run_xgboost,
    'LightGBM': run_lightgbm
}


# Nested mapping: model name -> dataset name -> test accuracy.
all_accuracies = {model_name: {} for model_name in model_functions.keys()}

# Run all models on all datasets
for model_name, model_func in model_functions.items():
    
    print(f"RUNNING {model_name.upper()} MODELS")
    
    for dataset_name, X_tr, y_tr, X_te, y_te in datasets:
        print(f"\n--- {model_name} on {dataset_name} Dataset ---")
        try:
            _, _, acc = model_func(X_tr, y_tr, X_te, y_te, dataset_name=dataset_name)
            all_accuracies[model_name][dataset_name] = acc
        except Exception as e:
            # Best effort: one failing combination must not stop the sweep.
            # The failure is reported and recorded as accuracy 0.0.
            print(f"Error running {model_name} on {dataset_name}: {str(e)}")
            all_accuracies[model_name][dataset_name] = 0.0

print("FINAL ACCURACY SUMMARY")

for model_name, per_dataset in all_accuracies.items():
    print(f"\n{model_name}:")
    for dataset_name, acc in per_dataset.items():
        print(f"  {dataset_name:15}: {acc:.4f}")

# Best performing model: the first combination whose accuracy is strictly above
# every earlier one (and above 0) wins.
best_acc = 0
best_combo = ""
scored_combos = (
    (acc, f"{model_name} on {dataset_name}")
    for model_name, per_dataset in all_accuracies.items()
    for dataset_name, acc in per_dataset.items()
)
for acc, combo_label in scored_combos:
    if acc > best_acc:
        best_acc, best_combo = acc, combo_label

print(f"\nBest Performance: {best_combo} with accuracy {best_acc:.4f}")