# In [None]:
# AS AdaBoost
import csv
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os


# Function to load CSV data into a dictionary
def csv2dict(file):
    """Read a CSV file into a list of per-row dictionaries.

    The first line of the file supplies the column names. Every cell is
    converted to ``float`` when possible; a value that cannot be converted is
    stored unchanged. That covers non-numeric strings as well as the
    placeholders ``csv.DictReader`` produces for ragged rows (``None`` for a
    cell missing from a short row, and a list of surplus cells stored under
    the ``None`` key for a long row).

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list containing one ``{column name: value}`` dict per data row.
    """
    dicts = []
    # newline='' is what the csv module asks for so that line breaks inside
    # quoted fields are handed to the parser untouched.
    with open(file, mode='r', encoding='utf-8', newline='') as f:
        for row in csv.DictReader(f):
            new_dict = {}
            for key, value in row.items():
                try:
                    new_dict[key] = float(value)
                except (TypeError, ValueError):
                    # ValueError: non-numeric text. TypeError: None or a list
                    # coming from a row shorter/longer than the header.
                    new_dict[key] = value
            dicts.append(new_dict)
    return dicts


# Function to load training data
def load_training_data(features_location, performance_location):
    """Load one feature CSV and one performance CSV as training arrays.

    Rows of the two files are paired by position; pairing stops at the end of
    the shorter file. For every feature row each numeric value is kept and
    every non-numeric value is replaced by 0.0. The target of a row is the
    value of the first column of the matching performance row, with 0.0 used
    when that value is missing or cannot be converted to ``float``.

    Args:
        features_location: Path of the feature CSV file.
        performance_location: Path of the performance CSV file.

    Returns:
        A tuple ``(X, y, algorithms)`` where ``X`` is a NumPy array built from
        the per-row feature lists, ``y`` is a float NumPy array of targets and
        ``algorithms`` holds, for every performance row, the list of that
        row's column names.
    """
    feature_dicts = csv2dict(features_location)
    performance_matrix = csv2dict(performance_location)
    algorithms = [list(algorithm.keys()) for algorithm in performance_matrix]

    X = []
    y = []

    for d, p in zip(feature_dicts, performance_matrix):
        # Keep numeric cells, replace everything else with 0.0.
        X.append([
            float(value) if isinstance(value, (int, float)) else 0.0
            for value in d.values()
        ])

        # Only the first performance column is used as the regression target.
        try:
            target = float(list(p.values())[0])
        except (IndexError, TypeError, ValueError):
            # IndexError: empty row. ValueError: non-numeric text.
            # TypeError: a missing cell (None) or other non-scalar value.
            target = 0.0
        y.append(target)

    X = np.array(X)
    y = np.array(y, dtype=float)

    return X, y, algorithms


# Function to train an AdaBoost regressor
def ada_boost_regressor(X_clean, y_clean, *, n_estimators=100, random_state=42):
    """Fit an ``AdaBoostRegressor`` on the given training data.

    Args:
        X_clean: Training feature matrix passed to ``fit``.
        y_clean: Training targets passed to ``fit``.
        n_estimators: Forwarded to ``AdaBoostRegressor(n_estimators=...)``.
            Defaults to 100.
        random_state: Forwarded to ``AdaBoostRegressor(random_state=...)``.
            Defaults to 42.

    Returns:
        The fitted ``AdaBoostRegressor`` instance.
    """
    ada = AdaBoostRegressor(n_estimators=n_estimators, random_state=random_state)
    ada.fit(X_clean, y_clean)
    return ada


# Function to make predictions
def prediction(model, new_features_location, algorithms):
    """Predict performance for the rows of a feature CSV and pick a label.

    Feature rows are built the same way as for training: numeric values are
    kept and non-numeric values are replaced by 0.0.

    Args:
        model: Fitted regressor exposing ``predict``.
        new_features_location: Path of the feature CSV file to predict on.
        algorithms: List of column-name lists (the shape returned by
            ``load_training_data``); only ``algorithms[0]`` is read.

    Returns:
        A tuple ``(predicted_performance, best_algorithm)``. When the file
        yields no feature values, ``(None, None)`` is returned so that callers
        unpacking two values keep working.
    """
    new_X = [
        [float(value) if isinstance(value, (int, float)) else 0.0
         for value in new_feature_dict.values()]
        for new_feature_dict in csv2dict(new_features_location)
    ]
    new_X_clean = np.array(new_X)

    # Check for valid feature data
    if new_X_clean.size == 0:
        print("No valid features to predict.")
        return None, None

    predicted_performance = model.predict(new_X_clean)
    # NOTE(review): argmin runs over the predictions for the rows of the new
    # file, yet the result is used as an index into the performance column
    # names. This raises IndexError once the winning row index exceeds the
    # number of columns -- confirm the intended selection logic.
    best_algorithm = np.argmin(predicted_performance)

    return predicted_performance, algorithms[0][best_algorithm]


# Function to get the file names from a directory
def file_name(file_dir):
    """Return the names of the files directly inside ``file_dir``.

    Only the top level of the directory is inspected; files in
    sub-directories are not included.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        A list of file names (without directory part). An empty list is
        returned when ``os.walk`` yields nothing, e.g. because the directory
        does not exist.
    """
    # The first tuple produced by os.walk describes file_dir itself; the
    # default keeps the result a list even when there is nothing to walk.
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files


# Main implementation
def main():
    """Train an AdaBoost regressor on the CSV data and report its results.

    Steps: list and sort the feature and performance files, load each pair,
    stack everything into one data set, hold out 20% as a test set, fit the
    model, print the test-set mean squared error, and finally print the
    prediction for one fixed feature file.

    Raises:
        ValueError: If there are fewer performance files than feature files,
            so that some feature file would have no partner.
    """
    feature_directory = "data/feature_extraction"
    performance_directory = "data/performance"

    # `or []` guards against a listing that comes back empty-handed (None)
    # for a directory that cannot be walked.
    features_locations = sorted(file_name(feature_directory) or [])
    performance_locations = sorted(file_name(performance_directory) or [])

    if not features_locations:
        print(f"No feature files found in {feature_directory}.")
        return
    if len(performance_locations) < len(features_locations):
        raise ValueError(
            f"Found {len(features_locations)} feature files but only "
            f"{len(performance_locations)} performance files."
        )

    X = []
    y = []
    algorithms = []

    # Files are paired purely by their position after sorting.
    # NOTE(review): this assumes matching feature/performance files sort into
    # the same order -- confirm against the naming scheme of the data.
    for features_file, performance_file in zip(features_locations, performance_locations):
        features_path = os.path.join(feature_directory, features_file)
        performance_path = os.path.join(performance_directory, performance_file)

        X_temp, y_temp, algorithms_temp = load_training_data(features_path, performance_path)

        X.append(X_temp)
        y.append(y_temp)
        # Only the column names of the last file loaded are kept.
        algorithms = algorithms_temp

    # Combine all data
    X = np.vstack(X)  # Combine feature arrays
    y = np.concatenate(y)  # Combine target values

    # Split training and test data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the AdaBoost model
    ada_model = ada_boost_regressor(X_train, y_train)

    # Predict on the test set
    y_pred = ada_model.predict(X_test)

    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error (MSE) on the test set: {mse}")

    # Predict on new data
    new_features_location = "data/feature_extraction/etth2_336_features.csv"
    result = prediction(ada_model, new_features_location, algorithms)
    if result is None:
        # Nothing could be predicted; avoid unpacking a missing result.
        return
    predicted_performance, best_algorithm = result

    if predicted_performance is not None:
        print(f"Predicted performance values: {predicted_performance}")
        print(f"Best algorithm: {best_algorithm}")


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
