# Algorithm Selection Model using Random Forest

1. Import necessary packages.

In [1]:
import csv
import numpy as np
from sklearn.ensemble import RandomForestRegressor

2. `csv2dict(file)` function that loads the feature space from a `.csv` file.

In [2]:
def csv2dict(file):
    """Load a CSV file into a list of per-row dictionaries.

    The first line of the file is used as the header. Every cell that
    ``float()`` accepts is stored as a float; every other cell is kept
    unchanged (for example an empty cell stays ``''``).

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file.

    Returns
    -------
    list of dict
        One dictionary per data row, keyed by column name, in file order.
    """
    dicts = []
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for files passed to a reader.
    with open(file, mode='r', newline='') as f:
        # restval='' makes cells missing from a short row look like empty
        # cells; with the default (None) float() raised an uncaught TypeError.
        csv_reader = csv.DictReader(f, restval='')
        for row in csv_reader:
            new_dict = {}
            for key, value in row.items():
                try:
                    new_dict[key] = float(value)
                except (TypeError, ValueError):
                    # Non-numeric text, or the list of surplus cells that
                    # DictReader stores under the key None for over-long rows.
                    new_dict[key] = value
            dicts.append(new_dict)
    return dicts

3. `load_training_data(features_location, performance_location)` function that loads feature sets and performance scores.

In [3]:
def load_training_data(features_location, performance_location):
    """Read one feature CSV and one performance CSV.

    Both files are parsed with ``csv2dict``.

    Returns
    -------
    tuple
        ``(feature_rows, performance_rows, column_names)`` where
        ``column_names`` holds, for every performance row, the list of that
        row's keys (the performance file's column names).
    """
    features = csv2dict(features_location)
    scores = csv2dict(performance_location)

    column_names = []
    for score_row in scores:
        # Iterating a dict yields its keys in insertion (column) order.
        column_names.append(list(score_row))

    return features, scores, column_names

4. `random_forest(X_clean, y_clean)` function that uses `RandomForestRegressor` to train the algorithm selection (AS) model.

In [4]:
def random_forest(X_clean, y_clean, n_estimators=100, random_state=42):
    """Fit a ``RandomForestRegressor`` and return the fitted model.

    Parameters
    ----------
    X_clean : sequence of feature rows
        Training inputs, one row per dataset.
    y_clean : sequence of target rows
        Training targets; in this notebook each row holds one performance
        score per algorithm.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int or None, default 42
        Seed passed to the regressor so repeated runs give the same model.

    Returns
    -------
    RandomForestRegressor
        The fitted regressor.
    """
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_clean, y_clean)
    return rf

5. `prediction(rf, new_features_location, algorithms)` function that predicts the best algorithm for a new dataset.

In [5]:
def prediction(rf, new_features_location, algorithms):
    """Return the name of the algorithm with the lowest predicted score.

    Parameters
    ----------
    rf : fitted regressor
        Model returned by ``random_forest``.
    new_features_location : str
        CSV file with the features of the new dataset. Only its first data
        row is used for the decision, mirroring training, which also uses
        the first row of every feature file. The columns must be in the
        same order as in the training files, because values are passed to
        the model positionally.
    algorithms : list of list of str
        Algorithm names as returned by ``load_training_data``; the names
        are taken from ``algorithms[0]``.

    Returns
    -------
    str
        Entry of ``algorithms[0]`` whose predicted score is smallest.

    Raises
    ------
    ValueError
        If the feature file contains no data rows.
    """
    new_feature_dicts = csv2dict(new_features_location)
    if not new_feature_dicts:
        raise ValueError(
            "no feature rows found in {!r}".format(new_features_location)
        )

    # Empty cells are replaced by 0.0, exactly as during training.
    new_X_clean = [
        [0.0 if val == '' else val for val in new_feature_dict.values()]
        for new_feature_dict in new_feature_dicts
    ]

    predicted_performance = np.asarray(rf.predict(new_X_clean))
    # One row per input row, one column per algorithm. Reshaping also covers
    # a single-output model, whose predictions come back one-dimensional.
    per_row = predicted_performance.reshape(len(new_X_clean), -1)

    # argmin over the first row only: taking it over the flattened array
    # could yield an index beyond the number of algorithms when the file
    # has more than one row.
    best_algorithm = int(np.argmin(per_row[0]))

    return algorithms[0][best_algorithm]

6. Implementation part, where we call the functions above and get predicted results.

In [6]:
# traverse files in the directory
import os
def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Sub-directories are not descended into. If ``file_dir`` does not exist
    or cannot be read, an empty list is returned (previously the function
    fell through and returned ``None``, which broke the later ``.sort()``).

    Parameters
    ----------
    file_dir : str
        Directory to list.

    Returns
    -------
    list of str
        Bare file names (without the directory part), in the order reported
        by ``os.walk``.
    """
    # os.walk yields the top directory first; it yields nothing at all when
    # the directory is missing, hence the default triple.
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files

# Feature files, sorted by name so they can be matched by position
# with the performance files below.
features_locations = file_name("data/feature_extraction")
features_locations.sort()

# Performance files, sorted the same way.
performance_locations = file_name("data/performance")
performance_locations.sort()

In [7]:
X = []
y = []
algorithms = None

# Feature and performance files are matched by their sorted position, so the
# two directories must contain the same number of files.
if len(features_locations) != len(performance_locations):
    raise ValueError(
        "found {} feature files but {} performance files".format(
            len(features_locations), len(performance_locations)
        )
    )

for features_file, performance_file in zip(features_locations, performance_locations):
    feature_dicts, performance_matrix, file_algorithms = load_training_data(
        os.path.join("data/feature_extraction", features_file),
        os.path.join("data/performance", performance_file),
    )
    if not feature_dicts or not performance_matrix:
        raise ValueError(
            "no data rows in {!r} or {!r}".format(features_file, performance_file)
        )

    # Every performance file must list the same algorithms in the same
    # order; otherwise the columns of y would not line up across datasets.
    if algorithms is None:
        algorithms = file_algorithms
    elif file_algorithms[0] != algorithms[0]:
        raise ValueError(
            "algorithm columns of {!r} differ from the first performance file".format(
                performance_file
            )
        )

    # Only the first row of each file is used; empty feature cells become 0.0.
    temp_X = list(feature_dicts[0].values())
    temp_X_clean = [0.0 if val == '' else val for val in temp_X]
    X.append(temp_X_clean)

    # Scores are passed through as-is (no '' -> 0.0 substitution here).
    temp_y = list(performance_matrix[0].values())
    y.append(temp_y)

if not X:
    raise ValueError("no training files found in data/feature_extraction")

rf = random_forest(X, y)
# NOTE: this file is read from the training directory, so if it exists it was
# also part of the training data; the result is a sanity check, not a
# held-out evaluation.
print(prediction(rf, "data/feature_extraction/etth2_336_features.csv", algorithms))

 Supervised_PatchTST_MSE
