# Algorithm Selection Model using Random Forest

1. Import necessary packages.

In [8]:
import csv
import numpy as np
from sklearn.ensemble import RandomForestRegressor

2. `csv2dict(file)` function that loads the feature space from a `.csv` file.

In [9]:
def csv2dict(file):
    """Load a CSV file with a header row into a list of per-row dictionaries.

    Every cell is converted to ``float`` when possible; cells that cannot be
    converted (non-numeric text, empty strings, or the ``None`` filler that
    ``csv.DictReader`` uses for rows shorter than the header) are stored
    unchanged.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    list of dict
        One dictionary per data row, keyed by the header names.
    """
    dicts = []
    # newline='' lets the csv module do its own line-ending handling, so
    # newlines embedded in quoted fields are preserved as written.
    with open(file, mode='r', newline='') as f:
        for row in csv.DictReader(f):
            new_dict = {}
            for key, value in row.items():
                try:
                    new_dict[key] = float(value)
                except (TypeError, ValueError):
                    # ValueError: text that is not a number (including '').
                    # TypeError: value is not a string at all, e.g. the None
                    # filler DictReader inserts for a missing trailing cell.
                    new_dict[key] = value
            dicts.append(new_dict)
    return dicts

3. `load_training_data(features_location, performance_location)` function that loads feature sets and performance scores.

In [10]:
def load_training_data(features_location, performance_location):
    """Read one feature CSV and one performance CSV via ``csv2dict``.

    Returns a 3-tuple ``(feature_dicts, performance_matrix, algorithms)``:
    the parsed feature rows, the parsed performance rows, and, for each
    performance row, the list of its column names.
    """
    feature_dicts = csv2dict(features_location)
    performance_matrix = csv2dict(performance_location)

    # Collect the column names of every performance row.
    algorithms = []
    for performance_row in performance_matrix:
        algorithms.append(list(performance_row))

    return feature_dicts, performance_matrix, algorithms

4. `random_forest(X_clean, y_clean)` function that uses `RandomForestRegressor` to train the algorithm selection (AS) model.

In [11]:
def random_forest(X_clean, y_clean, n_estimators=100, random_state=42):
    """Fit a ``RandomForestRegressor`` on the given training data and return it.

    Parameters
    ----------
    X_clean : sequence of feature rows, passed unchanged to ``fit``.
    y_clean : sequence of targets, passed unchanged to ``fit``.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 42
        Seed handed to the regressor so repeated runs build the same forest.

    Returns
    -------
    RandomForestRegressor
        The fitted model.
    """
    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(X_clean, y_clean)
    return rf

5. `prediction(rf, new_features_location, algorithms)` function that makes predictions for new datasets.

In [12]:
def _two_lowest(scores):
    """Return ``(best, second_best)``: indices of the two smallest entries of a 1-D row.

    The runner-up is found by masking the winner with +inf and taking the
    minimum again, so ties resolve to the earliest column and a row with a
    single entry yields the same index twice.
    """
    best = int(np.argmin(scores))
    masked = np.array(scores, dtype=float)  # copy: the caller's row stays untouched
    masked[best] = np.inf
    return best, int(np.argmin(masked))


def prediction(rf, new_features_location, algorithms, desired_keys=None):
    """Predict performance for new feature rows and print the two best algorithms.

    Parameters
    ----------
    rf : fitted regressor
        Must expose ``predict`` and return a 2-D array (rows x outputs).
    new_features_location : str or path-like
        Feature CSV to load with ``csv2dict``.
    algorithms : list of list of str
        Output column names; only ``algorithms[0]`` is used. Even positions
        are reported as MSE columns and odd positions as MAE columns
        (assumes the performance files interleave them that way - confirm).
    desired_keys : sequence of str, optional
        Feature names, in model input order. Defaults to the ten features
        listed below; it must match the list used when ``rf`` was trained.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the feature file contains no data rows.
    """
    if desired_keys is None:
        desired_keys = ['Horizontal Length', 'MULL_Spectrogram mean coefficient_27.42Hz', 'weekday_Spectrogram mean coefficient_37.1Hz', 'HULL_MFCC_5', 'HULL_Centroid', 'OT_Wavelet variance_12.5Hz', 'HULL_Spectral roll-off', 'MULL_Wavelet standard deviation_12.5Hz', 'LULL_Positive turning points', 'LUFL_Spectrogram mean coefficient_27.42Hz']

    new_feature_dicts = csv2dict(new_features_location)
    if not new_feature_dicts:
        raise ValueError("no feature rows found in {!r}".format(new_features_location))

    # Build one fixed-width row per input record. A missing or empty feature
    # becomes 0.0 so every column stays aligned with desired_keys (dropping
    # absent keys would shift the remaining values into the wrong columns).
    new_X_clean = []
    for new_feature_dict in new_feature_dicts:
        row = []
        for key in desired_keys:
            value = new_feature_dict.get(key, 0.0)
            row.append(0.0 if value == '' or value is None else value)
        new_X_clean.append(row)

    predicted_performance = rf.predict(new_X_clean)
    even_items = predicted_performance[:, ::2]
    odd_items = predicted_performance[:, 1::2]

    algorithm = [algorithms[0][::2], algorithms[0][1::2]]

    print('Predicted performance:', predicted_performance)

    # Rank within each row separately: argmin over the whole 2-D array would
    # return a flattened index that is only valid when there is a single row.
    for row_index in range(len(predicted_performance)):
        best_algorithm_mse, second_best_algorithm_mse = _two_lowest(even_items[row_index])
        best_algorithm_mae, second_best_algorithm_mae = _two_lowest(odd_items[row_index])

        # Slices keep the 2-D shape so a single-row report prints as before.
        print("Predicted MSEs:", even_items[row_index:row_index + 1])
        print("Best algorithm for MSE:", algorithm[0][best_algorithm_mse])
        print("Second best algorithm for MSE:", algorithm[0][second_best_algorithm_mse])
        print("Predicted MAEs:", odd_items[row_index:row_index + 1])
        print("Best algorithm for MAE:", algorithm[1][best_algorithm_mae])
        print("Second best algorithm for MAE:", algorithm[1][second_best_algorithm_mae])
    return

6. Implementation part, where we call the functions above and get predicted results.

In [13]:
# traverse files in the directory
import os
def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Only the first entry produced by ``os.walk`` is used, so files in
    subdirectories are not included. If ``file_dir`` cannot be walked
    (for example because it does not exist) an empty list is returned,
    so callers always receive a list.
    """
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files

FEATURE_DIR = "data/feature_extraction"
PERFORMANCE_DIR = "data/performance"

# Both listings are sorted so that the i-th feature file is paired with the
# i-th performance file; this relies on the two directories using matching
# file-name prefixes.
features_locations = sorted(file_name(FEATURE_DIR) or [])
performance_locations = sorted(file_name(PERFORMANCE_DIR) or [])

if not features_locations:
    raise FileNotFoundError("no feature files found in {!r}".format(FEATURE_DIR))
if len(features_locations) != len(performance_locations):
    # Pairing is purely positional, so a count mismatch means some feature
    # file would be matched with the wrong (or no) performance file.
    raise ValueError(
        "found {} feature files but {} performance files".format(
            len(features_locations), len(performance_locations)))

# Feature columns fed to the model, in input order (loop-invariant).
desired_keys = ['Horizontal Length', 'MULL_Spectrogram mean coefficient_27.42Hz', 'weekday_Spectrogram mean coefficient_37.1Hz', 'HULL_MFCC_5', 'HULL_Centroid', 'OT_Wavelet variance_12.5Hz', 'HULL_Spectral roll-off', 'MULL_Wavelet standard deviation_12.5Hz', 'LULL_Positive turning points', 'LUFL_Spectrogram mean coefficient_27.42Hz']

X = []
y = []

for features_name, performance_name in zip(features_locations, performance_locations):
    # `algorithms` is overwritten on every pass; the value from the last
    # file is what later cells receive.
    feature_dicts, performance_matrix, algorithms = load_training_data(
        FEATURE_DIR + "/" + features_name, PERFORMANCE_DIR + "/" + performance_name)
    if not feature_dicts or not performance_matrix:
        raise ValueError(
            "no data rows in {!r} or {!r}".format(features_name, performance_name))

    # Only the first row of each file is used as the training sample.
    # Missing or empty features are replaced with 0.0.
    temp_X = [feature_dicts[0].get(key, 0.0) for key in desired_keys]
    temp_X_clean = [0.0 if val == '' or val is None else val for val in temp_X]
    X.append(temp_X_clean)

    temp_y = list(performance_matrix[0].values())
    y.append(temp_y)

rf = random_forest(X, y)

In [31]:
# Dump the training feature matrix X to disk, one CSV line per entry of X.
with open('X_selected_rf-importance.csv', 'w', newline='', encoding='utf-8') as out_file:
    csv.writer(out_file).writerows(X)

### For the `etth1` dataset:

In [14]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth1_96_features.csv", algorithms))

Predicted performance: [[0.36179 0.37885 0.34783 0.37252 0.36287 0.38923 0.36439 0.39062 0.36211
  0.39569 0.39965 0.42694 0.3606  0.38855 0.3822  0.4056  0.40498 0.42808
  0.35959 0.38828]]
Predicted MSEs: [[0.36179 0.34783 0.36287 0.36439 0.36211 0.39965 0.3606  0.3822  0.40498
  0.35959]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  DLinear_MSE
Predicted MAEs: [[0.37885 0.37252 0.38923 0.39062 0.39569 0.42694 0.38855 0.4056  0.42808
  0.38828]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [15]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth1_192_features.csv", algorithms))

Predicted performance: [[0.39716 0.40153 0.38263 0.39436 0.40107 0.41434 0.39689 0.40922 0.39315
  0.4157  0.44512 0.45827 0.39315 0.40907 0.42049 0.42962 0.45441 0.45852
  0.40045 0.41729]]
Predicted MSEs: [[0.39716 0.38263 0.40107 0.39689 0.39315 0.44512 0.39315 0.42049 0.45441
  0.40045]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  Supervised_PatchTST_MSE
Predicted MAEs: [[0.40153 0.39436 0.41434 0.40922 0.4157  0.45827 0.40907 0.42962 0.45852
  0.41729]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [16]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth1_336_features.csv", algorithms))

Predicted performance: [[0.43428 0.42245 0.4137  0.41429 0.43641 0.43604 0.42499 0.42669 0.41404
  0.43069 0.48558 0.48654 0.42286 0.42916 0.45885 0.45524 0.49639 0.48575
  0.43486 0.44137]]
Predicted MSEs: [[0.43428 0.4137  0.43641 0.42499 0.41404 0.48558 0.42286 0.45885 0.49639
  0.43486]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  SimMTM_MSE
Predicted MAEs: [[0.42245 0.41429 0.43604 0.42669 0.43069 0.48654 0.42916 0.45524 0.48575
  0.44137]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [17]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth1_720_features.csv", algorithms))

Predicted performance: [[0.44958 0.44337 0.41681 0.43175 0.45182 0.45773 0.45609 0.4618  0.43106
  0.44924 0.56932 0.55219 0.45596 0.4634  0.52824 0.51259 0.51666 0.5021
  0.514   0.50842]]
Predicted MSEs: [[0.44958 0.41681 0.45182 0.45609 0.43106 0.56932 0.45596 0.52824 0.51666
  0.514  ]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  SimMTM_MSE
Predicted MAEs: [[0.44337 0.43175 0.45773 0.4618  0.44924 0.55219 0.4634  0.51259 0.5021
  0.50842]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


### For the `etth2` dataset:

In [18]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth2_96_features.csv", algorithms))

Predicted performance: [[0.32368 0.35746 0.31277 0.35103 0.31933 0.36618 0.3099  0.36051 0.31519
  0.36118 0.31606 0.37184 0.30656 0.35535 0.32821 0.37447 0.36579 0.41179
  0.32287 0.37533]]
Predicted MSEs: [[0.32368 0.31277 0.31933 0.3099  0.31519 0.31606 0.30656 0.32821 0.36579
  0.32287]]
Best algorithm for MSE:  Supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.35746 0.35103 0.36618 0.36051 0.36118 0.37184 0.35535 0.37447 0.41179
  0.37533]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Supervised_PatchTST_MAE
None


In [19]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth2_192_features.csv", algorithms))

Predicted performance: [[0.36485 0.3849  0.35494 0.37964 0.37191 0.3978  0.34977 0.38706 0.35345
  0.38508 0.35442 0.39789 0.34861 0.38436 0.37951 0.40592 0.3972  0.43009
  0.38734 0.41856]]
Predicted MSEs: [[0.36485 0.35494 0.37191 0.34977 0.35345 0.35442 0.34861 0.37951 0.3972
  0.38734]]
Best algorithm for MSE:  Supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.3849  0.37964 0.3978  0.38706 0.38508 0.39789 0.38436 0.40592 0.43009
  0.41856]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Supervised_PatchTST_MAE
None


In [20]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth2_336_features.csv", algorithms))

Predicted performance: [[0.39413 0.4084  0.38548 0.40389 0.39123 0.41258 0.38191 0.41264 0.38246
  0.40655 0.40011 0.42888 0.37547 0.40655 0.41201 0.42884 0.40852 0.43873
  0.44773 0.45732]]
Predicted MSEs: [[0.39413 0.38548 0.39123 0.38191 0.38246 0.40011 0.37547 0.41201 0.40852
  0.44773]]
Best algorithm for MSE:  Supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.4084  0.40389 0.41258 0.41264 0.40655 0.42888 0.40655 0.42884 0.43873
  0.45732]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Supervised_PatchTST_MAE
None


In [21]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/etth2_720_features.csv", algorithms))

Predicted performance: [[0.41761 0.4291  0.40771 0.42511 0.41482 0.43951 0.41703 0.44475 0.41135
  0.4329  0.52305 0.5031  0.41049 0.43914 0.44713 0.45925 0.45258 0.46694
  0.63212 0.55719]]
Predicted MSEs: [[0.41761 0.40771 0.41482 0.41703 0.41135 0.52305 0.41049 0.44713 0.45258
  0.63212]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  Supervised_PatchTST_MSE
Predicted MAEs: [[0.4291  0.42511 0.43951 0.44475 0.4329  0.5031  0.43914 0.45925 0.46694
  0.55719]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


### For the `ettm1` dataset:

In [22]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm1_96_features.csv", algorithms))

Predicted performance: [[0.30589 0.3473  0.29565 0.34111 0.28774 0.34514 0.29421 0.34839 0.29959
  0.35119 0.30657 0.35397 0.30041 0.34736 0.30931 0.36138 0.34466 0.38095
  0.30432 0.34912]]
Predicted MSEs: [[0.30589 0.29565 0.28774 0.29421 0.29959 0.30657 0.30041 0.30931 0.34466
  0.30432]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.3473  0.34111 0.34514 0.34839 0.35119 0.35397 0.34736 0.36138 0.38095
  0.34912]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Self-supervised_PatchTST_MAE
None


In [23]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm1_192_features.csv", algorithms))

Predicted performance: [[0.3467  0.37361 0.33644 0.36738 0.32455 0.3704  0.32998 0.37087 0.33418
  0.37334 0.35024 0.3827  0.33872 0.37353 0.34543 0.38256 0.42243 0.41788
  0.33866 0.36938]]
Predicted MSEs: [[0.3467  0.33644 0.32455 0.32998 0.33418 0.35024 0.33872 0.34543 0.42243
  0.33866]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.37361 0.36738 0.3704  0.37087 0.37334 0.3827  0.37353 0.38256 0.41788
  0.36938]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  DLinear_MAE
None


In [24]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm1_336_features.csv", algorithms))

Predicted performance: [[0.38135 0.3956  0.37006 0.3882  0.34772 0.384   0.35997 0.38935 0.36365
  0.39284 0.3844  0.40515 0.37179 0.39533 0.37509 0.40041 0.4476  0.43863
  0.36959 0.39124]]
Predicted MSEs: [[0.38135 0.37006 0.34772 0.35997 0.36365 0.3844  0.37179 0.37509 0.4476
  0.36959]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.3956  0.3882  0.384   0.38935 0.39284 0.40515 0.39533 0.40041 0.43863
  0.39124]]
Best algorithm for MAE:  Self-supervised_PatchTST_MAE
Second best algorithm for MAE:  GPHT_MAE
None


In [25]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm1_720_features.csv", algorithms))

Predicted performance: [[0.44258 0.42896 0.42764 0.42104 0.39519 0.41564 0.40885 0.41973 0.40657
  0.41895 0.45199 0.44666 0.41561 0.42599 0.43175 0.43471 0.49614 0.46676
  0.43837 0.43497]]
Predicted MSEs: [[0.44258 0.42764 0.39519 0.40885 0.40657 0.45199 0.41561 0.43175 0.49614
  0.43837]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  SimMTM_MSE
Predicted MAEs: [[0.42896 0.42104 0.41564 0.41973 0.41895 0.44666 0.42599 0.43471 0.46676
  0.43497]]
Best algorithm for MAE:  Self-supervised_PatchTST_MAE
Second best algorithm for MAE:  SimMTM_MAE
None


### For the `ettm2` dataset:

In [26]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm2_96_features.csv", algorithms))

Predicted performance: [[0.20716 0.27558 0.19699 0.2686  0.19947 0.2778  0.19789 0.27963 0.1989
  0.28031 0.20807 0.28703 0.20213 0.28111 0.20543 0.28728 0.21622 0.2936
  0.19855 0.28495]]
Predicted MSEs: [[0.20716 0.19699 0.19947 0.19789 0.1989  0.20807 0.20213 0.20543 0.21622
  0.19855]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.27558 0.2686  0.2778  0.27963 0.28031 0.28703 0.28111 0.28728 0.2936
  0.28495]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [27]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm2_192_features.csv", algorithms))

Predicted performance: [[0.25033 0.3034  0.23827 0.29647 0.24358 0.30924 0.23927 0.30791 0.238
  0.30458 0.2517  0.31749 0.24503 0.31169 0.25202 0.31803 0.25663 0.31902
  0.24335 0.31747]]
Predicted MSEs: [[0.25033 0.23827 0.24358 0.23927 0.238   0.2517  0.24503 0.25202 0.25663
  0.24335]]
Best algorithm for MSE:  SimMTM_MSE
Second best algorithm for MSE:  GPHT_MSE
Predicted MAEs: [[0.3034  0.29647 0.30924 0.30791 0.30458 0.31749 0.31169 0.31803 0.31902
  0.31747]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [28]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm2_336_features.csv", algorithms))

Predicted performance: [[0.30719 0.33866 0.29224 0.33171 0.29835 0.34772 0.29535 0.34698 0.28932
  0.33716 0.31248 0.3578  0.29775 0.34901 0.30002 0.34841 0.31144 0.35451
  0.30811 0.36344]]
Predicted MSEs: [[0.30719 0.29224 0.29835 0.29535 0.28932 0.31248 0.29775 0.30002 0.31144
  0.30811]]
Best algorithm for MSE:  SimMTM_MSE
Second best algorithm for MSE:  GPHT_MSE
Predicted MAEs: [[0.33866 0.33171 0.34772 0.34698 0.33716 0.3578  0.34901 0.34841 0.35451
  0.36344]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  SimMTM_MAE
None


In [29]:
# prediction() prints its own report and ends with a bare `return`, so the
# outer print() only adds the trailing `None` line seen in the output.
print(prediction(rf, "data/feature_extraction/ettm2_720_features.csv", algorithms))

Predicted performance: [[0.37259 0.37892 0.35415 0.37175 0.36054 0.38827 0.36132 0.38981 0.34957
  0.37657 0.38069 0.40077 0.35348 0.38797 0.35993 0.38727 0.38315 0.39615
  0.39466 0.41844]]
Predicted MSEs: [[0.37259 0.35415 0.36054 0.36132 0.34957 0.38069 0.35348 0.35993 0.38315
  0.39466]]
Best algorithm for MSE:  SimMTM_MSE
Second best algorithm for MSE:  Supervised_PatchTST_MSE
Predicted MAEs: [[0.37892 0.37175 0.38827 0.38981 0.37657 0.40077 0.38797 0.38727 0.39615
  0.41844]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  SimMTM_MAE
None
