# Algorithm Selection Model using Random Forest

1. Import necessary packages.

In [1]:
import csv
import numpy as np
from sklearn.ensemble import RandomForestRegressor

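2. The `csv2dict(file)` function loads a `.csv` file into a list of dictionaries (one per row), converting numeric values to `float`. It is used for both the feature files and the performance files.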

In [2]:
def csv2dict(file):
    """Load a CSV file into a list of per-row dictionaries.

    Rows are keyed by the header names exactly as they appear in the file
    (no whitespace stripping is applied). Every value that parses as a
    ``float`` is converted; anything else -- text, empty strings, or the
    ``None`` / list placeholders ``csv.DictReader`` produces for rows that are
    shorter or longer than the header -- is stored unchanged.

    Args:
        file: Path of the CSV file to read.

    Returns:
        list[dict]: One dictionary per data row, in file order.
    """
    dicts = []
    # newline='' is what the csv module expects: it lets the reader handle
    # line endings itself so newlines embedded in quoted fields survive intact.
    with open(file, mode='r', newline='') as f:
        for row in csv.DictReader(f):
            new_dict = {}
            for key, value in row.items():
                try:
                    new_dict[key] = float(value)
                except (TypeError, ValueError):
                    # ValueError: non-numeric text; TypeError: None/list from
                    # a ragged row. Keep the raw value in both cases.
                    new_dict[key] = value
            dicts.append(new_dict)
    return dicts

3. `load_training_data(features_location, performance_location)` function that loads feature sets and performance scores.

In [3]:
def load_training_data(features_location, performance_location):
    """Read one feature CSV and its matching performance CSV.

    Args:
        features_location: Path of the feature CSV.
        performance_location: Path of the performance CSV.

    Returns:
        tuple: ``(feature_rows, performance_rows, column_names)`` where the
        first two are the lists of row dictionaries produced by ``csv2dict``
        and ``column_names`` holds, for every performance row, the list of
        that row's keys (the performance CSV header).
    """
    feature_rows = csv2dict(features_location)
    performance_rows = csv2dict(performance_location)
    # Iterating a dict yields its keys, so list(row) is the header per row.
    column_names = [list(row) for row in performance_rows]
    return feature_rows, performance_rows, column_names

4. The `random_forest(X_clean, y_clean)` function uses `RandomForestRegressor` to train the algorithm-selection (AS) model.

In [4]:
def random_forest(X_clean, y_clean):
    """Fit a random-forest regressor on the given training data.

    Args:
        X_clean: Training feature matrix, passed straight to ``fit``.
        y_clean: Training targets, passed straight to ``fit``.

    Returns:
        RandomForestRegressor: The fitted model (100 trees, seed 42).
    """
    model = RandomForestRegressor(random_state=42, n_estimators=100)
    # fit() returns the estimator itself, so the fitted model is returned.
    return model.fit(X_clean, y_clean)

5. The `prediction(rf, new_features_location, algorithms)` function makes predictions for a new dataset and reports the best and second-best algorithms by predicted MSE and by predicted MAE.

In [5]:
def _best_two_indices(scores):
    """Return ``(best, second_best)`` indices of the two smallest entries of a 1-D array."""
    best = int(np.argmin(scores))
    masked = np.array(scores, dtype=float)  # np.array copies, so `scores` is untouched
    masked[best] = np.inf  # hide the winner so the next argmin finds the runner-up
    return best, int(np.argmin(masked))


def prediction(rf, new_features_location, algorithms, desired_keys=None):
    """Predict algorithm performance for a feature file and print the ranking.

    The model output is treated as interleaved columns: even positions are
    reported as MSE predictions and odd positions as MAE predictions, and the
    names in ``algorithms[0]`` are split the same way.

    Args:
        rf: Fitted model exposing ``predict`` and returning a 2-D array.
        new_features_location: Path of the feature CSV to predict for.
        algorithms: List of column-name lists; only ``algorithms[0]`` is used.
        desired_keys: Feature columns fed to the model, in order. Defaults to
            the nine columns the notebook's model was trained on.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If the file has no rows or a row lacks a requested column.
    """
    if desired_keys is None:
        # NOTE(review): the leading space in ' LULL_ECDF Percentile Count_0'
        # is kept as-is; presumably it mirrors the CSV header -- confirm.
        desired_keys = ['Horizontal Length', ' LULL_ECDF Percentile Count_0', 'LULL_Sum absolute diff', 'LUFL_Wavelet variance_2.78Hz', 'HULL_Wavelet energy_5.0Hz', 'HUFL_LPCC_10', 'OT_MFCC_11', 'OT_MFCC_0', 'HULL_Spectral spread']

    new_feature_dicts = csv2dict(new_features_location)
    if not new_feature_dicts:
        raise ValueError(f"No feature rows found in {new_features_location!r}")

    new_X_clean = []
    for row_number, feature_row in enumerate(new_feature_dicts):
        # Fail loudly instead of silently feeding the model a shorter vector.
        missing = [key for key in desired_keys if key not in feature_row]
        if missing:
            raise ValueError(
                f"Row {row_number} of {new_features_location!r} is missing feature columns: {missing}"
            )
        # Empty cells are treated as 0.0.
        new_X_clean.append([0.0 if feature_row[key] == '' else feature_row[key] for key in desired_keys])

    predicted_performance = rf.predict(new_X_clean)
    mse_scores = predicted_performance[:, ::2]
    mae_scores = predicted_performance[:, 1::2]
    mse_names = algorithms[0][::2]
    mae_names = algorithms[0][1::2]

    print('Predicted performance:', predicted_performance)

    row_count = len(mse_scores)
    for row in range(row_count):
        if row_count > 1:
            print(f"Row {row}:")
        # Rank each input row on its own; a flattened argmin over the whole
        # matrix would point outside the row for multi-row input.
        best_mse, second_mse = _best_two_indices(mse_scores[row])
        best_mae, second_mae = _best_two_indices(mae_scores[row])

        print("Predicted MSEs:", mse_scores[row:row + 1])
        print("Best algorithm for MSE:", mse_names[best_mse])
        print("Second best algorithm for MSE:", mse_names[second_mse])
        print("Predicted MAEs:", mae_scores[row:row + 1])
        print("Best algorithm for MAE:", mae_names[best_mae])
        print("Second best algorithm for MAE:", mae_names[second_mae])
    return

6. Implementation part, where we call the functions above and get predicted results.

In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_selection import VarianceThreshold

def load_all_feature_files(directory):
    """List the ``.csv`` files directly inside ``directory``.

    Args:
        directory: Directory to scan (not recursive).

    Returns:
        list[str]: ``directory``-joined paths of every entry whose name ends
        with ``.csv``, sorted once in ascending order.
    """
    return sorted(
        os.path.join(directory, filename)
        for filename in os.listdir(directory)
        if filename.endswith(".csv")
    )

def load_and_combine_features(feature_files):
    """Read every CSV in ``feature_files`` and stack them row-wise.

    All files are read first and concatenated in a single ``pd.concat`` call,
    which avoids re-copying the growing frame per file and avoids seeding the
    result with an empty ``DataFrame`` (which can influence result dtypes).
    The per-file row index is kept, as ``ignore_index`` is not set.

    Args:
        feature_files: Iterable of CSV paths, combined in the given order.

    Returns:
        pandas.DataFrame: The stacked rows, or an empty ``DataFrame`` when
        ``feature_files`` is empty.
    """
    frames = [pd.read_csv(file) for file in feature_files]
    if not frames:
        # pd.concat raises on an empty list; keep the "empty in, empty out" contract.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

# Gather every .csv in the feature directory (sorted by path) and stack them
# into a single frame, rows in file order.
feature_directory = "data/feature_extraction"
feature_files = load_all_feature_files(feature_directory)

features_df = load_and_combine_features(feature_files)

# Step 1 - variance filter: discard low-variance columns (threshold var_thresh).
# The transformed array returned here is not read again in this cell; the
# surviving columns are recovered from the selector's support mask instead.
var_thresh = 0.01
selector = VarianceThreshold(threshold=var_thresh)
X_var_reduced = selector.fit_transform(features_df)

selected_features = features_df.columns[selector.get_support(indices=True)]
features_var_reduced_df = features_df[selected_features]

# Step 2 - correlation filter on the absolute pairwise correlations.
corr_matrix = features_var_reduced_df.corr().abs()

# Keep only the strictly upper triangle (k=1): each pair appears once and the
# diagonal self-correlations are excluded; every other cell becomes NaN.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when its correlation with any column positioned before
# it exceeds the threshold, so the earlier column of a correlated pair is kept.
correlation_threshold = 0.9
to_drop = [column for column in upper.columns if any(upper[column] > correlation_threshold)]

# Final reduced feature table; the training cell converts it into the matrix X.
X_final_reduced = features_var_reduced_df.drop(columns=to_drop)

# Report how many columns survive each filtering stage.
print(f"Origin # of features: {features_df.shape[1]}")
print(f"# of features left after Variance Filter: {features_var_reduced_df.shape[1]}")
print(f"# of features left after Correlation Filter: {X_final_reduced.shape[1]}")

# Bare last expression: rendered as the cell's output.
X_final_reduced

Origin # of features: 2185
# of features left after Variance Filter: 1150
# of features left after Correlation Filter: 9


Unnamed: 0,Horizontal Length,LULL_ECDF Percentile Count_0,LULL_Sum absolute diff,LUFL_Wavelet variance_2.78Hz,HULL_Wavelet energy_5.0Hz,HUFL_LPCC_10,OT_MFCC_11,OT_MFCC_0,HULL_Spectral spread
0,192.0,3484.0,2003.879997,3.672289,2.847223,0.728981,6.149743,-3.468326,14.990301
0,336.0,3484.0,2003.879997,3.672289,2.847223,0.728981,6.149743,-3.468326,14.990301
0,720.0,3484.0,2003.879997,3.672289,2.847223,0.728981,6.149743,-3.468326,14.990301
0,96.0,3484.0,2003.879997,3.672289,2.847223,0.728981,6.149743,-3.468326,14.990301
0,192.0,3484.0,4299.620996,7.332074,3.93993,1.114523,-38.626861,9.764201,15.399541
0,336.0,3484.0,4299.620996,7.332074,3.93993,1.114523,-38.626861,9.764201,15.399541
0,720.0,3484.0,4299.620996,7.332074,3.93993,1.114523,-38.626861,9.764201,15.399541
0,96.0,3484.0,4299.620996,7.332074,3.93993,1.114523,-38.626861,9.764201,15.399541
0,192.0,13936.0,4970.865002,4.34795,1.431463,0.761384,-44.726663,7.286104,15.446551
0,336.0,13936.0,4970.865002,4.34795,1.431463,0.761384,-44.726663,7.286104,15.446551


In [7]:
# traverse files in the directory
import os
def file_name(file_dir):
    """Return the names of the files directly inside ``file_dir``.

    Only the top level is inspected (the first ``os.walk`` entry);
    sub-directories are not descended into and the order is whatever the
    operating system reports.

    Args:
        file_dir: Directory to list.

    Returns:
        list[str]: File names without the directory prefix. An empty list is
        returned when ``os.walk`` yields nothing (e.g. the directory does not
        exist), so callers always receive a list.
    """
    _root, _dirs, files = next(os.walk(file_dir), (file_dir, [], []))
    return files

# Only .csv feature files are paired with performance files, mirroring
# load_all_feature_files(), which selected the rows of X_final_reduced.
features_locations = sorted(
    name for name in file_name("data/feature_extraction") if name.endswith(".csv")
)
performance_locations = sorted(file_name("data/performance"))

# Files are paired purely by sorted position, so the counts must agree.
# NOTE(review): this also assumes both directories use matching file-name
# prefixes so that the sorted orders line up -- confirm.
if len(features_locations) != len(performance_locations):
    raise ValueError(
        f"Found {len(features_locations)} feature files but "
        f"{len(performance_locations)} performance files; they must pair one-to-one."
    )

y = []
for features_name, performance_name in zip(features_locations, performance_locations):
    feature_dicts, performance_matrix, algorithms = load_training_data(
        os.path.join("data/feature_extraction", features_name),
        os.path.join("data/performance", performance_name),
    )
    # The first row of each performance file is the target vector for that dataset.
    y.append(list(performance_matrix[0].values()))

# Training features come from the reduced feature table built in the previous cell.
X = X_final_reduced.values.tolist()
if len(X) != len(y):
    raise ValueError(
        f"Feature matrix has {len(X)} rows but {len(y)} target rows were loaded."
    )

rf = random_forest(X, y)

### For the `etth1` dataset:

In [8]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth1_96_features.csv", algorithms)

Predicted performance: [[0.3632  0.37928 0.34897 0.37287 0.36527 0.39028 0.36616 0.39126 0.36323
  0.39638 0.40322 0.42956 0.36219 0.38952 0.38432 0.40658 0.40634 0.42865
  0.36095 0.38933]]
Predicted MSEs: [[0.3632  0.34897 0.36527 0.36616 0.36323 0.40322 0.36219 0.38432 0.40634
  0.36095]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  DLinear_MSE
Predicted MAEs: [[0.37928 0.37287 0.39028 0.39126 0.39638 0.42956 0.38952 0.40658 0.42865
  0.38933]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [9]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth1_192_features.csv", algorithms)

Predicted performance: [[0.39619 0.40056 0.38148 0.39339 0.40147 0.41406 0.3969  0.40902 0.39278
  0.4155  0.44544 0.4589  0.39272 0.40878 0.4208  0.42968 0.44961 0.4566
  0.40006 0.4174 ]]
Predicted MSEs: [[0.39619 0.38148 0.40147 0.3969  0.39278 0.44544 0.39272 0.4208  0.44961
  0.40006]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  Supervised_PatchTST_MSE
Predicted MAEs: [[0.40056 0.39339 0.41406 0.40902 0.4155  0.4589  0.40878 0.42968 0.4566
  0.4174 ]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [10]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth1_336_features.csv", algorithms)

Predicted performance: [[0.43351 0.42171 0.41216 0.4134  0.43706 0.43641 0.42561 0.42712 0.41361
  0.43069 0.48894 0.48944 0.42305 0.4295  0.46066 0.45661 0.49474 0.48507
  0.43484 0.44196]]
Predicted MSEs: [[0.43351 0.41216 0.43706 0.42561 0.41361 0.48894 0.42305 0.46066 0.49474
  0.43484]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  SimMTM_MSE
Predicted MAEs: [[0.42171 0.4134  0.43641 0.42712 0.43069 0.48944 0.4295  0.45661 0.48507
  0.44196]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [11]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth1_720_features.csv", algorithms)

Predicted performance: [[0.4484  0.44277 0.41564 0.43122 0.45185 0.45783 0.45581 0.46192 0.43056
  0.44899 0.56952 0.55244 0.45519 0.46319 0.5275  0.51231 0.51463 0.50119
  0.51667 0.51025]]
Predicted MSEs: [[0.4484  0.41564 0.45185 0.45581 0.43056 0.56952 0.45519 0.5275  0.51463
  0.51667]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  SimMTM_MSE
Predicted MAEs: [[0.44277 0.43122 0.45783 0.46192 0.44899 0.55244 0.46319 0.51231 0.50119
  0.51025]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


### For the `etth2` dataset:

In [12]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth2_96_features.csv", algorithms)

Predicted performance: [[0.31753 0.35336 0.30683 0.3469  0.31298 0.36185 0.30288 0.35585 0.30838
  0.35628 0.30766 0.36581 0.29988 0.35086 0.32094 0.36958 0.35885 0.40677
  0.31653 0.37151]]
Predicted MSEs: [[0.31753 0.30683 0.31298 0.30288 0.30838 0.30766 0.29988 0.32094 0.35885
  0.31653]]
Best algorithm for MSE:  Supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.35336 0.3469  0.36185 0.35585 0.35628 0.36581 0.35086 0.36958 0.40677
  0.37151]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Supervised_PatchTST_MAE
None


In [13]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth2_192_features.csv", algorithms)

Predicted performance: [[0.36203 0.38316 0.35225 0.37793 0.36841 0.39565 0.34636 0.38499 0.35026
  0.38278 0.34988 0.39445 0.34563 0.38242 0.37587 0.40358 0.39449 0.42766
  0.38412 0.41633]]
Predicted MSEs: [[0.36203 0.35225 0.36841 0.34636 0.35026 0.34988 0.34563 0.37587 0.39449
  0.38412]]
Best algorithm for MSE:  Supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.38316 0.37793 0.39565 0.38499 0.38278 0.39445 0.38242 0.40358 0.42766
  0.41633]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Supervised_PatchTST_MAE
None


In [14]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth2_336_features.csv", algorithms)

Predicted performance: [[0.39138 0.40696 0.38325 0.40253 0.38751 0.41036 0.379   0.41106 0.38022
  0.40473 0.39478 0.4248  0.37275 0.40487 0.40795 0.42615 0.40463 0.43583
  0.44541 0.45582]]
Predicted MSEs: [[0.39138 0.38325 0.38751 0.379   0.38022 0.39478 0.37275 0.40795 0.40463
  0.44541]]
Best algorithm for MSE:  Supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.40696 0.40253 0.41036 0.41106 0.40473 0.4248  0.40487 0.42615 0.43583
  0.45582]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  SimMTM_MAE
None


In [15]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/etth2_720_features.csv", algorithms)

Predicted performance: [[0.41586 0.4275  0.40701 0.42376 0.41125 0.43681 0.41364 0.44196 0.40921
  0.43077 0.51557 0.49671 0.40732 0.43648 0.44047 0.45415 0.44996 0.46431
  0.62789 0.55309]]
Predicted MSEs: [[0.41586 0.40701 0.41125 0.41364 0.40921 0.51557 0.40732 0.44047 0.44996
  0.62789]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  Supervised_PatchTST_MSE
Predicted MAEs: [[0.4275  0.42376 0.43681 0.44196 0.43077 0.49671 0.43648 0.45415 0.46431
  0.55309]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


### For the `ettm1` dataset:

In [16]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm1_96_features.csv", algorithms)

Predicted performance: [[0.30743 0.34855 0.29724 0.34235 0.28881 0.34632 0.29506 0.34925 0.30075
  0.35192 0.30668 0.3542  0.30126 0.34808 0.31033 0.36233 0.34744 0.38312
  0.30568 0.3504 ]]
Predicted MSEs: [[0.30743 0.29724 0.28881 0.29506 0.30075 0.30668 0.30126 0.31033 0.34744
  0.30568]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.34855 0.34235 0.34632 0.34925 0.35192 0.3542  0.34808 0.36233 0.38312
  0.3504 ]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  Self-supervised_PatchTST_MAE
None


In [17]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm1_192_features.csv", algorithms)

Predicted performance: [[0.3471  0.3739  0.33688 0.36768 0.32483 0.37058 0.33008 0.37101 0.33436
  0.37339 0.34981 0.38241 0.33896 0.37375 0.34558 0.38263 0.4229  0.41816
  0.33899 0.36971]]
Predicted MSEs: [[0.3471  0.33688 0.32483 0.33008 0.33436 0.34981 0.33896 0.34558 0.4229
  0.33899]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.3739  0.36768 0.37058 0.37101 0.37339 0.38241 0.37375 0.38263 0.41816
  0.36971]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  DLinear_MAE
None


In [18]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm1_336_features.csv", algorithms)

Predicted performance: [[0.38042 0.39539 0.36953 0.38809 0.34665 0.38347 0.35898 0.38901 0.36315
  0.39245 0.38199 0.40353 0.37077 0.39479 0.37407 0.39985 0.44638 0.43793
  0.36939 0.39119]]
Predicted MSEs: [[0.38042 0.36953 0.34665 0.35898 0.36315 0.38199 0.37077 0.37407 0.44638
  0.36939]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.39539 0.38809 0.38347 0.38901 0.39245 0.40353 0.39479 0.39985 0.43793
  0.39119]]
Best algorithm for MAE:  Self-supervised_PatchTST_MAE
Second best algorithm for MAE:  GPHT_MAE
None


In [19]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm1_720_features.csv", algorithms)

Predicted performance: [[0.44073 0.42861 0.4275  0.42129 0.39359 0.41536 0.40685 0.41935 0.40596
  0.41876 0.45224 0.44579 0.41329 0.4251  0.42774 0.43243 0.4933  0.4659
  0.45136 0.44094]]
Predicted MSEs: [[0.44073 0.4275  0.39359 0.40685 0.40596 0.45224 0.41329 0.42774 0.4933
  0.45136]]
Best algorithm for MSE:  Self-supervised_PatchTST_MSE
Second best algorithm for MSE:  SimMTM_MSE
Predicted MAEs: [[0.42861 0.42129 0.41536 0.41935 0.41876 0.44579 0.4251  0.43243 0.4659
  0.44094]]
Best algorithm for MAE:  Self-supervised_PatchTST_MAE
Second best algorithm for MAE:  SimMTM_MAE
None


### For the `ettm2` dataset:

In [20]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm2_96_features.csv", algorithms)

Predicted performance: [[0.20722 0.2756  0.19704 0.26861 0.19963 0.27793 0.1979  0.27966 0.19892
  0.28032 0.208   0.28713 0.20201 0.28108 0.20542 0.28729 0.21646 0.29396
  0.19848 0.28509]]
Predicted MSEs: [[0.20722 0.19704 0.19963 0.1979  0.19892 0.208   0.20201 0.20542 0.21646
  0.19848]]
Best algorithm for MSE:  GPHT_MSE
Second best algorithm for MSE:  FPT_MSE
Predicted MAEs: [[0.2756  0.26861 0.27793 0.27966 0.28032 0.28713 0.28108 0.28729 0.29396
  0.28509]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [21]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm2_192_features.csv", algorithms)

Predicted performance: [[0.24959 0.30296 0.23758 0.29604 0.24323 0.30898 0.23852 0.30747 0.23734
  0.30415 0.25084 0.31711 0.24441 0.31129 0.25165 0.31781 0.25527 0.31858
  0.24263 0.31723]]
Predicted MSEs: [[0.24959 0.23758 0.24323 0.23852 0.23734 0.25084 0.24441 0.25165 0.25527
  0.24263]]
Best algorithm for MSE:  SimMTM_MSE
Second best algorithm for MSE:  GPHT_MSE
Predicted MAEs: [[0.30296 0.29604 0.30898 0.30747 0.30415 0.31711 0.31129 0.31781 0.31858
  0.31723]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  GPHT'_MAE
None


In [22]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm2_336_features.csv", algorithms)

Predicted performance: [[0.30705 0.3387  0.29228 0.33171 0.29689 0.34691 0.29465 0.34646 0.28905
  0.33698 0.31181 0.35701 0.29774 0.34884 0.29926 0.34786 0.31252 0.35477
  0.30669 0.36207]]
Predicted MSEs: [[0.30705 0.29228 0.29689 0.29465 0.28905 0.31181 0.29774 0.29926 0.31252
  0.30669]]
Best algorithm for MSE:  SimMTM_MSE
Second best algorithm for MSE:  GPHT_MSE
Predicted MAEs: [[0.3387  0.33171 0.34691 0.34646 0.33698 0.35701 0.34884 0.34786 0.35477
  0.36207]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  SimMTM_MAE
None


In [23]:
# prediction() prints its own report and returns None, so it is called
# directly rather than wrapped in print().
prediction(rf, "data/feature_extraction/ettm2_720_features.csv", algorithms)

Predicted performance: [[0.37558 0.38078 0.3578  0.37377 0.36162 0.38905 0.36281 0.3906  0.35185
  0.37791 0.38173 0.40061 0.35512 0.38888 0.36055 0.38739 0.38678 0.39805
  0.39844 0.41961]]
Predicted MSEs: [[0.37558 0.3578  0.36162 0.36281 0.35185 0.38173 0.35512 0.36055 0.38678
  0.39844]]
Best algorithm for MSE:  SimMTM_MSE
Second best algorithm for MSE:  Supervised_PatchTST_MSE
Predicted MAEs: [[0.38078 0.37377 0.38905 0.3906  0.37791 0.40061 0.38888 0.38739 0.39805
  0.41961]]
Best algorithm for MAE:  GPHT_MAE
Second best algorithm for MAE:  SimMTM_MAE
None
