In [1]:
import csv
import os

def csv2dict(file):
    dicts = []
    with open(file, mode='r') as f:
        csv_reader = csv.DictReader(f)
        for row in csv_reader:
            new_dict = {}
            for key, value in row.items():
                try:
                    new_dict[key] = float(value)
                except ValueError:
                    new_dict[key] = value
            dicts.append(new_dict)
    return dicts

def load_training_data(features_location, performance_location):
    feature_dicts = csv2dict(features_location)
    performance_matrix = csv2dict(performance_location)
    algorithms = [list(algorithm.keys()) for algorithm in performance_matrix]
    return feature_dicts, performance_matrix, algorithms

def file_name(file_dir):
    for root, dirs, files in os.walk(file_dir):
        return files

features_locations = file_name("data/feature_extraction")
performance_locations = file_name("data/performance")

features_locations.sort()
performance_locations.sort()

X = []
y = []

for i in range (len(features_locations)-4):
    feature_dicts, performance_matrix, algorithms = load_training_data("data/feature_extraction/"+features_locations[i], "data/performance/"+performance_locations[i])
    
    temp_X = list(feature_dicts[0].values())
    temp_X_clean = [0.0 if val == '' else val for val in temp_X]
    X.append(temp_X_clean)
    
    temp_y = list(performance_matrix[0].values())
    y.append(temp_y)

In [2]:
from keras.models import Model
from keras.layers import Input, Dense
from keras import regularizers
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# 1. 数据准备
# X是特征数据，y是目标数据（从之前加载的数据中获得）

# 将数据集拆分为训练集和测试集
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. 定义Autoencoder模型
# X_train = np.array(X_train)
# X_test = np.array(X_test)
# y_train = np.array(y_train)
# y_test = np.array(y_test)
X = np.array(X)
y = np.array(y)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# 输入层维度为特征数
input_dim = X.shape[1]
encoding_dim = 100  # 你可以调整这个值，表示要压缩成多少维度的特征

# Autoencoder结构：输入 -> 编码层 -> 解码层
input_layer = Input(shape=(input_dim,))
# encoded = Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l1(1e-5))(input_layer)
encoded = Dense(encoding_dim, activation='relu')(input_layer)
decoded = Dense(input_dim, activation='sigmoid')(encoded)

# 定义Autoencoder模型
autoencoder = Model(inputs=input_layer, outputs=decoded)

# 编码器模型，用于提取压缩后的特征
encoder = Model(inputs=input_layer, outputs=encoded)

# 编译模型
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# 3. 训练Autoencoder模型
# 将输入作为标签进行无监督学习
autoencoder.fit(X, X, epochs=100, batch_size=256, shuffle=True, validation_split=0.2)

# 4. 使用训练好的编码器提取新的特征
X_train_encoded = encoder.predict(X)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step - loss: 326097064031107940352.0000 - val_loss: 2829424746492346761216.0000
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 326097064031107940352.0000 - val_loss: 2829424746492346761216.0000
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 326097028846735851520.0000 - val_loss: 2829424746492346761216.0000
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 326097028846735851520.0000 - val_loss: 2829424746492346761216.0000
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 326097064031107940352.0000 - val_loss: 2829424746492346761216.0000
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 326097064031107940352.0000 - val_loss: 2829424746492346761216.0000
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━

In [4]:
# 5. 使用压缩后的特征训练传统模型
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_encoded, y)

In [5]:
def prediction(rf, new_features_location, algorithms):
    new_feature_dicts = csv2dict(new_features_location)
    new_X = [list(new_feature_dict.values()) for new_feature_dict in new_feature_dicts]
    new_X_clean = [[0.0 if val == '' else val for val in row] for row in new_X]
    new_X_clean = np.array(new_X_clean)
    X_test_encoded = encoder.predict(new_X_clean)
    
    predicted_performance = rf.predict(X_test_encoded)
    print(predicted_performance)
    best_algorithm = np.argmin(predicted_performance)
    
    return algorithms[0][best_algorithm]

In [6]:
print(prediction(rf, "data/feature_extraction/ettm2_192_features.csv", algorithms))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[[0.37747838 0.39389644 0.36731549 0.3877257  0.35003123 0.38765467
  0.35866721 0.39130012 0.36159038 0.39233888 0.39311686 0.41094468
  0.36527089 0.39345965 0.37747229 0.40393749 0.43287244 0.43269112
  0.4006719  0.40914675]]
 Self-supervised_PatchTST_MSE
