## ML 

In [1]:
import os
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras import datasets, models, layers, utils, activations, losses, optimizers, metrics # DNN 系列
from tensorflow.keras.layers import Dense, LSTM,Bidirectional,SimpleRNN,GRU,Activation # RNN系列
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
import datetime
from IPython.display import clear_output
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Input locations: the project root and the released competition data beneath it.
project_path = 'D:/在職進修/修課/機器學習/Final_project/'
release_path = f'{project_path}html.2023.final.data-release/release/'

#### 特徵工程

###### feature1_day
* feature
    * Station characteristics * 1
    * Station upper bound * 1
    * date characteristics * 2 (星期、放假)
    * time characteristics * 3 (小時、分鐘、累積)
    * 1 day time step 72 * 1
* --> input 72 * 8
* --> pred 72 * 1

In [3]:
# Directory holding the pre-built feature1_day arrays; later cells also write
# their checkpoints under its model/ sub-folder.
save_path_phase2 = project_path+'data_science_phase2_feature/feature1_day/'

# Training arrays and the "target_sno" test arrays
# (presumably restricted to the stations in the submission file - confirm).
X_train = np.load(save_path_phase2+'X_train.npy')
y_train = np.load(save_path_phase2+'y_train.npy')
X_test = np.load(save_path_phase2+'X_test_target_sno.npy')
y_test = np.load(save_path_phase2+'y_test_target_sno.npy')

X_train.shape, y_train.shape, X_test.shape, y_test.shape

# Keep every 20th test sequence to shrink the evaluation set.
X_test_sampled = X_test[::20, :, :]
y_test_sampled = y_test[::20, :, :]

X_test_sampled.shape, y_test_sampled.shape

time_steps = 72 # time steps per sequence
features = 8 # features per time step
X_train_reshaped = X_train.reshape((X_train.shape[0], time_steps, features))
y_train_reshaped = y_train.reshape((y_train.shape[0], time_steps, features))
X_test_reshaped = X_test_sampled.reshape((X_test_sampled.shape[0], time_steps, features))
y_test_reshaped = y_test_sampled.reshape((y_test_sampled.shape[0], time_steps, features))

# Size the validation split so it has about as many sequences as the sampled test set.
ratio = X_test_reshaped.shape[0] / X_train_reshaped.shape[0]

# Fixed seed: without it the train/validation partition (and therefore which
# epoch ModelCheckpoint considers "best") changes on every run.
SPLIT_SEED = 42
X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
    X_train_reshaped, y_train_reshaped, test_size=ratio, random_state=SPLIT_SEED)

# Keep only the last feature channel of y as the regression target.
y_train_train = y_train_train[:, :, -1:]
y_train_val = y_train_val[:, :, -1:]
y_test_reshaped = y_test_reshaped[:, :, -1:]

X_train_train.shape, y_train_train.shape, X_train_val.shape, y_train_val.shape, X_test_reshaped.shape, y_test_reshaped.shape

## LSTM

In [21]:
# Use tensorflow.keras throughout: ModelCheckpoint below comes from
# tensorflow.keras, and mixing it with the standalone `keras` package can
# break on installations where the two are different packages.
from tensorflow.keras import models, layers, losses, optimizers

num_outputs = 72  # width of the regression output (same value as time_steps)

# Single LSTM layer followed by a linear Dense head.
model = models.Sequential()
model.add(layers.LSTM(
    units=50,
    input_shape=(time_steps, features),  # (time_steps, features)
    unroll=True,
))
model.add(layers.Dense(units=num_outputs, kernel_initializer='normal'))  # no activation

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Training configuration
model.compile(loss=losses.mean_squared_error,  # MSE regression loss
              optimizer='adam',
              metrics=['mean_squared_error']  # reported as val_mean_squared_error on validation data
              )

Batch_size = 128
Epochs = 5
filepath = save_path_phase2+"model/LSTM_weights.hdf5"
# Keep only the weights of the epoch with the best validation MSE.
checkpoint = ModelCheckpoint(filepath, monitor='val_mean_squared_error', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', save_freq='epoch')
callbacks_list = [checkpoint]
logs = model.fit(X_train_train, y_train_train, batch_size=Batch_size, epochs=Epochs, validation_data=(X_train_val, y_train_val), callbacks=callbacks_list)
# To reuse a previous run instead of training:
# filepath = save_path_phase2+"model/LSTM_weights.hdf5"
# model.load_weights(filepath)

## Dropout LSTM

In [18]:
# Use tensorflow.keras throughout so the model and the tensorflow.keras
# ModelCheckpoint callback come from the same package.
from tensorflow.keras import models, layers, losses, optimizers, regularizers

time_steps = 72  # time steps per sequence
features = 8  # features per time step
num_outputs = 72  # output dimension

# LSTM with input/recurrent dropout, then Dropout, then an L2-regularized linear head.
model = models.Sequential()
model.add(layers.LSTM(
    units=50,
    input_shape=(time_steps, features),
    unroll=True,
    dropout=0.25,  # dropout on the layer inputs
    recurrent_dropout=0.25  # dropout on the recurrent state
))
# Dropout is applied to the LSTM features, *before* the output layer.
# Placed after the final Dense it would randomly zero (and rescale) the
# regression outputs themselves during training, which pulls the MSE optimum
# toward half of the target and biases inference-time predictions.
model.add(layers.Dropout(0.5))
model.add(layers.Dense(
    units=num_outputs,
    kernel_initializer='normal',
    kernel_regularizer=regularizers.l2(0.01)  # L2 weight penalty
))

# Training configuration
model.compile(
    loss=losses.mean_squared_error,  # MSE regression loss
    optimizer='adam',
    metrics=['mean_squared_error']
)

Batch_size = 128
Epochs = 5
filepath = save_path_phase2+"model/LSTM_dropout_recurrent_dropout_regularizers_weights.hdf5"
# Keep only the weights of the epoch with the best validation MSE.
checkpoint = ModelCheckpoint(filepath, monitor='val_mean_squared_error', verbose=1, save_best_only=True, save_weights_only=True, mode='auto', save_freq='epoch')
callbacks_list = [checkpoint]
logs = model.fit(X_train_train, y_train_train, batch_size=Batch_size, epochs=Epochs, validation_data=(X_train_val, y_train_val), callbacks=callbacks_list)

## BiLSTM

In [67]:
# Use tensorflow.keras throughout so the model and the tensorflow.keras
# ModelCheckpoint callback come from the same package.
from tensorflow.keras import models, layers, losses, optimizers

num_outputs = 72  # output dimension of the regression head

# Bidirectional LSTM followed by a linear Dense head.
# input_shape is given to the Bidirectional wrapper: Sequential only reads the
# input shape from the layer it is handed, so a shape set on the wrapped LSTM
# is ignored and the model would stay unbuilt until fit().
model = models.Sequential()
model.add(layers.Bidirectional(
    layers.LSTM(units=50, unroll=True),
    input_shape=(time_steps, features),  # (time_steps, features)
))
model.add(layers.Dense(units=num_outputs, kernel_initializer='normal'))  # no activation


# Training configuration
model.compile(loss=losses.mean_squared_error,  # MSE regression loss
              optimizer='adam',
              metrics=['mean_squared_error']
              )

Batch_size = 128
Epochs = 1
filepath = save_path_phase2+"model/BiLSTM_weights.hdf5"
# save_weights_only=False: the checkpoint file holds the full model, not just weights.
checkpoint = ModelCheckpoint(filepath, monitor='val_mean_squared_error', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', save_freq='epoch')
callbacks_list = [checkpoint]
logs = model.fit(X_train_train, y_train_train, batch_size=Batch_size, epochs=Epochs, validation_data=(X_train_val, y_train_val), callbacks=callbacks_list)


Epoch 00001: val_mean_squared_error improved from inf to 0.04612, saving model to D:/在職進修/修課/機器學習/Final_project/data_science_phase2_feature/feature1_day/model\BiLSTM_weights.hdf5


## Finetuned-LSTM

In [15]:
## Identify the station numbers (sno) that must be predicted
def collect_target_sno(_path):
    '''
    Collect the distinct station numbers (sno) referenced by a submission CSV.

    > input:
        _path # path to a submission-format CSV with an `id` column whose
              # values carry the sno as the second '_'-separated field

    > output
        target_sno # sorted list of the unique sno strings the submission needs

    The list is sorted so that the station order (and anything that iterates
    over it) is the same on every run; a plain list(set) order can differ
    between kernel sessions because of string hash randomization.
    '''
    sample_csv = pd.read_csv(_path)
    target_sno = {each.split('_')[1] for each in sample_csv['id']}
    print('finish target_sno: ', len(target_sno)) # number of distinct stations found
    return sorted(target_sno)

In [16]:
# Input locations (same values as the configuration cell at the top of the notebook).
project_path = 'D:/在職進修/修課/機器學習/Final_project/'
release_path = f'{project_path}html.2023.final.data-release/release/'

# Base names of the sample-submission files for stages 1 to 3.
sample_csv_list = [f'sample_submission_stage{stage}' for stage in range(1, 4)]

# Stations to predict, taken from the stage-1 sample submission.
stage1_csv = project_path + sample_csv_list[0] + '.csv'
target_sno = collect_target_sno(stage1_csv)

finish target_sno:  112


In [41]:
# Restore weights into the current `model` from the current `filepath`.
# NOTE(review): both names are set by earlier training cells, so which
# checkpoint is loaded (and into which architecture) depends on the cell that
# ran last - confirm the intended model/checkpoint before relying on this.
model.load_weights(filepath)

In [59]:
# Per-station fine-tuning: for each target station, continue training `model`
# on that station's own sequences and checkpoint the best epoch.
# NOTE(review): this uses whichever `model` is currently in the kernel; the
# fallback checkpoint is LSTM_weights.hdf5, so presumably the plain LSTM - confirm.

# Loop-invariant settings, hoisted out of the loop.
time_steps = 72 # time steps per sequence
features = 8 # features per time step
Batch_size = 128
Epochs = 50
# The first 60 sequences train, the remainder validates.
# NOTE(review): a station with 60 or fewer sequences gets an empty validation set.
n_finetune_train = 60

for sno in tqdm(target_sno):
    filepath = save_path_phase2+"model/finetune/LSTM_weights_"+sno+".hdf5"
    try:
        # Resume from this station's previous fine-tuned checkpoint if it can be loaded.
        model.load_weights(filepath)
    except Exception:
        # No usable per-station checkpoint: start from the shared LSTM weights.
        # `except Exception` (not a bare except) so Ctrl-C / kernel interrupt
        # is not swallowed in the middle of this long-running loop.
        model.load_weights(save_path_phase2+"model/LSTM_weights.hdf5")

    X_train_one = np.load(save_path_phase2+sno+'_X_train.npy')
    y_train_one = np.load(save_path_phase2+sno+'_y_train.npy')

    X_train_one_reshaped = X_train_one.reshape((X_train_one.shape[0], time_steps, features))
    y_train_one_reshaped = y_train_one.reshape((y_train_one.shape[0], time_steps, features))
    # Keep only the last feature channel of y as the regression target.
    y_train_one_reshaped = y_train_one_reshaped[:, :, -1:]

    # Split by position (no shuffling): head trains, tail validates.
    X_train_one_reshaped_train = X_train_one_reshaped[:n_finetune_train]
    X_train_one_reshaped_val = X_train_one_reshaped[n_finetune_train:]
    y_train_one_reshaped_train = y_train_one_reshaped[:n_finetune_train]
    y_train_one_reshaped_val = y_train_one_reshaped[n_finetune_train:]

    # A fresh callback per station so "best so far" starts over for each one.
    checkpoint = ModelCheckpoint(filepath, monitor='val_mean_squared_error', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', save_freq='epoch')
    callbacks_list = [checkpoint]
    logs = model.fit(X_train_one_reshaped_train, y_train_one_reshaped_train, batch_size=Batch_size, epochs=Epochs, validation_data=(X_train_one_reshaped_val, y_train_one_reshaped_val), callbacks=callbacks_list)
    # Drop the per-epoch training log so only the tqdm bar remains visible.
    clear_output()

100%|████████████████████████████████████████████████████████████████████████████████| 112/112 [27:02<00:00, 14.49s/it]


## ML (scikit-learn pipeline: PCA + multi-output SVR)

In [22]:
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC,SVR
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

# Pipeline steps are collected here and assembled once, after the final
# estimator has been appended.
STEPS = []
PCA_threshold = True  # switch: include the PCA step
Normalizer_threshold = False  # switch: include the Normalizer step

pca = PCA(n_components = 10)
normalizer = Normalizer()

# The two switches are independent. With the previous `elif`, the Normalizer
# switch was silently ignored whenever PCA was enabled.
if PCA_threshold:
    STEPS.append(('pca', pca))
if Normalizer_threshold:
    STEPS.append(('normalizer', normalizer))

Seed = 7
cv_train = 5
svr = SVR(C=100, gamma=0.1, tol=0.001, kernel='rbf')
lr = LinearRegression()
# STEPS.append(('svr', svr))

# SVR predicts a single target, so wrap it in MultiOutputRegressor to fit one
# SVR per output column.
multioutput_regressor = MultiOutputRegressor(SVR(C=100, gamma=0.1, tol=0.0000001, kernel='rbf'))
# multioutput_regressor = MultiOutputRegressor(lr())
STEPS.append(('multioutput_regressor',multioutput_regressor))
# ML_model = Pipeline(steps=[('pca', pca),('normalizer', normalizer), ('multi_svr', multioutput_regressor)])
ML_model = Pipeline(steps=STEPS)

# Search space. The PCA entry is only declared when the pipeline actually
# contains a 'pca' step; otherwise the search would reject the unknown name.
parameters = {}
if PCA_threshold:
    parameters['pca__n_components'] = [10,20,30]
# parameters['multioutput_regressor__estimator__gamma'] = [0.1,1]
# parameters['multioutput_regressor__estimator__C'] = [10,100]
tuned_parameters = [parameters]

In [None]:
ML_model

X_train.shape, y_train.shape

# Target: only the last feature channel of y.
y_train_ml = y_train[:, :, -1:]

y_train_ml.shape

# scikit-learn expects 2-D inputs: flatten each sample's (steps, channels) grid into one row.
n_x_cols = X_train.shape[1] * X_train.shape[2]
n_y_cols = y_train_ml.shape[1] * y_train_ml.shape[2]
X_train_ml = X_train.reshape((X_train.shape[0], n_x_cols))
y_train_ml = y_train_ml.reshape((y_train_ml.shape[0], n_y_cols))

X_train_ml.shape, y_train_ml.shape

## ML_model.fit(X_train_ml, y_train_ml)

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Randomized search over `parameters`, 5-fold CV, scored by negative MSE.
# random_search = RandomizedSearchCV(ML_model, parameters, cv=5, scoring=custom_scorer, verbose=2, n_iter=10, random_state=42)
random_search = RandomizedSearchCV(
    estimator=ML_model,
    param_distributions=parameters,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
)

# random_search.fit(data_feature_normalize, labels)
random_search.fit(X_train_ml, y_train_ml)
# Best-scoring pipeline, as exposed by the search.
best_model = random_search.best_estimator_