In [8]:
# !pip install xgboost tensorflow torch 
# !pip install optuna
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn models
from sklearn.svm import SVR  # Support Vector Regression
from sklearn.neighbors import KNeighborsRegressor  # K-Nearest Neighbors Regression
from sklearn.ensemble import RandomForestRegressor  # Random Forest Regression
from sklearn.neural_network import MLPRegressor  # Multi-Layer Perceptron for Regression

# XGBoost
from xgboost import XGBRegressor  # XGBoost for Regression

# TensorFlow/Keras for Deep Neural Network and CNN
from tensorflow.keras.models import Sequential  # Base model
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Input  # For DNN and CNN
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import optuna

In [9]:
df = pd.read_csv(r'output_csv/UniLu-Gaia-2014-2.swf.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51987 entries, 0 to 51986
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   job_id                    51987 non-null  float64
 1   submit_time               51987 non-null  float64
 2   wait_time                 51987 non-null  float64
 3   run_time                  51987 non-null  float64
 4   num_allocated_processors  51987 non-null  float64
 5   avg_cpu_time_used         51987 non-null  float64
 6   used_memory               51987 non-null  float64
 7   requested_processors      51987 non-null  float64
 8   requested_time            51987 non-null  float64
 9   requested_memory          51987 non-null  float64
 10  status                    51987 non-null  float64
 11  user_id                   51987 non-null  float64
 12  group_id                  51987 non-null  float64
 13  executable_id             51987 non-null  float64
 14  queue_

In [10]:
df

Unnamed: 0,job_id,submit_time,wait_time,run_time,num_allocated_processors,avg_cpu_time_used,used_memory,requested_processors,requested_time,requested_memory,status,user_id,group_id,executable_id,queue_id,partition_id,preceding_job_id,think_time
0,1.0,0.0,477768.0,35541.0,160.0,32096.0,89734.0,160.0,108000.0,-1.0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0
1,2.0,83558.0,1.0,432024.0,36.0,1320.0,7566.0,36.0,432000.0,-1.0,0.0,2.0,2.0,2.0,1.0,-1.0,-1.0,-1.0
2,3.0,195861.0,1.0,278442.0,64.0,278442.0,34640.0,64.0,432000.0,-1.0,1.0,3.0,3.0,3.0,1.0,-1.0,-1.0,-1.0
3,4.0,278659.0,2.0,268225.0,4.0,4023.0,4864.0,4.0,345600.0,-1.0,1.0,4.0,4.0,4.0,1.0,-1.0,-1.0,-1.0
4,5.0,339016.0,1.0,305581.0,24.0,1783.0,14805.0,24.0,432000.0,-1.0,0.0,5.0,5.0,5.0,1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51982,51983.0,7692182.0,2.0,0.0,144.0,-1.0,11819.0,144.0,360000.0,-1.0,2.0,16.0,16.0,27947.0,1.0,-1.0,-1.0,-1.0
51983,51984.0,7692798.0,1.0,3574.0,60.0,1.0,17.0,60.0,32400.0,-1.0,1.0,69.0,69.0,66.0,0.0,-1.0,-1.0,-1.0
51984,51985.0,7694166.0,4.0,-1.0,1.0,-1.0,667648.0,1.0,7200.0,-1.0,2.0,46.0,46.0,3146.0,2.0,-1.0,-1.0,-1.0
51985,51986.0,7694169.0,1.0,-1.0,1.0,-1.0,789504.0,1.0,7200.0,-1.0,2.0,46.0,46.0,3146.0,2.0,-1.0,-1.0,-1.0


In [11]:
# filter row with status = 1
df = df.loc[df['status'] == 1].head(20)
filter_df = df.drop(columns=['job_id', 'submit_time', 'requested_memory',
        'partition_id', 'preceding_job_id', 'think_time', 'status'])
# metric variables
metric = filter_df['wait_time']

# all column need to predict
filter_df.info()
# categorical columns
categorical_columns = ['user_id', 'group_id', 'executable_id', 'queue_id']
# numerical columns
numerical_columns = ['num_allocated_processors', 'avg_cpu_time_used', 'used_memory', 'requested_processors', 'requested_time']

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 0 to 31
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   wait_time                 20 non-null     float64
 1   run_time                  20 non-null     float64
 2   num_allocated_processors  20 non-null     float64
 3   avg_cpu_time_used         20 non-null     float64
 4   used_memory               20 non-null     float64
 5   requested_processors      20 non-null     float64
 6   requested_time            20 non-null     float64
 7   user_id                   20 non-null     float64
 8   group_id                  20 non-null     float64
 9   executable_id             20 non-null     float64
 10  queue_id                  20 non-null     float64
dtypes: float64(11)
memory usage: 1.9 KB


In [12]:
# X
columns = categorical_columns + numerical_columns
X = filter_df[columns]
X

Unnamed: 0,user_id,group_id,executable_id,queue_id,num_allocated_processors,avg_cpu_time_used,used_memory,requested_processors,requested_time
0,1.0,1.0,1.0,1.0,160.0,32096.0,89734.0,160.0,108000.0
2,3.0,3.0,3.0,1.0,64.0,278442.0,34640.0,64.0,432000.0
3,4.0,4.0,4.0,1.0,4.0,4023.0,4864.0,4.0,345600.0
5,5.0,5.0,6.0,1.0,24.0,358.0,2560.0,24.0,432000.0
8,2.0,2.0,8.0,1.0,36.0,1066.0,50887.0,36.0,432000.0
9,2.0,2.0,8.0,1.0,36.0,22310.0,44487.0,36.0,432000.0
11,2.0,2.0,10.0,1.0,60.0,874.0,15582.0,60.0,432000.0
12,2.0,2.0,10.0,1.0,60.0,1123.0,11657.0,60.0,432000.0
13,2.0,2.0,11.0,1.0,36.0,2671.0,24121.0,36.0,432000.0
14,2.0,2.0,11.0,1.0,36.0,2500.0,31801.0,36.0,432000.0


In [13]:
# target variable
Y = filter_df['run_time']
Y

0      35541.0
2     278442.0
3     268225.0
5     214651.0
8     109634.0
9      98910.0
11    180824.0
12    292929.0
13    246546.0
14    163668.0
15    157777.0
16    160148.0
17    319797.0
22    169423.0
23     84779.0
26     10142.0
27     25001.0
28     28157.0
29     26588.0
31     28043.0
Name: run_time, dtype: float64

In [14]:
# Custom DNN model
def create_dnn(input_dim):
    model = Sequential([
        Input(shape=(input_dim,)),  # Thêm lớp Input ở đây
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(1)  # Output for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Custom CNN model
def create_cnn(input_dim):
    model = Sequential([
        Input(shape=(input_dim, 1)),  # Thêm lớp Input ở đây
        Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(input_dim, 1)),
        Flatten(),
        Dense(64, activation='relu'),
        Dense(1)  # Output for regression
    ])
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Base learners and hyperparameter grids
base_learners = {
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "RF": RandomForestRegressor(),
    "XG": XGBRegressor(),
    "MLP": MLPRegressor(),
    "DNN": "DNN",
    "CNN": "CNN" 
}

param_grids = {
    "SVM": {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    "KNN": {'n_neighbors': [3, 5, 7]},
    "RF": {'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    "XG": {'learning_rate': [0.01, 0.1], 'n_estimators': [100, 200]},
    "MLP": {'hidden_layer_sizes': [(100,), (100, 50)], 'activation': ['relu', 'tanh']}
}

# Dataset preparation
def split_data(X, y, m_folds=5):
    kf = KFold(n_splits=m_folds, shuffle=True, random_state=42)
    return list(kf.split(X, y))

# GEM-ITH implementation
def gem_ith(X, y, base_learners, param_grids, b_iterations=5, m_folds=5):
    predictions = {}
    input_dim = X.shape[1]

    for model_name, model in base_learners.items():
        print(f"Optimizing {model_name}...")
        if model_name in ["DNN", "CNN"]:
            # Special handling for neural networks
            model_func = create_dnn if model_name == "DNN" else create_cnn
            pred = []
            for train_idx, test_idx in split_data(X, y, m_folds):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                # Neural Network model training
                nn_model = model_func(input_dim)
                nn_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
                y_pred = nn_model.predict(X_test).flatten()
                pred.extend(y_pred)

            predictions[model_name] = np.array(pred)
        else:
            # Bayesian Optimization for Scikit-learn models
            def objective(trial):
                params = {key: trial.suggest_categorical(key, values) for key, values in param_grids[model_name].items()}
                model.set_params(**params)
                cv_splits = split_data(X, y, m_folds)
                mse_list = []

                for train_idx, test_idx in cv_splits:
                    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                    model.fit(X_train, y_train)
                    y_pred = model.predict(X_test)
                    mse_list.append(mean_squared_error(y_test, y_pred))

                return np.mean(mse_list)

            study = optuna.create_study(direction="minimize")
            study.optimize(objective, n_trials=b_iterations)
            best_params = study.best_params
            model.set_params(**best_params)

            # Collect predictions
            pred = []
            for train_idx, test_idx in split_data(X, y, m_folds):
                X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)
                pred.extend(y_pred)

            predictions[model_name] = np.array(pred)

    # Combine predictions into Y_hat
    Y_hat = np.column_stack(list(predictions.values()))

    # Optimize weights for ensemble
    def optimize_weights(y, Y_hat):
        def objective(weights):
            ensemble_preds = np.dot(Y_hat, weights)
            return mean_squared_error(y, ensemble_preds)

        from scipy.optimize import minimize
        initial_weights = np.ones(Y_hat.shape[1]) / Y_hat.shape[1]
        bounds = [(0, 1)] * Y_hat.shape[1]
        result = minimize(objective, initial_weights, bounds=bounds)
        return result.x

    optimal_weights = optimize_weights(y, Y_hat)

    # Final ensemble prediction
    final_predictions = np.dot(Y_hat, optimal_weights)

    return final_predictions, optimal_weights

final_preds, weights = gem_ith(X, Y, base_learners, param_grids)

print("Final Predictions:", final_preds)
print("Optimal Weights:", weights)

[I 2024-12-07 14:38:27,266] A new study created in memory with name: no-name-2c861a5c-42ae-464a-ab5c-fa4e06be935b
[I 2024-12-07 14:38:27,336] Trial 0 finished with value: 10388945735.81559 and parameters: {'C': 0.1, 'kernel': 'linear'}. Best is trial 0 with value: 10388945735.81559.


Optimizing SVM...


[I 2024-12-07 14:38:27,878] Trial 1 finished with value: 14543804363.807154 and parameters: {'C': 1, 'kernel': 'linear'}. Best is trial 0 with value: 10388945735.81559.
[I 2024-12-07 14:38:27,907] Trial 2 finished with value: 10738294379.867418 and parameters: {'C': 0.1, 'kernel': 'rbf'}. Best is trial 0 with value: 10388945735.81559.
[I 2024-12-07 14:38:27,924] Trial 3 finished with value: 10735464835.875793 and parameters: {'C': 10, 'kernel': 'rbf'}. Best is trial 0 with value: 10388945735.81559.
[I 2024-12-07 14:38:28,494] Trial 4 finished with value: 14543804363.807154 and parameters: {'C': 1, 'kernel': 'linear'}. Best is trial 0 with value: 10388945735.81559.
[I 2024-12-07 14:38:28,561] A new study created in memory with name: no-name-f3dc4ef6-7e21-4ac5-b53c-afae9d21ad59
[I 2024-12-07 14:38:28,582] Trial 0 finished with value: 3496555314.6 and parameters: {'n_neighbors': 3}. Best is trial 0 with value: 3496555314.6.
[I 2024-12-07 14:38:28,600] Trial 1 finished with value: 34965553

Optimizing KNN...
Optimizing RF...


[I 2024-12-07 14:38:29,147] Trial 0 finished with value: 4323632717.7638 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 0 with value: 4323632717.7638.
[I 2024-12-07 14:38:30,084] Trial 1 finished with value: 4421397708.601229 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 0 with value: 4323632717.7638.
[I 2024-12-07 14:38:30,574] Trial 2 finished with value: 4328917136.76951 and parameters: {'n_estimators': 100, 'max_depth': None}. Best is trial 0 with value: 4323632717.7638.
[I 2024-12-07 14:38:31,067] Trial 3 finished with value: 4601715156.808075 and parameters: {'n_estimators': 100, 'max_depth': None}. Best is trial 0 with value: 4323632717.7638.
[I 2024-12-07 14:38:31,560] Trial 4 finished with value: 3969525470.510115 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 4 with value: 3969525470.510115.
[I 2024-12-07 14:38:32,049] A new study created in memory with name: no-name-1a952b5c-80d6-4a44-9967-17511d806331


Optimizing XG...


[I 2024-12-07 14:38:32,570] Trial 0 finished with value: 6179560702.889705 and parameters: {'learning_rate': 0.1, 'n_estimators': 200}. Best is trial 0 with value: 6179560702.889705.
[I 2024-12-07 14:38:32,926] Trial 1 finished with value: 6499551981.394918 and parameters: {'learning_rate': 0.01, 'n_estimators': 200}. Best is trial 0 with value: 6179560702.889705.
[I 2024-12-07 14:38:33,204] Trial 2 finished with value: 6190390521.66895 and parameters: {'learning_rate': 0.1, 'n_estimators': 100}. Best is trial 0 with value: 6179560702.889705.
[I 2024-12-07 14:38:33,678] Trial 3 finished with value: 6499551981.394918 and parameters: {'learning_rate': 0.01, 'n_estimators': 200}. Best is trial 0 with value: 6179560702.889705.
[I 2024-12-07 14:38:34,097] Trial 4 finished with value: 6179560702.889705 and parameters: {'learning_rate': 0.1, 'n_estimators': 200}. Best is trial 0 with value: 6179560702.889705.
[I 2024-12-07 14:38:34,529] A new study created in memory with name: no-name-ffeeaea

Optimizing MLP...


[I 2024-12-07 14:38:35,074] Trial 0 finished with value: 30814873378.747643 and parameters: {'hidden_layer_sizes': (100, 50), 'activation': 'tanh'}. Best is trial 0 with value: 30814873378.747643.
[I 2024-12-07 14:38:35,553] Trial 1 finished with value: 30815076398.782978 and parameters: {'hidden_layer_sizes': (100, 50), 'activation': 'tanh'}. Best is trial 0 with value: 30814873378.747643.
[I 2024-12-07 14:38:36,045] Trial 2 finished with value: 30814964069.423656 and parameters: {'hidden_layer_sizes': (100, 50), 'activation': 'tanh'}. Best is trial 0 with value: 30814873378.747643.
[I 2024-12-07 14:38:36,295] Trial 3 finished with value: 4686786553.813307 and parameters: {'hidden_layer_sizes': (100, 50), 'activation': 'relu'}. Best is trial 3 with value: 4686786553.813307.
[I 2024-12-07 14:38:36,459] Trial 4 finished with value: 4916309583.942667 and parameters: {'hidden_layer_sizes': (100, 50), 'activation': 'relu'}. Best is trial 3 with value: 4686786553.813307.


Optimizing DNN...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
Optimizing CNN...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Final Predictions: [ 58472.3094314  219670.60805082  69119.25366563  43764.95586764
 171118.82671861 181188.78290936 175182.72699607 177232.96186014
 162766.57833715 211306.57768661  74018.91163351  74019.56696656
 190828.08336184 184885.2651868  187563.19594438  59553.60588597
 135433.50476176 135502.78075833 136172.84889285  31987.74425798]
Optimal Weights: [0.25405038 0.         0.         0.         0.         0.80432366
 0.        ]
