In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [2]:
import optuna
import torch
from torch import nn
import numpy
from pyspark.sql import SparkSession
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import  col, when
import numpy as np


In [3]:
device  = 'cuda' if torch.cuda.is_available() else 'cpu'

In [4]:
spark  = SparkSession.builder.appName('Onehot').getOrCreate()

In [6]:
df = spark.read.csv('/content/drive/MyDrive/Bigdata-Machine_learning /cleaned_data/part-00000-41f8a7f7-767e-4f2b-9ac9-b59ff9174cdb-c000.csv', header=True, inferSchema=True)


In [7]:
# không sử dụng dữ liệu hình ảnh
features = ['n_citi', 'bed', 'bath', 'sqft', 'price']
df = df.select(features)
df.show(5)

+------+---+----+----+------+
|n_citi|bed|bath|sqft| price|
+------+---+----+----+------+
|    48|  3|   2| 713|228500|
|   152|  3|   1| 800|273950|
|    48|  3|   1|1082|350000|
|    55|  4|   3|2547|385100|
|    55|  4|   3|2769|415000|
+------+---+----+----+------+
only showing top 5 rows



In [8]:
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [9]:
y = ['price']
y_train_df = train_df.select(y)
x_train_df = train_df.drop('price')
y_test_df = test_df.select(y)
x_test_df = test_df.drop('price')

In [10]:
# xây dựng mô hình
class MLP_Regression(nn.Module):
  def __init__(self, input_size, hidden_layers, activations_func, output_size):
      super().__init__()
      layers = []
      pre_dim = input_size
      for h in hidden_layers:
        layers.append(nn.Linear(pre_dim, h))
        layers.append(activations_func())
        pre_dim = h
      layers.append(nn.Linear(pre_dim, output_size))
      layers.append(nn.ReLU())
      self.model = nn.Sequential(*layers)

  def forward(self, x):
      return self.model(x)

In [11]:
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader

# Hàm huấn luyện và chọn siêu tham số
best_RMSE_global = None
best_params_global = None

def Objective(X, y, trial):
  global best_RMSE_global
  global best_params_global
  ############# Khai báo hyper parameter grid #############
  num_hidden_layers = trial.suggest_int('num_hidden_layers', 2, 10)
  num_neural_for_each_hidden_layer = [trial.suggest_int(f'num_of_l{i}', 32, 256, step= 32) for i in range(num_hidden_layers)]
  activations_func = trial.suggest_categorical('activation', ['ReLU', 'Sigmoid', 'Tanh'])
  batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
  lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
  epochs = trial.suggest_int('epoch', 50, 200, step= 10)
  #########################################################
  activation = getattr(nn, activations_func)

  model = MLP_Regression(X.shape[1],num_neural_for_each_hidden_layer, activation, y.shape[1]).to(device)

  kf = KFold(n_splits=5, shuffle=True, random_state=42)

  rmse = []
  for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    x_tran_t = torch.tensor(X_train.values, dtype=torch.float32).to(device)
    y_tran_t = torch.tensor(y_train.values, dtype=torch.float32).to(device)
    x_val_t = torch.tensor(X_val.values, dtype=torch.float32).to(device)
    y_val_t = torch.tensor(y_val.values, dtype=torch.float32).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    train_dataset = TensorDataset(x_tran_t, y_tran_t)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
      model.train()
      for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
      y_pred = model(x_val_t)
      loss = loss_fn(y_pred, y_val_t)
      rmse.append(np.sqrt(loss.item()))

  if best_RMSE_global is None or np.mean(rmse) < best_RMSE_global:

    best_RMSE_global = np.mean(rmse)
    best_params_global = trial.params

    torch.save({
    "input_dim": X.shape[1],
    "hidden_dim": num_neural_for_each_hidden_layer,
    "output_dim": y.shape[1],
    "activation_func": activations_func,
    "model_state_dict": model.state_dict()
      }, "checkpoint.pth")

  return np.mean(rmse)


In [12]:
train = optuna.create_study(direction='minimize')
x_train_PD = x_train_df.toPandas()
y_train_PD = y_train_df.toPandas()
# x_test_np = x_test_df.toPandas()
# y_test_np = y_test_df.toPandas()
train.optimize(lambda trial: Objective(x_train_PD, y_train_PD, trial), n_trials = 50)

[I 2025-10-20 17:03:01,225] A new study created in memory with name: no-name-709516e5-48a5-43e2-94a3-3339d94b503f
[I 2025-10-20 17:15:51,978] Trial 0 finished with value: 828831.0892946229 and parameters: {'num_hidden_layers': 10, 'num_of_l0': 224, 'num_of_l1': 224, 'num_of_l2': 96, 'num_of_l3': 224, 'num_of_l4': 224, 'num_of_l5': 32, 'num_of_l6': 256, 'num_of_l7': 160, 'num_of_l8': 32, 'num_of_l9': 32, 'activation': 'Tanh', 'batch_size': 64, 'lr': 0.00028783232672048787, 'epoch': 150}. Best is trial 0 with value: 828831.0892946229.
[I 2025-10-20 17:20:17,985] Trial 1 finished with value: 327886.39392959065 and parameters: {'num_hidden_layers': 2, 'num_of_l0': 192, 'num_of_l1': 64, 'activation': 'ReLU', 'batch_size': 32, 'lr': 0.00028896607261141226, 'epoch': 130}. Best is trial 1 with value: 327886.39392959065.
[I 2025-10-20 17:37:39,806] Trial 2 finished with value: 326387.3493089583 and parameters: {'num_hidden_layers': 10, 'num_of_l0': 160, 'num_of_l1': 192, 'num_of_l2': 32, 'num_o

KeyboardInterrupt: 