In [None]:
!pip install optuna
!pip install  torch torchvision 

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [None]:
import optuna
import torch
from torch import nn
import numpy
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import  col, when
import numpy as np


In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device  = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Create (or reuse) the Spark session used to read the dataset.
spark  = SparkSession.builder.appName('Onehot').getOrCreate()

In [None]:
# Load the cleaned listings CSV: the first row supplies the column names and
# inferSchema asks Spark to detect the column types.
# The relative path must not contain spaces after "..": on POSIX file systems
# ".. /" names a directory literally called ".. " instead of the parent folder.
df = spark.read.csv('../../data/cleaned_data/part-00000-41f8a7f7-767e-4f2b-9ac9-b59ff9174cdb-c000.csv', header=True, inferSchema=True)
df.show(5)

+--------+--------------------+------------+------+---+----+----+------+
|image_id|              street|        citi|n_citi|bed|bath|sqft| price|
+--------+--------------------+------------+------+---+----+----+------+
|       1|      124 C Street W| Brawley, CA|    48|  3|   2| 713|228500|
|       2|     2304 Clark Road|Imperial, CA|   152|  3|   1| 800|273950|
|       3|  755 Brawley Avenue| Brawley, CA|    48|  3|   1|1082|350000|
|       4|2207 R Carrillo C...|Calexico, CA|    55|  4|   3|2547|385100|
|       6| 1100 CAMILIA Street|Calexico, CA|    55|  4|   3|2769|415000|
+--------+--------------------+------------+------+---+----+----+------+
only showing top 5 rows



In [None]:
# Image data is not used at this stage; only the tabular columns (plus the
# image_id identifier and the price target) are kept.
features = ['image_id', 'citi', 'bed', 'bath', 'sqft', 'price']
df = df.select(features)
df.show(5)

+--------+------------+---+----+----+------+
|image_id|        citi|bed|bath|sqft| price|
+--------+------------+---+----+----+------+
|       1| Brawley, CA|  3|   2| 713|228500|
|       2|Imperial, CA|  3|   1| 800|273950|
|       3| Brawley, CA|  3|   1|1082|350000|
|       4|Calexico, CA|  4|   3|2547|385100|
|       6|Calexico, CA|  4|   3|2769|415000|
+--------+------------+---+----+----+------+
only showing top 5 rows



In [None]:
# Show the column types of the selected columns.
df.printSchema()

root
 |-- image_id: integer (nullable = true)
 |-- citi: string (nullable = true)
 |-- bed: integer (nullable = true)
 |-- bath: integer (nullable = true)
 |-- sqft: integer (nullable = true)
 |-- price: integer (nullable = true)



In [None]:
# Indexing and one-hot encoding of the `citi` feature.
indexer = StringIndexer(inputCol='citi', outputCol='citi_index')
# dropLast=False keeps a slot for every category instead of omitting the last one.
encoder = OneHotEncoder(inputCols=['citi_index'], outputCols=['citi_vec'], dropLast=False)

# Each stage is fitted on the full DataFrame and applied immediately.
df_indexed = indexer.fit(df).transform(df)
df_encoded = encoder.fit(df_indexed).transform(df_indexed)

# Turn the ML vector column into a plain array column so that single
# elements can be picked out by position later on.
df_arr = df_encoded.withColumn("citi_encoded_arr", vector_to_array("citi_vec"))



In [None]:
# Preview the index, vector and array columns added by the encoding step.
df_arr.show(5)

+--------+------------+---+----+----+------+----------+-----------------+--------------------+
|image_id|        citi|bed|bath|sqft| price|citi_index|         citi_vec|    citi_encoded_arr|
+--------+------------+---+----+----+------+----------+-----------------+--------------------+
|       1| Brawley, CA|  3|   2| 713|228500|     324.0|(415,[324],[1.0])|[0.0, 0.0, 0.0, 0...|
|       2|Imperial, CA|  3|   1| 800|273950|     387.0|(415,[387],[1.0])|[0.0, 0.0, 0.0, 0...|
|       3| Brawley, CA|  3|   1|1082|350000|     324.0|(415,[324],[1.0])|[0.0, 0.0, 0.0, 0...|
|       4|Calexico, CA|  4|   3|2547|385100|     343.0|(415,[343],[1.0])|[0.0, 0.0, 0.0, 0...|
|       6|Calexico, CA|  4|   3|2769|415000|     343.0|(415,[343],[1.0])|[0.0, 0.0, 0.0, 0...|
+--------+------------+---+----+----+------+----------+-----------------+--------------------+
only showing top 5 rows



In [None]:
# City labels, used below as the column name for each position of the one-hot array.
# NOTE(review): this fits the StringIndexer on `df` a second time; presumably it
# produces the same label order as the fit that created `citi_index` — confirm,
# or keep the first fitted model and read `.labels` from it.
citi_labels = indexer.fit(df).labels

In [None]:
# Expand the one-hot array into one column per city label.
# All columns are added with a single select(): calling withColumn once per
# label (several hundred times here) stacks one projection per call and yields
# a very large query plan.
label_names = set(citi_labels)
# Columns already named like a label are left out of the base set and rebuilt,
# which mirrors withColumn's "replace if it exists" behaviour and keeps the
# cell safe to re-run.
base_cols = [c for c in df_arr.columns if c not in label_names]
onehot_cols = [
    col('citi_encoded_arr')[i].alias(f"{label}")
    for i, label in enumerate(citi_labels)
]
df_arr = df_arr.select(*base_cols, *onehot_cols)
df_arr.show(5)

+--------+------------+---+----+----+------+----------+-----------------+--------------------+-------------+---------------+-------------+-------------+-------------+----------+-------------+------------+-----------+------------+----------------+--------------------+-------------+------------+---------------+---------------+------------------+------------------+-----------------+------------+---------------+---------+----------+---------+---------------+-------------+------------+---------------+-----------+-------------+--------------+-----------+----------+---------------+----------+----------------+-----------+-------------+---------+--------------+-----------------+-----------+------------+--------------+------------+---------+-------------------+----------+----------+-----------------+--------------+---------------+-----------------+-----------------+----------+----------------+--------------------+-----------+-----------+----------------+----------+-----------+-----------------+-

In [None]:
# Drop the features that are no longer needed (the raw city string and the
# intermediate index / vector / array columns).
df_arr = df_arr.drop('citi', 'citi_index', 'citi_vec', 'citi_encoded_arr')
df_arr.show(5)
# Split the data into train (70%) and test (30%) sets with a fixed seed.
train_df, test_df = df_arr.randomSplit([0.7, 0.3], seed=42)

+--------+---+----+----+------+-------------+---------------+-------------+-------------+-------------+----------+-------------+------------+-----------+------------+----------------+--------------------+-------------+------------+---------------+---------------+------------------+------------------+-----------------+------------+---------------+---------+----------+---------+---------------+-------------+------------+---------------+-----------+-------------+--------------+-----------+----------+---------------+----------+----------------+-----------+-------------+---------+--------------+-----------------+-----------+------------+--------------+------------+---------+-------------------+----------+----------+-----------------+--------------+---------------+-----------------+-----------------+----------+----------------+--------------------+-----------+-----------+----------------+----------+-----------+-----------------+----------------------+---------------+-----------+-------------

In [None]:
# Separate the regression target from the model inputs.
y = ['price']
y_train_df = train_df.select(y)
# image_id is an identifier, not a predictor, so it is removed along with the target.
x_train_df = train_df.drop('price', 'image_id')
y_test_df = test_df.select(y)
# Drop exactly the same columns as for the training inputs; otherwise the test
# matrix has one more column than the network was trained on.
x_test_df = test_df.drop('price', 'image_id')


In [None]:
# Preview the training inputs (show() prints the first 20 rows by default).
x_train_df.show()

+---+----+----+-------------+---------------+-------------+-------------+-------------+----------+-------------+------------+-----------+------------+----------------+--------------------+-------------+------------+---------------+---------------+------------------+------------------+-----------------+------------+---------------+---------+----------+---------+---------------+-------------+------------+---------------+-----------+-------------+--------------+-----------+----------+---------------+----------+----------------+-----------+-------------+---------+--------------+-----------------+-----------+------------+--------------+------------+---------+-------------------+----------+----------+-----------------+--------------+---------------+-----------------+-----------------+----------+----------------+--------------------+-----------+-----------+----------------+----------+-----------+-----------------+----------------------+---------------+-----------+----------------+------------

In [None]:
# Preview the training targets (prices).
y_train_df.show()

+-------+
|  price|
+-------+
| 228500|
| 273950|
| 385100|
| 415000|
| 545000|
|1350000|
| 995000|
|1550000|
|1850000|
| 249000|
| 195000|
| 229000|
| 239900|
| 239900|
| 219000|
| 220000|
| 199500|
| 225000|
| 234900|
| 199900|
+-------+
only showing top 20 rows



In [None]:
# Build the model
class MLP_Regression(nn.Module):
    """Fully connected regression network.

    The network is a stack of ``Linear -> activation`` pairs, one per entry of
    ``hidden_layers``, followed by a final ``Linear`` and a ``ReLU`` that
    clamps the prediction to be non-negative.

    Args:
        input_size: number of input features.
        hidden_layers: iterable with the width of each hidden layer.
        activations_func: zero-argument callable returning an activation
            module (for example ``nn.Tanh``); called once per hidden layer.
        output_size: number of output values.
    """

    def __init__(self, input_size, hidden_layers, activations_func, output_size):
        super().__init__()
        widths = [input_size, *hidden_layers]
        stack = []
        # Consecutive width pairs give the (in, out) size of each hidden layer.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), activations_func()]
        # Output head: projection to the target size, then ReLU.
        stack += [nn.Linear(widths[-1], output_size), nn.ReLU()]
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the whole stack and return the prediction."""
        return self.model(x)

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader, Dataset

# Training function and hyper-parameter selection.
# Module-level trackers shared across trials: the best mean RMSE seen so far
# and the trial parameters that produced it. Both are updated by Objective.
best_RMSE_global = None
best_params_global = None

def Objective(X, y, trial):
  """Optuna objective: mean 5-fold cross-validated RMSE of an MLP_Regression.

  A fresh network is built and trained for every fold. Re-using one network
  across folds would let later folds validate on rows the weights have already
  been trained on, which makes the reported RMSE optimistically biased.

  Args:
    X: pandas DataFrame of input features (indexed with ``.iloc``).
    y: pandas DataFrame of targets; its column count sets the output width.
    trial: Optuna trial used to sample the hyper-parameters.

  Returns:
    The mean validation RMSE over the folds.

  Side effects:
    When this trial beats the best score seen so far, updates
    ``best_RMSE_global`` / ``best_params_global`` and writes "checkpoint.pth"
    containing the architecture description and the weights of the network
    trained on the last fold.
  """
  global best_RMSE_global
  global best_params_global
  ############# Hyper-parameter search space #############
  num_hidden_layers = trial.suggest_int('num_hidden_layers', 2, 10)
  num_neural_for_each_hidden_layer = [trial.suggest_int(f'num_of_l{i}', 32, 256, step= 32) for i in range(num_hidden_layers)]
  activations_func = trial.suggest_categorical('activation', ['ReLU', 'Sigmoid', 'Tanh'])
  batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
  lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
  epochs = trial.suggest_int('epoch', 50, 200, step= 10)
  #########################################################
  # The activation is sampled by name and resolved to the torch.nn class.
  activation = getattr(nn, activations_func)
  input_dim, output_dim = X.shape[1], y.shape[1]

  kf = KFold(n_splits=5, shuffle=True, random_state=42)
  loss_fn = nn.MSELoss()

  rmse = []
  model = None
  for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    x_tran_t = torch.tensor(X_train.values, dtype=torch.float32).to(device)
    y_tran_t = torch.tensor(y_train.values, dtype=torch.float32).to(device)
    x_val_t = torch.tensor(X_val.values, dtype=torch.float32).to(device)
    y_val_t = torch.tensor(y_val.values, dtype=torch.float32).to(device)

    # New weights and a new optimizer for every fold, so that no information
    # from this fold's validation rows has been seen during training.
    model = MLP_Regression(input_dim, num_neural_for_each_hidden_layer, activation, output_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    train_dataset = TensorDataset(x_tran_t, y_tran_t)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model.train()
    for epoch in range(epochs):
      for x_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    # Score the fold on its held-out rows; RMSE is the square root of the MSE.
    model.eval()
    with torch.no_grad():
      y_pred = model(x_val_t)
      loss = loss_fn(y_pred, y_val_t)
      rmse.append(np.sqrt(loss.item()))

  mean_rmse = float(np.mean(rmse))

  if best_RMSE_global is None or mean_rmse < best_RMSE_global:

    best_RMSE_global = mean_rmse
    best_params_global = trial.params

    # Persist everything needed to rebuild the network later: the
    # architecture description plus the weights from the last fold.
    torch.save({
      "input_dim": input_dim,
      "hidden_dim": num_neural_for_each_hidden_layer,
      "output_dim": output_dim,
      "activation_func": activations_func,
      "model_state_dict": model.state_dict()
    }, "checkpoint.pth")

  return mean_rmse


In [None]:
# Hyper-parameter search with Optuna — currently disabled (every line below is commented out).
# NOTE(review): the next cell loads "checkpoint.pth", the file Objective writes, so
# this search must have been run at least once before the notebook can run top to bottom.
# train = optuna.create_study(direction='minimize')
# x_train_PD = x_train_df.toPandas()
# y_train_PD = y_train_df.toPandas()
# # x_test_np = x_test_df.toPandas()
# # y_test_np = y_test_df.toPandas()
# train.optimize(lambda trial: Objective(x_train_PD, y_train_PD, trial), n_trials = 50)

In [None]:

# Reload the model: read the checkpoint onto the CPU, rebuild the architecture
# from the hyper-parameters stored in it, then restore the trained weights.
checkpoint = torch.load("checkpoint.pth", map_location="cpu")

# The activation is stored by class name and resolved on torch.nn.
activation_func = checkpoint['activation_func']
activation = getattr(nn, activation_func)
input_dim = checkpoint['input_dim']
output_dim = checkpoint['output_dim']
hidden_dim = checkpoint['hidden_dim']
# Build the network on the selected device before loading the weights into it.
model = MLP_Regression(input_dim, hidden_dim, activation, output_dim).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

# Alias under which the restored network is referred to from here on.
tabular_model  = model

In [None]:
# Preview the held-out test split.
test_df.show(5)

+--------+---+----+----+-------+-------------+---------------+-------------+-------------+-------------+----------+-------------+------------+-----------+------------+----------------+--------------------+-------------+------------+---------------+---------------+------------------+------------------+-----------------+------------+---------------+---------+----------+---------+---------------+-------------+------------+---------------+-----------+-------------+--------------+-----------+----------+---------------+----------+----------------+-----------+-------------+---------+--------------+-----------------+-----------+------------+--------------+------------+---------+-------------------+----------+----------+-----------------+--------------+---------------+-----------------+-----------------+----------+----------------+--------------------+-----------+-----------+----------------+----------+-----------+-----------------+----------------------+---------------+-----------+------------

In [None]:
# Read the image data for the training set.
import os

# Collect the image_id of every training row to the driver (a list of Row objects).
train_image_id = train_df.select("image_id").collect()
# Training targets, kept as a Spark DataFrame.
y_df = train_df.select("price")
# Folder containing the listing pictures.
images_dir = "../../data/socal2/socal_pics/"



In [None]:
# Unwrap each collected Row into its plain image_id value.
# getattr with a default makes the cell safe to re-run: once the list already
# holds plain ids (which have no `image_id` attribute) they are kept as they
# are instead of raising AttributeError.
train_image_id = [getattr(row, "image_id", row) for row in train_image_id]


In [None]:
# Preview the training split.
train_df.show(5)

+--------+---+----+----+------+-------------+---------------+-------------+-------------+-------------+----------+-------------+------------+-----------+------------+----------------+--------------------+-------------+------------+---------------+---------------+------------------+------------------+-----------------+------------+---------------+---------+----------+---------+---------------+-------------+------------+---------------+-----------+-------------+--------------+-----------+----------+---------------+----------+----------------+-----------+-------------+---------+--------------+-----------------+-----------+------------+--------------+------------+---------+-------------------+----------+----------+-----------------+--------------+---------------+-----------------+-----------------+----------+----------------+--------------------+-----------+-----------+----------------+----------+-----------+-----------------+----------------------+---------------+-----------+-------------

In [None]:
from PIL import Image
class MultiModalDataset(Dataset):
    """Dataset yielding ``(image, tabular, target)`` triples for one listing.

    Args:
        df: pandas DataFrame with an ``image_id`` column, a ``price`` column
            and the tabular feature columns (every other column).
        image_dir: directory containing one ``<image_id>.jpg`` file per row.
        transform: callable applied to the RGB PIL image; its result is
            returned as the image component of each sample.
    """

    def __init__(self, df, image_dir, transform):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

        # Pull the columns out once so __getitem__ only does array indexing.
        self.image_ids = self.df['image_id'].values
        self.tabular_data = self.df.drop(['price', 'image_id'], axis=1).values
        self.target = self.df['price'].values

    def __getitem__(self, index):
        """Load the picture for row ``index`` and return it with its features and price."""
        image_id = self.image_ids[index]
        image_path = os.path.join(self.image_dir, f"{image_id}.jpg")

        # The context manager closes the underlying file as soon as the pixels
        # have been copied by convert(); otherwise every sample leaves a file
        # handle open until it happens to be garbage-collected.
        with Image.open(image_path) as img:
            image = img.convert('RGB')
        image = self.transform(image)

        tabular = torch.tensor(self.tabular_data[index], dtype=torch.float32)
        target = torch.tensor(self.target[index], dtype=torch.float32)

        return image, tabular, target

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)


In [None]:
from torchvision import transforms
import torchvision.models as models

# Target image size as (height, width).
input_size = (311, 415)

# Resize every picture to the common input size before converting it to a
# tensor: the DataLoader can only stack samples of identical shape into a
# batch, so one odd-sized file would otherwise abort training.
transform = transforms.Compose([
    transforms.Resize(input_size),
    transforms.ToTensor()
])

In [None]:
class CNN_model(nn.Module):
    """Small convolutional encoder mapping an RGB image to a 256-value feature vector.

    Four ``Conv2d(3x3, padding 1) -> BatchNorm2d -> ReLU`` stages widen the
    channels 3 -> 16 -> 32 -> 64 -> 128. The first three stages end with a 2x2
    max-pool; for a 311x415 input the pooled maps measure 155x207, 77x103 and
    38x51. The last stage ends with an adaptive average pool to 7x7, after
    which the map is flattened and projected to 256 features.
    """

    def __init__(self ):
        super().__init__()
        channels = (3, 16, 32, 64, 128)
        final_stage = len(channels) - 2

        stages = []
        for stage, (c_in, c_out) in enumerate(zip(channels, channels[1:])):
            stages.append(nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1))
            stages.append(nn.BatchNorm2d(c_out))
            stages.append(nn.ReLU())
            if stage == final_stage:
                # Fixed 7x7 output regardless of the incoming map size.
                stages.append(nn.AdaptiveAvgPool2d((7, 7)))
            else:
                stages.append(nn.MaxPool2d(2, 2))

        self.features = nn.Sequential(*stages)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(128*7*7, 256)

    def forward(self, x):
        """Encode a batch of images into a ``(batch, 256)`` feature tensor."""
        return self.fc(self.flatten(self.features(x)))



In [None]:
class FusionModel(nn.Module):
  """Late-fusion regressor combining a frozen tabular model with a CNN encoder.

  The tabular model's output and the CNN's image features are concatenated
  along the feature axis and passed through a small MLP that outputs a single
  value, followed by a ReLU.

  Args:
    tabular_model: pre-trained module applied to the tabular features. Its
      parameters are frozen and it is kept in eval mode.
    cnn_model: module applied to the image batch.
    tabular_out_dim: number of values ``tabular_model`` emits per sample.
    cnn_out_dim: number of features ``cnn_model`` emits per sample.
  """

  def __init__(self, tabular_model, cnn_model, tabular_out_dim=1, cnn_out_dim=256):
    super().__init__()
    # Freeze the pre-trained tabular branch: no gradients, inference mode.
    for p in tabular_model.parameters():
      p.requires_grad = False
    tabular_model.eval()
    self.tabular_model = tabular_model

    self.cnn_model = cnn_model

    fusion_input_size = tabular_out_dim + cnn_out_dim
    self.fusion_mlp = nn.Sequential(
      nn.Linear(fusion_input_size, 128),
      nn.ReLU(),
      nn.Linear(128, 64),
      nn.ReLU(),
      nn.Linear(64, 1),
      nn.ReLU(),
    )

  def train(self, mode=True):
    """Switch training mode for the trainable parts only.

    ``nn.Module.train`` propagates the mode to every child, which would put
    the frozen tabular model back into training mode; it is forced back to
    eval here so it stays in inference mode as intended by ``__init__``.
    """
    super().train(mode)
    self.tabular_model.eval()
    return self

  def forward(self, x_tabular, x_cnn):
    """Return the fused prediction for a batch of tabular rows and images."""
    x_tabular = self.tabular_model(x_tabular)
    x_cnn = self.cnn_model(x_cnn)
    # Concatenate per sample along the feature axis.
    x_fusion = torch.cat([x_tabular, x_cnn], dim=1)
    return self.fusion_mlp(x_fusion)

In [None]:
# Image encoder, trained from scratch together with the fusion head.
cnn_model = CNN_model().to(device)

# Load the checkpoint onto the device selected for this session. A hard-coded
# "cuda" target makes torch.load fail on machines without a GPU.
checkpoint = torch.load("checkpoint.pth", map_location=device)

# Rebuild the tabular network from the hyper-parameters stored in the
# checkpoint (the activation is stored by class name), then restore its weights.
activation_func = checkpoint['activation_func']
activation = getattr(nn, activation_func)
input_dim = checkpoint['input_dim']
output_dim = checkpoint['output_dim']
hidden_dim = checkpoint['hidden_dim']
model = MLP_Regression(input_dim, hidden_dim, activation, output_dim).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

tabular_model  = model
# Combine the restored tabular model and the CNN encoder into the fusion network.
fusion_model = FusionModel(tabular_model, cnn_model).to(device)




In [None]:
# Training data: the Spark split is converted to pandas and wrapped in the
# image + tabular dataset.
train_dataset = MultiModalDataset(train_df.toPandas(), images_dir, transform)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

optimizer = torch.optim.Adam(fusion_model.parameters(), lr=0.0005)
loss_fn = nn.MSELoss()

# Single source of truth for the epoch count, used by both the loop and the
# progress message.
num_epochs = 20

for epoch in range(num_epochs):
  print(f"Starting epoch:{epoch}")
  fusion_model.train()
  running_loss = 0.0
  for image, tabular, target in train_loader:
    # The model lives on `device`; the batch has to be moved there as well,
    # otherwise the forward pass fails with a device mismatch on GPU.
    image = image.to(device)
    tabular = tabular.to(device)
    target = target.to(device)

    y_pred = fusion_model(tabular, image)
    # squeeze(1) removes only the feature axis, so a final batch holding a
    # single sample keeps shape (1,) and still lines up with `target`.
    loss = loss_fn(y_pred.squeeze(1), target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Weight the batch mean by the batch size so the epoch average is exact.
    running_loss += loss.item() * image.size(0)

  epoch_mse = running_loss / len(train_loader.dataset)
  epoch_rmse = epoch_mse ** 0.5

  print(f"Epoch [{epoch+1}/{num_epochs}] - RMSE: {epoch_rmse:.4f}")

Starting epoch:0


KeyboardInterrupt: 