In [31]:
# !pip install optuna
# !pip install  torch torchvision 

In [1]:
import optuna
import torch
from torch import nn
import numpy
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.functions import vector_to_array
from pyspark.sql.functions import  col, when
import numpy as np


In [2]:
device  = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cpu


In [3]:
# đọc tập dữ liệu
spark  = SparkSession.builder.appName('Onehot').getOrCreate()

In [5]:
df = spark.read.csv('cleaned_data/data.csv', header=True, inferSchema=True)
df.show(5)

+--------+--------------------+------------+------+---+----+----+------+
|image_id|              street|        citi|n_citi|bed|bath|sqft| price|
+--------+--------------------+------------+------+---+----+----+------+
|       1|      124 C Street W| Brawley, CA|    48|  3|   2| 713|228500|
|       2|     2304 Clark Road|Imperial, CA|   152|  3|   1| 800|273950|
|       3|  755 Brawley Avenue| Brawley, CA|    48|  3|   1|1082|350000|
|       4|2207 R Carrillo C...|Calexico, CA|    55|  4|   3|2547|385100|
|       6| 1100 CAMILIA Street|Calexico, CA|    55|  4|   3|2769|415000|
+--------+--------------------+------------+------+---+----+----+------+
only showing top 5 rows


In [6]:
# không sử dụng dữ liệu hình ảnh
features = ['image_id', 'citi', 'bed', 'bath', 'sqft', 'price']
df = df.select(features)
df.show(5)

+--------+------------+---+----+----+------+
|image_id|        citi|bed|bath|sqft| price|
+--------+------------+---+----+----+------+
|       1| Brawley, CA|  3|   2| 713|228500|
|       2|Imperial, CA|  3|   1| 800|273950|
|       3| Brawley, CA|  3|   1|1082|350000|
|       4|Calexico, CA|  4|   3|2547|385100|
|       6|Calexico, CA|  4|   3|2769|415000|
+--------+------------+---+----+----+------+
only showing top 5 rows


In [None]:
df.printSchema()

root
 |-- image_id: integer (nullable = true)
 |-- citi: string (nullable = true)
 |-- bed: integer (nullable = true)
 |-- bath: integer (nullable = true)
 |-- sqft: integer (nullable = true)
 |-- price: integer (nullable = true)



In [7]:
# indexing và encoding cho feature citi
indexer = StringIndexer(inputCol='citi', outputCol='citi_index')
encoder = OneHotEncoder(inputCols=['citi_index'], outputCols=['citi_vec'], dropLast=False)

df_indexed = indexer.fit(df).transform(df)
df_encoded = encoder.fit(df_indexed).transform(df_indexed)

df_arr = df_encoded.withColumn("citi_encoded_arr", vector_to_array("citi_vec"))



In [39]:
df_arr.show(5)

+--------+------------+---+----+----+------+----------+-----------------+--------------------+
|image_id|        citi|bed|bath|sqft| price|citi_index|         citi_vec|    citi_encoded_arr|
+--------+------------+---+----+----+------+----------+-----------------+--------------------+
|       1| Brawley, CA|  3|   2| 713|228500|     324.0|(415,[324],[1.0])|[0.0, 0.0, 0.0, 0...|
|       2|Imperial, CA|  3|   1| 800|273950|     387.0|(415,[387],[1.0])|[0.0, 0.0, 0.0, 0...|
|       3| Brawley, CA|  3|   1|1082|350000|     324.0|(415,[324],[1.0])|[0.0, 0.0, 0.0, 0...|
|       4|Calexico, CA|  4|   3|2547|385100|     343.0|(415,[343],[1.0])|[0.0, 0.0, 0.0, 0...|
|       6|Calexico, CA|  4|   3|2769|415000|     343.0|(415,[343],[1.0])|[0.0, 0.0, 0.0, 0...|
+--------+------------+---+----+----+------+----------+-----------------+--------------------+
only showing top 5 rows



In [8]:
citi_labels = indexer.fit(df).labels

In [9]:
for i, label in enumerate(citi_labels):
   df_arr = df_arr.withColumn(f"{label}", col('citi_encoded_arr')[i])
# df_arr.show(5)

In [10]:
# chọn xóa những features không cần thiết
df_arr = df_arr.drop('citi', 'citi_index', 'citi_vec', 'citi_encoded_arr')
# df_arr.show(5)
# chia dữ liệu train test
train_df, test_df = df_arr.randomSplit([0.7, 0.3], seed=42)

In [11]:
y = ['price']
y_train_df = train_df.select(y)

x_train_df = train_df.drop('price')
x_train_df1 = train_df.drop('image_id', 'price')

y_test_df = test_df.select(y)

x_test_df = test_df.drop('price')
x_test_df1 = test_df.drop('price', 'image_id')


In [44]:
# x_train_df.show()

In [45]:
y_train_df.show()

+-------+
|  price|
+-------+
| 228500|
| 273950|
| 385100|
| 415000|
| 545000|
|1350000|
| 995000|
|1550000|
|1850000|
| 249000|
| 195000|
| 229000|
| 239900|
| 239900|
| 219000|
| 220000|
| 199500|
| 225000|
| 234900|
| 199900|
+-------+
only showing top 20 rows



In [12]:
# xây dựng mô hình
class MLP_Regression(nn.Module):
  def __init__(self, input_size, hidden_layers, activations_func, output_size):
      super().__init__()
      layers = []
      pre_dim = input_size
      for h in hidden_layers:
        layers.append(nn.Linear(pre_dim, h))
        layers.append(activations_func())
        pre_dim = h
      layers.append(nn.Linear(pre_dim, output_size))
      layers.append(nn.ReLU())
      self.model = nn.Sequential(*layers)

  def forward(self, x):
      return self.model(x)

In [None]:
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader, Dataset

# Hàm huấn luyện và chọn siêu tham số
best_RMSE_global = None
best_params_global = None

def Objective(X, y, trial):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  global best_RMSE_global
  global best_params_global
  ############# Khai báo hyper parameter grid #############
  num_hidden_layers = trial.suggest_int('num_hidden_layers', 4, 12)
  num_neural_for_each_hidden_layer = [trial.suggest_int(f'num_of_l{i}', 64, 512, step= 64) for i in range(num_hidden_layers)]
  activations_func = trial.suggest_categorical('activation', ['ReLU', 'Sigmoid', 'Tanh'])
  batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
  lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
  epochs = trial.suggest_int('epoch', 50, 200, step= 10)
  #########################################################
  activation = getattr(nn, activations_func)

  model = MLP_Regression(X.shape[1],num_neural_for_each_hidden_layer, activation, y.shape[1]).to(device)

  kf = KFold(n_splits=5, shuffle=True, random_state=42)
  rmse = []
  for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    x_tran_t = torch.tensor(X_train.values, dtype=torch.float32).to(device)
    y_tran_t = torch.tensor(y_train.values, dtype=torch.float32).to(device)
    x_val_t = torch.tensor(X_val.values, dtype=torch.float32).to(device)
    y_val_t = torch.tensor(y_val.values, dtype=torch.float32).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    train_dataset = TensorDataset(x_tran_t, y_tran_t)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


    for epoch in range(epochs):
      model.train()
      for x_batch, y_batch in train_loader:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(x_batch)
        loss = loss_fn(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
      y_pred = model(x_val_t)
      loss = loss_fn(y_pred, y_val_t)
      rmse.append(np.sqrt(loss.item()))
  
  if best_RMSE_global is None or np.mean(rmse) < best_RMSE_global:

    best_RMSE_global = np.mean(rmse)
    best_params_global = trial.params

    torch.save({
    "input_dim": X.shape[1],
    "hidden_dim": num_neural_for_each_hidden_layer,
    "output_dim": y.shape[1],
    "activation_func": activations_func,
    "model_state_dict": model.state_dict()
      }, "/kaggle/working/checkpoint1.pth")

  return np.mean(rmse)


In [48]:
train = optuna.create_study(direction='minimize')
x_train_PD = x_train_df1.toPandas()
y_train_PD = y_train_df.toPandas()
# x_test_np = x_test_df.toPandas()
# y_test_np = y_test_df.toPandas()
train.optimize(lambda trial: Objective(x_train_PD, y_train_PD, trial), n_trials = 70)

[I 2025-10-25 18:06:55,040] A new study created in memory with name: no-name-d0969222-16e9-44ef-827a-bc4d65b94cc2
[I 2025-10-25 18:11:00,675] Trial 0 finished with value: 825394.0133616516 and parameters: {'num_hidden_layers': 5, 'num_of_l0': 320, 'num_of_l1': 64, 'num_of_l2': 256, 'num_of_l3': 128, 'num_of_l4': 256, 'activation': 'Tanh', 'batch_size': 64, 'lr': 0.005869402882338814, 'epoch': 190}. Best is trial 0 with value: 825394.0133616516.
[I 2025-10-25 18:12:33,355] Trial 1 finished with value: 825394.0133616516 and parameters: {'num_hidden_layers': 7, 'num_of_l0': 256, 'num_of_l1': 384, 'num_of_l2': 128, 'num_of_l3': 512, 'num_of_l4': 512, 'num_of_l5': 512, 'num_of_l6': 256, 'activation': 'Tanh', 'batch_size': 128, 'lr': 0.004835322299654834, 'epoch': 110}. Best is trial 0 with value: 825394.0133616516.
[I 2025-10-25 18:15:06,640] Trial 2 finished with value: 230647.1159788746 and parameters: {'num_hidden_layers': 8, 'num_of_l0': 192, 'num_of_l1': 64, 'num_of_l2': 384, 'num_of_l

In [13]:

# load lại mô hình
checkpoint = torch.load("checkpoint1.pth", map_location="cpu")

activation_func = checkpoint['activation_func']
activation = getattr(nn, activation_func)
input_dim = checkpoint['input_dim']
output_dim = checkpoint['output_dim']
hidden_dim = checkpoint['hidden_dim']
model = MLP_Regression(input_dim, hidden_dim, activation, output_dim).to(device)
model.load_state_dict(checkpoint['model_state_dict'])

tabular_model  = model

In [19]:
# đánh giá trên tập dữ liệu test
import torch.nn.functional as F

x_test_pd = x_test_df1.toPandas()
y_test_pd = y_test_df.toPandas()

X_test_tensor = torch.tensor(x_test_pd.values, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test_pd.values, dtype=torch.float32).to(device)

model.eval()
with torch.no_grad():
    y_pred = tabular_model(X_test_tensor).squeeze()

mse = F.mse_loss(y_pred, y_test_tensor)
rmse = torch.sqrt(mse)


print(f"Test RMSE: {rmse.item():.4f}")


Test RMSE: 534359.9375


  mse = F.mse_loss(y_pred, y_test_tensor)


In [12]:
# đọc dữ liệu hình ảnh cho train
import os

train_image_id = train_df.select("image_id").collect()
y_df = train_df.select("price")
images_dir = "/kaggle/input/dataset2/socal2/socal_pics/"



25/10/25 23:24:42 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [13]:
train_image_id = [row.image_id for row in train_image_id]


In [52]:
# train_df.show(5)

In [17]:
from PIL import Image
class MultiModalDataset(Dataset):
    def __init__(self, df, image_dir, transform):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

        self.image_ids = self.df['image_id'].values
        self.tabular_data = self.df.drop(['price', 'image_id'], axis=1).values
        self.target = self.df['price'].values

    def __getitem__(self, index):
        image_id = self.image_ids[index]
        image_path = os.path.join(self.image_dir, f"{image_id}.jpg")

        image = Image.open(image_path).convert('RGB')
        image = self.transform(image)

        tabular = torch.tensor(self.tabular_data[index], dtype=torch.float32)
        target = torch.tensor(self.target[index], dtype=torch.float32)

        return image, tabular, target

    def __len__(self):
        return len(self.df)


In [18]:
from torchvision import transforms
import torchvision.models as models

input_size = (311, 415)

transform = transforms.Compose([
    transforms.ToTensor()
])

In [19]:
class CNN_model(nn.Module):
    def __init__(self):
        super().__init__()
        layers = []

        # Conv block 1
        layers.append(nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1))
        layers.append(nn.BatchNorm2d(32))
        layers.append(nn.ReLU())
        layers.append(nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1))
        layers.append(nn.BatchNorm2d(32))
        layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(2,2))
        layers.append(nn.Dropout(0.2))

        # Conv block 2
        layers.append(nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1))
        layers.append(nn.BatchNorm2d(64))
        layers.append(nn.ReLU())
        layers.append(nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1))
        layers.append(nn.BatchNorm2d(64))
        layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(2,2))
        layers.append(nn.Dropout(0.3))

        # Conv block 3
        layers.append(nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1))
        layers.append(nn.BatchNorm2d(128))
        layers.append(nn.ReLU())
        layers.append(nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1))
        layers.append(nn.BatchNorm2d(128))
        layers.append(nn.ReLU())
        layers.append(nn.AdaptiveAvgPool2d((7,7)))
        layers.append(nn.Dropout(0.4))

        self.features = nn.Sequential(*layers)
        self.flatten = nn.Flatten()
        self.fc = nn.Sequential(
            nn.Linear(128*7*7, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.features(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x


In [22]:
class TabularModel(nn.Module):
    def __init__(self, input_dim, hidden_dims=[256,128, 64, 32], tabular_out_dim=32, dropout=0.2):
        super().__init__()
        layers = []
        in_dim = input_dim
        for h_dim in hidden_dims:
            layers.append(nn.Linear(in_dim, h_dim))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(h_dim))
            layers.append(nn.Dropout(dropout))
            in_dim = h_dim
        self.mlp = nn.Sequential(*layers)
        self.out = nn.Linear(hidden_dims[-1], tabular_out_dim)  # output nhiều chiều

    def forward(self, x):
        x = self.mlp(x)
        x = self.out(x)
        return x

In [23]:
class FusionModel(nn.Module):
    def __init__(self, tabular_model, cnn_model, tabular_out_dim=32):
        super().__init__()
        
        self.tabular_model = tabular_model
        self.cnn_model = cnn_model

        fusion_input_size = tabular_out_dim + 256  # nhiều feature từ tabular + cnn
        self.fusion_mlp = nn.Sequential(
            nn.Linear(fusion_input_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)  # dự đoán giá nhà
        )

    def forward(self, x_tabular, x_cnn):
        x_tabular = self.tabular_model(x_tabular)
        x_cnn = self.cnn_model(x_cnn)
        x_fusion = torch.cat([x_tabular, x_cnn], dim=1)
        return self.fusion_mlp(x_fusion)


In [24]:
cnn_model = CNN_model().to(device)

tabular_model = TabularModel(x_train_df1.toPandas().shape[1]).to(device)

fusion_model = FusionModel(tabular_model, cnn_model ).to(device)

                                                                                

In [25]:
from torch.utils.data import random_split, DataLoader

train_dataset = MultiModalDataset(train_df.toPandas(), images_dir, transform)

optimizer = torch.optim.Adam(fusion_model.parameters(), lr=0.0005)
loss_fn = nn.MSELoss()

max_epochs = 200
patience = 10  # số epoch liên tiếp không cải thiện thì dừng
best_val_rmse = float('inf')
trigger_times = 0

for epoch in range(max_epochs):
    print(f"\n===== Starting epoch: {epoch+1} =====")

    # --- Re-split train/val mỗi epoch ---
    dataset_size = len(train_dataset)
    val_size = int(0.3 * dataset_size)
    train_size = dataset_size - val_size
    train_subset, val_subset = random_split(train_dataset, [train_size, val_size])

    train_loader = DataLoader(train_subset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=64, shuffle=False)

    # --------- Training ---------
    fusion_model.train()
    running_loss = 0.0
    for image, tabular, target in train_loader:
        image, tabular, target = image.to(device), tabular.to(device), target.to(device)
        optimizer.zero_grad()
        y_pred = fusion_model(tabular, image)
        loss = loss_fn(y_pred.squeeze(), target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * image.size(0)

    train_mse = running_loss / len(train_loader.dataset)
    train_rmse = train_mse ** 0.5

    # --------- Validation ---------
    fusion_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for image, tabular, target in val_loader:
            image, tabular, target = image.to(device), tabular.to(device), target.to(device)
            y_pred = fusion_model(tabular, image)
            loss = loss_fn(y_pred.squeeze(), target)
            val_loss += loss.item() * image.size(0)

    val_mse = val_loss / len(val_loader.dataset)
    val_rmse = val_mse ** 0.5

    print(f"Epoch [{epoch+1}/{max_epochs}] - Train RMSE: {train_rmse:.4f}, Val RMSE: {val_rmse:.4f}")

    # --------- Early Stopping & Save Best Model ---------
    if val_rmse < best_val_rmse:
        best_val_rmse = val_rmse
        trigger_times = 0
        torch.save(fusion_model.state_dict(), "/kaggle/working/checkpoint2.pth")
        print(f"✅ Saved new best model with Val RMSE: {best_val_rmse:.4f}")
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print(f"⏹️ Early stopping at epoch {epoch+1}")
            break


                                                                                


===== Starting epoch: 1 =====
Epoch [1/200] - Train RMSE: 677742.9408, Val RMSE: 386024.5440
✅ Saved new best model with Val RMSE: 386024.5440

===== Starting epoch: 2 =====
Epoch [2/200] - Train RMSE: 396767.3820, Val RMSE: 379725.1439
✅ Saved new best model with Val RMSE: 379725.1439

===== Starting epoch: 3 =====
Epoch [3/200] - Train RMSE: 390902.2593, Val RMSE: 370397.8035
✅ Saved new best model with Val RMSE: 370397.8035

===== Starting epoch: 4 =====
Epoch [4/200] - Train RMSE: 382849.3624, Val RMSE: 377384.8512

===== Starting epoch: 5 =====
Epoch [5/200] - Train RMSE: 378514.9315, Val RMSE: 374549.6736

===== Starting epoch: 6 =====
Epoch [6/200] - Train RMSE: 374775.0273, Val RMSE: 352142.6054
✅ Saved new best model with Val RMSE: 352142.6054

===== Starting epoch: 7 =====
Epoch [7/200] - Train RMSE: 365610.5384, Val RMSE: 378585.9303

===== Starting epoch: 8 =====
Epoch [8/200] - Train RMSE: 383024.5280, Val RMSE: 378747.5007

===== Starting epoch: 9 =====
Epoch [9/200] - T