In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the pre-split datasets from the local data directory.
df_train = pd.read_csv('data/train.csv')
df_val = pd.read_csv('data/val.csv')
df_test = pd.read_csv('data/test.csv')

# Train and validation rows are stacked into one frame for exploration.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both source indexes are kept as-is and row labels repeat.
df = pd.concat([df_train, df_val], axis=0, ignore_index=True)

# Preview only the first rows instead of rendering the whole frame.
df.head()

# Visualize a bar chart for each feature

In [None]:
# Columns charted per category: everything except the target ('price') and
# the three columns listed alongside it.
features = df.drop(['price', 'area', 'street_in_front_of_house', 'width'], axis=1)
total_features = list(features.columns)
total_features

In [None]:
# For every charted feature build two aligned lists:
#   feature_unique_fields[feature] -> the distinct values, in sorted order
#   feature_unique_value[feature]  -> the mean price of the rows holding that value
#
# A single groupby per feature replaces re-filtering the frame and recomputing
# value_counts() for every distinct value. groupby sorts its keys by default,
# so both lists share the same order.
# Behavioural notes: rows whose feature value is missing are left out
# (groupby drops NaN keys), and missing prices are skipped by mean().
feature_unique_fields = {}
feature_unique_value = {}
for feature in total_features:
    mean_price_by_value = df.groupby(feature)['price'].mean()
    feature_unique_fields[feature] = mean_price_by_value.index.tolist()
    feature_unique_value[feature] = mean_price_by_value.tolist()

feature_unique_fields


In [None]:
visualized_features = total_features

# Column order the label/title lists are written for. `labels` and `titles`
# are positional: entry k describes name_of_features[k].
name_of_features = [
    'floor_number',
    'bedroom_number',
    'is_dinning_room',
    'is_kitchen',
    'is_terrace',
    'is_car_pack',
    'type',
    'direction',
    'city',
    'district',
]

# X-axis label per feature. The entry for 'direction' used to be missing, which
# shifted the 'city'/'district' labels by one and left the last feature without any.
labels = [
    'Số tầng',                  # floor_number
    'Số phòng ngủ',             # bedroom_number
    'Có phòng ăn hay không',    # is_dinning_room
    'Có phòng bếp hay không',   # is_kitchen
    'Có sân thượng hay không',  # is_terrace
    'Có chỗ để xe hay không',   # is_car_pack
    'Loại bất động sản',        # type
    'Hướng nhà',                # direction
    'Thành phố',                # city
    'Quận/Huyện',               # district
]

# Chart title per feature, same order as above.
titles = [
    'Biểu đồ thể hiện giá nhà trung bình theo số tầng nhà',              # floor_number
    'Biểu đồ giá nhà trung bình theo số phòng ngủ',                      # bedroom_number
    'Biểu đồ giá nhà trung bình theo phòng ăn',                          # is_dinning_room
    'Biểu đồ giá nhà trung bình có và không có bếp',                     # is_kitchen
    'Giá nhà trung bình với sân thượng',                                 # is_terrace
    'Giá nhà trung bình với chỗ để xe',                                  # is_car_pack
    'Biểu đồ thể hiện giá nhà trung bình theo loại bất động sản',        # type
    'Biểu đồ thể hiện giá nhà trung bình theo hướng nhà',                # direction
    'Biểu đồ thể hiện giá nhà trung bình theo thành phố',                # city
    'Biểu đồ thể hiện giá nhà trung bình theo quận/huyện bất động sản',  # district
]

# The three lists are consumed by position, so they must stay the same length.
assert len(labels) == len(titles) == len(name_of_features), \
    "labels, titles and name_of_features must have one entry per feature"

In [None]:
import random

# The named-colour table does not depend on the feature, so build it once.
all_colors = list(plt.cm.colors.cnames.keys())

# One bar chart per feature: x = distinct values, bar height = mean price.
# `labels` / `titles` are looked up by the feature's position in the list.
for position, feature in enumerate(visualized_features):
    mean_prices = feature_unique_value[feature]
    categories = list(feature_unique_fields[feature])
    N = len(mean_prices)
    ind = np.arange(N)

    # Re-seeding on every pass makes each chart draw the same colour sequence.
    random.seed(100)
    c = random.choices(all_colors, k=N)

    fig, ax = plt.subplots(figsize=(10, 7))
    ax.bar(ind, mean_prices, color=c)

    # Print each bar's value (rounded to two decimals) just above the bar.
    for x_pos, value in zip(ind, mean_prices):
        ax.text(x_pos, value, float(round(value, 2)),
                horizontalalignment='center', verticalalignment='bottom',
                fontdict={'fontweight': 500, 'size': 12})

    # Rotate tick labels when there are many categories or the labels are long.
    # default=0 keeps a feature with no values from raising on max().
    max_label_length = max((len(str(category)) for category in categories), default=0)
    ax.set_xticks(ind)
    if N > 5 or (N * max_label_length > 30):
        ax.set_xticklabels(categories, rotation=45, ha='right')
    else:
        ax.set_xticklabels(categories)  # No rotation

    ax.set_xlabel(labels[position])
    ax.set_ylabel("Giá (tỷ đồng)")
    ax.set_title(titles[position], fontsize=22)
    fig.tight_layout()

plt.show()

# Preprocessing: prepare the data for prediction

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Numeric predictors: every column that is neither object nor bool, minus the target.
features_numerical = df.select_dtypes(exclude=['object', 'bool']).copy()
numerical_cols = list(features_numerical.columns)
numerical_cols.remove('price')  # raises ValueError if 'price' is not among the numeric columns
numerical_cols

In [None]:
# Categorical predictors: the object- and bool-typed columns.
features_categorical = df.select_dtypes(include=['object', 'bool']).copy()
categorical_cols = list(features_categorical.columns)
categorical_cols

In [None]:
# Categorical columns are one-hot encoded into a dense array; categories not seen
# during fit are ignored at transform time rather than raising.
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Numeric columns are standardised.
numerical_transformer = StandardScaler()

# Any column that is in neither list is passed through untouched.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ],
    remainder='passthrough',
)


In [None]:
# Separate the target from the predictors of each labelled split.
# NOTE: this cell rebinds df_train / df_val without the 'price' column, so running
# it a second time without reloading the data raises KeyError on df_train['price'].
y_train = df_train['price'].copy()
df_train = df_train.drop(columns='price')

y_val = df_val['price'].copy()
df_val = df_val.drop(columns='price')

In [None]:
# Show both targets as NumPy arrays. Only the last expression of a cell is
# displayed, so the two calls are combined into one tuple. Nothing is assigned:
# y_train / y_val stay pandas Series.
y_train.to_numpy(), y_val.to_numpy()

In [None]:
# Sanity check: number of target values in each split.
(y_train.shape, y_val.shape)

In [None]:
# Fit the preprocessing on the training rows only; validation and test rows are
# transformed with the already-fitted encoder/scaler.
X_train = preprocessor.fit_transform(df_train)
X_val, X_test = (preprocessor.transform(frame) for frame in (df_val, df_test))

In [None]:
from matplotlib import pyplot as plt

def plot_evaluate(y_true, y_pred, title='XGBoost'):
    """Scatter predicted against true values with a y = x reference line.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, in the same order as ``y_true``.
        title: Plot title. Defaults to 'XGBoost' (the previously hard-coded
            title) so existing calls render unchanged; pass the name of the
            model actually being evaluated.

    Returns:
        None. The figure is drawn and shown via ``plt.show()``.
    """
    plt.plot(y_true, y_pred, 'b.')
    # Red diagonal spanning the range of the true values: points on this line
    # are exact predictions.
    lo, hi = np.min(y_true), np.max(y_true)
    plt.plot([lo, hi], [lo, hi], 'r')
    plt.title(title)
    plt.xlabel('Reality')
    plt.ylabel('Predict')
    plt.show()

# Support Vector Regression

In [None]:
from sklearn.svm import SVR
# Fit a polynomial-kernel SVR with fixed hyper-parameters.
svr_model = SVR(kernel='poly', C=5.1, gamma=0.0975, coef0=2.3, epsilon=2.4798)
svr_model.fit(X_train, y_train)

# Predictions for the validation split (kept in y_pred_val) and the test split.
y_pred_val = svr_model.predict(X_val)
y_test_pred = svr_model.predict(X_test)

# Submission file: the test frame's index as Id, the prediction as TARGET.
svr_submit = pd.DataFrame({'Id': df_test.index, 'TARGET': y_test_pred})
svr_submit.to_csv('data/svr_model.csv', index=False)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import r2_score

# X_train, y_train, X_val and y_val are expected to be defined already.

C_range = range(1, 101)  # C = 1, 2, ..., 100
r2_scores = []

best_score = -float('inf')
best_params = {}

# Fit one RBF-kernel SVR per C and score it on the validation split.
for C_value in C_range:
    model = SVR(kernel='rbf', C=C_value)
    model.fit(X_train, y_train)
    y_pred_val = model.predict(X_val)
    score = r2_score(y_val, y_pred_val)
    r2_scores.append(score)
    print(f"C = {C_value}, R² = {score:.4f}")

    # Strict comparison: on a tie the earlier (smaller) C is kept.
    if score > best_score:
        best_score, best_params = score, {'C': C_value}

print("\nBest parameters with highest R² score:")
print(f"C = {best_params['C']}, R² = {best_score:.4f}")

# Validation R² as a function of C.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(C_range, r2_scores)
ax.set_title('Sự thay đổi R² theo tham số C')
ax.set_xlabel('Giá trị C')
ax.set_ylabel('R² Score')
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

C_range = range(1, 101)
rmse_train_scores, rmse_val_scores = [], []
best_rmse = float('inf')
best_params = {}

# Fit one RBF-kernel SVR per C and record RMSE on both the validation and the
# training split.
for C_val in C_range:
    model = SVR(kernel='rbf', C=C_val)
    model.fit(X_train, y_train)

    # Validation error
    y_val_pred = model.predict(X_val)
    rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
    rmse_val_scores.append(rmse_val)

    # Training error
    y_train_pred = model.predict(X_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
    rmse_train_scores.append(rmse_train)

    print(f"C = {C_val}, RMSE Train = {rmse_train:.4f}, RMSE Val = {rmse_val:.4f}")

    # Model selection uses the validation error only.
    if rmse_val < best_rmse:
        best_rmse, best_params = rmse_val, {'C': C_val}

print("\nBest Validation RMSE:")
print(f"C = {best_params['C']}, RMSE = {best_rmse:.4f}")

# Train vs. validation RMSE as a function of C.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(C_range, rmse_train_scores, label='Train', color='blue')
ax.plot(C_range, rmse_val_scores, label='Validation', color='orange')
ax.set_title('RMSE trên tập Train và Validation theo tham số C (SVR với kernel RBF)')
ax.set_xlabel('Giá trị C')
ax.set_ylabel('RMSE')
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# 100 evenly spaced epsilon candidates between 0.01 and 5.
epsilon_values = np.linspace(0.01, 5, 100)
rmse_scores = []

best_rmse = float('inf')
best_params = {}

# Fit one RBF-kernel SVR per epsilon (other parameters left at their defaults)
# and score it on the validation split.
for eps in epsilon_values:
    model = SVR(kernel='rbf', epsilon=eps)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

    if rmse < best_rmse:
        best_rmse, best_params = rmse, {'epsilon': eps}

print(f"Best epsilon: {best_params['epsilon']:.4f} with RMSE = {best_rmse:.4f}")

# Validation RMSE as a function of epsilon.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(epsilon_values, rmse_scores)
ax.set_title('Đồ thị biểu diễn sự thay đổi RMSE theo tham số epsilon với kernel rbf')
ax.set_xlabel('Giá trị epsilon')
ax.set_ylabel('RMSE')
fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# 100 evenly spaced gamma candidates between 0.001 and 0.2.
gamma_values = np.linspace(0.001, 0.2, 100)
rmse_scores = []

best_rmse = float('inf')
best_gamma = None

# Fit one RBF-kernel SVR per gamma (other parameters left at their defaults)
# and score it on the validation split.
for gamma in gamma_values:
    model = SVR(kernel='rbf', gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

    if rmse < best_rmse:
        best_rmse, best_gamma = rmse, gamma

print(f"Best gamma: {best_gamma:.4f} with RMSE = {best_rmse:.4f}")

# Validation RMSE as a function of gamma.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(gamma_values, rmse_scores)
ax.set_title('Đồ thị biểu diễn sự thay đổi RMSE theo tham số gamma với kernel rbf')
ax.set_xlabel('Giá trị gamma')
ax.set_ylabel('RMSE')
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# 100 evenly spaced gamma candidates between 0.001 and 1.
gamma_values = np.linspace(0.001, 1, 100)
rmse_scores = []

best_rmse = float('inf')
best_gamma = None

# Same gamma sweep as an RBF-kernel SVR, but with C and epsilon pinned to
# fixed values instead of the defaults.
for gamma in gamma_values:
    model = SVR(kernel='rbf', C=51, epsilon=2.1774, gamma=gamma)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

    if rmse < best_rmse:
        best_rmse, best_gamma = rmse, gamma

print(f"Best gamma: {best_gamma:.4f} with RMSE = {best_rmse:.4f}")

# Validation RMSE as a function of gamma.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(gamma_values, rmse_scores)
ax.set_title('Sự thay đổi RMSE theo gamma')
ax.set_xlabel('Giá trị gamma')
ax.set_ylabel('RMSE')
fig.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# 10 evenly spaced gamma candidates between 1 and 5.
gamma_values = np.linspace(1, 5, 10)
rmse_scores = []

best_rmse = float('inf')
best_gamma = None

# Sweep gamma for a degree-2 polynomial-kernel SVR with C, epsilon and coef0 fixed.
for gamma in gamma_values:
    # The loop variable must be passed to the model. The previous hard-coded
    # gamma=0.0975 fitted the identical model on every iteration, which made the
    # curve flat and always reported the first candidate as "best".
    model = SVR(kernel='poly', C=5.1, epsilon=2.4798, gamma=gamma, coef0=2.3, degree=2)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

    # Strict comparison: on a tie the earlier (smaller) gamma is kept.
    if rmse < best_rmse:
        best_rmse = rmse
        best_gamma = gamma

print(f"Best gamma: {best_gamma:.4f} with RMSE = {best_rmse:.4f}")

# Validation RMSE as a function of gamma (title corrected: this sweep uses the
# poly kernel, not rbf).
plt.figure(figsize=(10, 5))
plt.plot(gamma_values, rmse_scores)
plt.title('Đồ thị biểu diễn sự thay đổi RMSE theo tham số gamma với kernel poly')
plt.xlabel('Giá trị gamma')
plt.ylabel('RMSE')
plt.tight_layout()
plt.show()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# --- Standardise the target ---
# The scaler is fitted on the training target only and reused for validation.
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_val_scaled = scaler_y.transform(y_val.to_numpy().reshape(-1, 1))

# --- Convert to float32 tensors ---
X_train_tensor, X_val_tensor, X_test_tensor = (
    torch.tensor(matrix, dtype=torch.float32) for matrix in (X_train, X_val, X_test)
)
y_train_tensor, y_val_tensor = (
    torch.tensor(column, dtype=torch.float32) for column in (y_train_scaled, y_val_scaled)
)

# --- Định nghĩa lớp mô hình MLP ---
class FlexibleMLP(nn.Module):
    """Fully connected regressor with a configurable stack of hidden layers.

    Each hidden layer is Linear -> ReLU -> Dropout; the network ends with a
    single-unit Linear output layer.

    Args:
        input_dim: Number of input features.
        hidden_sizes: Iterable with the width of each hidden layer, in order.
            An empty iterable yields a plain linear model.
        dropout: Dropout probability applied after every hidden activation.
            Defaults to 0.3, the value that was previously hard-coded.
    """

    def __init__(self, input_dim, hidden_sizes, dropout=0.3):
        super().__init__()
        layers = []
        in_features = input_dim
        for hidden_size in hidden_sizes:
            layers += [
                nn.Linear(in_features, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout),
            ]
            # The next layer consumes this layer's output width.
            in_features = hidden_size
        layers.append(nn.Linear(in_features, 1))  # output layer
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        return self.model(x)

# --- Hàm huấn luyện mô hình và trả về RMSE ---
def train_and_evaluate(hidden_sizes, epochs=200, patience=20, batch_size=64,
                       lr=0.005, momentum=0.9):
    """Train a FlexibleMLP with early stopping and return it with its validation RMSE.

    Reads the module-level tensors ``X_train_tensor``, ``y_train_tensor``,
    ``X_val_tensor``, ``y_val_tensor`` and the fitted ``scaler_y``.

    Args:
        hidden_sizes: Hidden-layer widths passed to ``FlexibleMLP``.
        epochs: Maximum number of passes over the training tensors.
        patience: Stop once this many consecutive epochs fail to improve the
            validation loss.
        batch_size: Number of rows per SGD step (rows are taken in order).
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        (model, rmse): the model restored to its best validation epoch and in
        eval mode, and the validation RMSE computed after undoing the target
        scaling with ``scaler_y``.
    """
    model = FlexibleMLP(X_train_tensor.shape[1], hidden_sizes)
    criterion = nn.MSELoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    best_val_loss = float('inf')
    best_state = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        for i in range(0, len(X_train_tensor), batch_size):
            X_batch = X_train_tensor[i:i + batch_size]
            y_batch = y_train_tensor[i:i + batch_size]
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val_tensor), y_val_tensor).item()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            # state_dict() hands back references to the live parameters, so the
            # snapshot has to be cloned; otherwise later optimizer steps would
            # overwrite the "best" weights and the restore below would load the
            # last epoch instead.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
        if patience_counter >= patience:
            break

    # Restore the best epoch. best_state stays None only if no epoch produced a
    # loss below +inf (e.g. NaN losses), in which case the current weights are kept.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()

    with torch.no_grad():
        y_val_pred_scaled = model(X_val_tensor)
        # Report RMSE in the original target units, not in standardised units.
        y_val_pred = scaler_y.inverse_transform(y_val_pred_scaled.numpy())
        y_val_true = scaler_y.inverse_transform(y_val_tensor.numpy())
        rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))

    return model, rmse

# --- Try 1 to 5 hidden layers; layer k (1-based) has 256 / 2^(k-1) units ---
results = []
best_rmse = float('inf')
best_model = None
best_test_pred = None

for n_layers in range(1, 6):
    hidden_sizes = [256 // (2 ** depth) for depth in range(n_layers)]
    print(f"\n🔁 Training MLP with {n_layers} hidden layer(s), sizes = {hidden_sizes}")
    model, rmse = train_and_evaluate(hidden_sizes)
    print(f"✅ RMSE: {rmse:.4f}")

    results.append((n_layers, hidden_sizes, rmse))

    # Only a strictly better configuration replaces the current best.
    if not rmse < best_rmse:
        continue

    best_rmse, best_model = rmse, model
    # Predict the test set with the new best model and map the output back to
    # the original target scale.
    with torch.no_grad():
        y_test_pred_scaled = best_model(X_test_tensor)
        y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled.numpy())

# --- Write the best model's test predictions ---
mlp_submit = pd.DataFrame({'Id': df_test.index, 'TARGET': y_test_pred.flatten()})
mlp_submit.to_csv('data/mlp_model_pytorch_best.csv', index=False)

# --- Print a summary of the results ---
print("\n📊 Tóm tắt RMSE theo số tầng:")
for n_layers, sizes, rmse in results:
    print(f"{n_layers} tầng ẩn {sizes} → RMSE: {rmse:.4f}")

print(f"\n🏆 Mô hình tốt nhất: {best_rmse:.4f}")
