In [61]:
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import matplotlib.pyplot as plt
import torch.nn.init as init
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
import torch
import pandas as pd
from tqdm import tqdm
from torchsummary import summary
import numpy as np

In [None]:
# Pick the compute device in order of preference: CUDA GPU, then Apple MPS,
# then plain CPU. The conditional expression is evaluated lazily, so the MPS
# check only runs when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

In [None]:
import os

# Run the notebook from the project root so the relative 'data/...' and
# './model/...' paths used below resolve.
# Only step up when the *current* directory is literally named 'notebooks':
# a blanket str.replace('/notebooks', '') would also mangle unrelated path
# components (e.g. '/home/u/notebooks_v2/x') and never matches on Windows,
# where the separator is a backslash.
_cwd = os.getcwd()
if os.path.basename(_cwd) == 'notebooks':
    os.chdir(os.path.dirname(_cwd))

In [None]:
# Load the preprocessed training table and mirror the whole frame (every
# column) as a float32 tensor, printing its shape as a quick sanity check.
train_csv_path = 'data/preprocessed/train_preprocessed.csv'
df = pd.read_csv(train_csv_path)
data_array = df.to_numpy()
data_tensor = torch.tensor(data_array, dtype=torch.float32)

print(data_tensor.shape)

In [None]:
# Model inputs: every column except the final two.
# NOTE(review): the previous comment said "all columns except the last", but the
# slice also skips the second-to-last column — confirm that this is intended.
feature_frame = df.iloc[:, :-2]
features = torch.tensor(feature_frame.values, dtype=torch.float32)

# Regression target: the final column, reshaped into an (N, 1) column vector.
target_series = df.iloc[:, -1]
labels = torch.tensor(target_series.values, dtype=torch.float32).view(-1, 1)

# Sanity-check the tensor sizes
print(features.shape, labels.shape)

In [None]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Fully connected regression network producing one value per sample.

    Layer widths are input_size -> 256 -> 128 -> 64 -> 32 -> 1. The first two
    hidden layers are followed by batch normalisation, ReLU and dropout (p=0.2);
    the next two use ReLU only; the output layer has no activation.

    Args:
        input_size: number of input features per sample (default 75).
    """

    @staticmethod
    def _regularized_block(in_features, out_features):
        """Return [Linear, BatchNorm1d, ReLU, Dropout(0.2)] for one hidden layer."""
        return [
            nn.Linear(in_features, out_features),
            nn.BatchNorm1d(out_features),
            nn.ReLU(),
            nn.Dropout(0.2),
        ]

    def __init__(self, input_size=75):
        super().__init__()
        # Layers are created in the same order as they are applied, so the
        # Sequential indices (model.0 ... model.12) stay stable.
        layers = []
        layers += self._regularized_block(input_size, 256)
        layers += self._regularized_block(256, 128)
        layers += [nn.Linear(128, 64), nn.ReLU()]
        layers += [nn.Linear(64, 32), nn.ReLU()]
        layers.append(nn.Linear(32, 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to x and return the (batch, 1) predictions."""
        return self.model(x)

In [None]:
# Mini-batch size shared by both loaders
batch_size = 32

# Fixed seed so the train/validation split and the training shuffle order are
# identical after every "Restart Kernel -> Run All".
split_seed = 42

# Split sizes: 80% train, remaining 20% validation
train_size = int(0.8 * len(features))
val_size = len(features) - train_size

# Randomly (but reproducibly) partition the full dataset
train_dataset, val_dataset = random_split(
    TensorDataset(features, labels),
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Training loader. The model uses BatchNorm1d, which raises in training mode
# when a batch holds a single sample, so the trailing batch is dropped only in
# the one case where it would contain exactly one sample.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=(train_size % batch_size == 1),
    generator=torch.Generator().manual_seed(split_seed),
)
# Validation loader (kept under the name `test_loader`, which later cells use)
test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Peek at the first batch of each loader to confirm the tensor shapes
for split_name, loader in (("Train", train_loader), ("Validation", test_loader)):
    for batch_features, batch_labels in loader:
        print(f"{split_name} Batch Shape:", batch_features.shape, batch_labels.shape)
        break  # only the first batch is needed

In [None]:
def evaluate(model, test_loader, criterion):
    """Return the mean per-sample loss of `model` over `test_loader`.

    The model is switched to eval mode (and left in it) and gradients are
    disabled. Each batch loss is weighted by its batch size, so a shorter final
    batch does not count as much as a full one; this assumes `criterion`
    returns the mean over the batch (the default reduction of nn.MSELoss).

    Args:
        model: the network to evaluate.
        test_loader: iterable yielding (features, labels) batches.
        criterion: loss callable taking (outputs, labels) and returning a
            scalar tensor.

    Returns:
        The sample-weighted average loss as a float, or float('inf') when the
        loader yields no samples.
    """
    model.eval()

    # Evaluate on whatever device the model already lives on instead of relying
    # on a notebook-level global; a parameter-free model leaves batches as-is.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    total_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for features, labels in test_loader:
            if model_device is not None:
                features, labels = features.to(model_device), labels.to(model_device)
            outputs = model(features)
            n_samples = labels.size(0)
            # Undo the per-batch mean so batches of different sizes combine correctly
            total_loss += criterion(outputs, labels).item() * n_samples
            total_samples += n_samples

    return total_loss / total_samples if total_samples > 0 else float('inf')

In [None]:
# Build the network with an input width taken from the loaded feature matrix
# rather than a hard-coded column count.
input_size = features.shape[1]
model = MLP(input_size=input_size)

# Print the layer summary while the model is still on the CPU.
# torchsummary defaults to device="cuda" and would feed CUDA inputs to the
# CPU-resident model whenever a GPU is available, so the device is pinned.
summary(model, input_size=(input_size,), device="cpu")

# Move the model to the selected device, then create the loss, optimizer and
# scheduler (the optimizer must be built from the already-moved parameters).
model.to(device)
criterion = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=0.0001, weight_decay=1e-5)
# Halve the learning rate after 5 epochs without improvement of the monitored loss
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

In [None]:
# Loss history
train_losses = []   # mean training loss per epoch
test_losses = []    # validation loss per epoch
batch_losses = []   # per-epoch list of every batch loss, for detailed plots

max_epoch = 200

# Fail early with a clear message instead of dividing by zero after the first epoch
if len(train_loader) == 0:
    raise ValueError("train_loader yields no batches; nothing to train on")

# Training loop
for epoch in range(max_epoch):
    model.train()
    batch_loss_list = []  # losses of the batches seen in this epoch

    # The batch variables are deliberately NOT called `features`/`labels`:
    # at notebook level that would overwrite the full-dataset tensors and break
    # any re-run of the cell that builds the datasets.
    for batch_features, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{max_epoch}"):
        # Move the batch to the training device
        batch_features = batch_features.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        outputs = model(batch_features)
        loss = criterion(outputs, batch_labels)

        loss.backward()
        optimizer.step()

        # Read the scalar once; every .item() call forces a device sync
        batch_loss_list.append(loss.item())

    # Mean of the batch losses for this epoch
    epoch_loss = sum(batch_loss_list) / len(batch_loss_list)
    train_losses.append(epoch_loss)
    batch_losses.append(batch_loss_list)

    # Validation pass; evaluate() puts the model in eval mode itself and
    # model.train() is called again at the top of the next epoch.
    test_loss = evaluate(model, test_loader, criterion)
    scheduler.step(test_loss)
    test_losses.append(test_loss)

    print(f"Epoch {epoch+1}/{max_epoch} - Train Loss: {epoch_loss:.4f}, Test Loss: {test_loss:.4f}")

In [None]:
# Plot the loss curves. Axis lengths come from the recorded histories rather
# than from max_epoch, so the cell still works if training stopped early.
fig, (ax_epoch, ax_batch) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: train and validation loss per epoch
ax_epoch.plot(range(1, len(train_losses) + 1), train_losses, label="Train Loss", color="blue")
ax_epoch.plot(range(1, len(test_losses) + 1), test_losses, label="Test Loss", color="red")
ax_epoch.set_xlabel("Epoch")
ax_epoch.set_ylabel("Loss")
ax_epoch.set_title("Training & Validation Loss")
ax_epoch.legend()
ax_epoch.grid()

# Right panel: per-batch loss for a few representative epochs (first, quarter,
# half, last). The set removes duplicates when only a few epochs were recorded,
# and a separate index name avoids clobbering the training loop's `epoch`.
num_recorded = len(batch_losses)
selected_epochs = sorted({0, num_recorded // 4, num_recorded // 2, num_recorded - 1})
for epoch_idx in selected_epochs:
    if 0 <= epoch_idx < num_recorded:
        ax_batch.plot(batch_losses[epoch_idx], label=f"Epoch {epoch_idx+1}")
ax_batch.set_xlabel("Batch")
ax_batch.set_ylabel("Loss")
ax_batch.set_title("Loss per Batch in Selected Epochs")
ax_batch.legend()
ax_batch.grid()

plt.tight_layout()
plt.show()

In [None]:
# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
model_path = './model/model.pth'
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The whole module object is saved (not just a state_dict) because the inference
# cell reloads it with torch.load(..., weights_only=False).
torch.save(model, model_path)

In [None]:
filepath = os.path.join('data', 'raw', 'test.csv')
data = pd.read_csv(filepath, index_col='Id')

# All columns are inspected; nothing is excluded here
columns_to_process = data.columns

# Missing-value count per column, computed once and reused below
missing_counts = data.isnull().sum()
lst_of_missing = [col for col in columns_to_process if missing_counts[col] > 0]

# Split the columns by dtype: 'object' columns are treated as categorical,
# everything else as numerical
lst_of_numerical = []
lst_of_categorical = []
for col in columns_to_process:
    bucket = lst_of_categorical if data[col].dtype == 'object' else lst_of_numerical
    bucket.append(col)

# Report the categorical columns that contain missing values
cat_missing = set(lst_of_categorical) & set(lst_of_missing)
print(f'There are {len(cat_missing)} categorical columns with missing values')
for col in cat_missing:
    n_missing = missing_counts[col]
    pct_missing = n_missing / len(data) * 100
    n_unique = len(data[col].unique())
    print(f'{col:<13}: {n_missing: <4} missing values - {pct_missing:.2f}% - {n_unique} unique values')

# Write the frame to the preprocessed folder. Note that `data` has not been
# modified in this cell: the file written here is a copy of the raw test set.
output_dir = os.path.join('data', 'preprocessed')
os.makedirs(output_dir, exist_ok=True)
output_filepath = os.path.join(output_dir, 'test_preprocessed.csv')
data.to_csv(output_filepath)
print(f"Preprocessed data saved to {output_filepath}")

In [None]:
# Start from the raw test set rather than from the file this cell overwrites.
# Reading and writing the same CSV made the cell fail on a second run (the
# dropped columns no longer exist); the raw file holds the same data that the
# previous cell copied into the preprocessed folder.
filepath = os.path.join('data', 'raw', 'test.csv')
data = pd.read_csv(filepath, index_col='Id')

# Fill missing numerical values
# LotFrontage: mean of the values below 300 (larger values are left out of the mean)
lot_frontage_mean = data.loc[data['LotFrontage'] < 300, 'LotFrontage'].mean()
data['LotFrontage'] = data['LotFrontage'].fillna(lot_frontage_mean)
# GarageYrBlt: interpolate between neighbouring rows (only NaNs are changed)
data['GarageYrBlt'] = data['GarageYrBlt'].interpolate()
data['MasVnrArea'] = data['MasVnrArea'].fillna(0)

# Fill / drop categorical columns
data['MasVnrType'] = data['MasVnrType'].fillna('None')
data = data.drop(columns=['MiscFeature', 'PoolQC', 'Fence', 'Alley'])

# Encode categorical columns as integer codes.
# NOTE(review): the encoders (and the scaler below) are fitted on the test set
# itself; the fitting of the training-set transforms is not visible in this
# notebook, so the codes/scales may not match what the model was trained on —
# confirm, and reuse the training-time encoders/scaler if they are available.
label_encoders = {}
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # kept for a potential inverse transform

# Standardize all columns
scaler = StandardScaler()
data_standardized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)

# Columns not handled above may still hold NaNs; StandardScaler passes them
# through, and a NaN feature yields a NaN prediction. In standardized space 0
# is the column mean, so filling with 0 is mean imputation.
remaining_nan = int(data_standardized.isna().sum().sum())
if remaining_nan:
    print(f"Imputing {remaining_nan} remaining missing values with the column mean")
data_standardized = data_standardized.fillna(0.0)

# Save preprocessed data
output_filepath = './data/preprocessed/test_preprocessed.csv'
data_standardized.to_csv(output_filepath)
print(f"Preprocessed data saved to {output_filepath}")

In [60]:
# Load preprocessed test data
test_filepath = './data/preprocessed/test_preprocessed.csv'
test_data = pd.read_csv(test_filepath, index_col='Id')

test_tensor = torch.tensor(test_data.values, dtype=torch.float32)

# Load trained model.
# SECURITY: weights_only=False unpickles arbitrary Python objects, so only load
# checkpoint files that were produced by this project.
model_path = './model/model.pth'
model = torch.load(model_path, map_location=device, weights_only=False)
model.eval()

# Perform inference without tracking gradients
with torch.no_grad():
    test_tensor = test_tensor.to(device)
    predictions = model(test_tensor).cpu().numpy()

# Load the target scaling parameters.
# NOTE(review): the file is written elsewhere; this assumes line 1 holds the
# mean and line 2 the std, each as the second whitespace-separated token.
scaling_params_filepath = './data/preprocessed/scaling_params.txt'

with open(scaling_params_filepath, 'r') as f:
    lines = f.readlines()
mean = float(lines[0].split()[1])
std = float(lines[1].split()[1])

# Reverse standardization
predictions = predictions * std + mean

# Non-finite predictions (e.g. from NaN input features) used to be silently
# turned into a price of 0 by np.nan_to_num. Report them and fall back to the
# training mean instead, which is a valid, least-informative estimate.
non_finite = ~np.isfinite(predictions)
if non_finite.any():
    print(f"Warning: {int(non_finite.sum())} non-finite predictions replaced with the training mean")
    predictions[non_finite] = mean

In [62]:
# The ids are the index of the test frame (it was read with index_col='Id')
ids = test_data.index

# Two-column submission table: Id and SalePrice (predictions as a 1-D array)
submission_columns = {'Id': ids, 'SalePrice': predictions.reshape(-1)}
df_predictions = pd.DataFrame(submission_columns)

# Write the CSV, creating the output folder if needed
output_dir = './data/output'
os.makedirs(output_dir, exist_ok=True)
output_filepath = os.path.join(output_dir, 'predictions.csv')
df_predictions.to_csv(output_filepath, index=False)

print(f"Predictions saved to {output_filepath}")

Predictions saved to ./data/output/predictions.csv
