In [1]:
# Install the OpenMP runtime (libomp) with Homebrew.
# NOTE(review): presumably required so xgboost's native library can load on macOS - confirm.
!brew install libomp

To reinstall 19.1.7, run:
  brew reinstall libomp


In [2]:
# Prepend the lib directory of Homebrew's libomp to DYLD_LIBRARY_PATH.
# NOTE(review): a `!` command runs in a child shell, so this export most likely does not
# reach the notebook kernel or any later cell - confirm whether this cell is needed at all.
!export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH

In [3]:
# Persist the libomp library path for future zsh sessions.
# The grep guard (-q quiet, -x whole line, -F fixed string) makes the cell idempotent:
# the export line is appended only when ~/.zshrc does not already contain it, so
# re-running the notebook no longer piles up duplicate lines.
!grep -qxF 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' ~/.zshrc 2>/dev/null || echo 'export DYLD_LIBRARY_PATH=$(brew --prefix libomp)/lib:$DYLD_LIBRARY_PATH' >> ~/.zshrc
# `source ~/.zshrc` was removed: it ran in a child shell that exits immediately, so it
# could not change the kernel's environment. Launch Jupyter from a new terminal instead.

# Best: 0.15679

In [6]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Train an XGBoost regressor on the preprocessed data, predict the test set and
# write the predictions to ./data/output/predictions.csv.

# Switch to the project root by removing '/notebooks' from the working directory.
os.chdir(os.getcwd().replace('/notebooks', ''))

# Load the data
train_filepath = './data/preprocessed/train_preprocessed.csv'
test_filepath = './data/preprocessed/test_preprocessed.csv'

train_data = pd.read_csv(train_filepath)
test_data = pd.read_csv(test_filepath)

# Separate features and target
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Train / validation split (80 / 20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost model
model = xgb.XGBRegressor(
    n_estimators=50000,          # upper bound only; early stopping decides the real count
    learning_rate=0.001,         # very small step size, hence the large round budget
    max_depth=9,                 # maximum tree depth
    subsample=0.85,              # fraction of rows sampled per tree
    colsample_bytree=0.85,       # fraction of features sampled per tree
    reg_lambda=2.0,              # L2 regularization
    reg_alpha=0.5,               # L1 regularization
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=500,   # stop once validation RMSE has not improved for 500 rounds
    random_state=42
)

# Train with early stopping monitored on the validation split.
# Previously no stopping criterion was configured, so all 50000 rounds always ran.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

print("Training Completed!")
print(f"Best iteration: {model.best_iteration}, validation RMSE: {model.best_score:.5f}")

# Align test features with the training features (same columns, same order).
# A missing column raises a KeyError here instead of a less obvious error inside predict.
test_features = test_data[X.columns]

# Predict on the test set
test_predictions = model.predict(test_features)

# Read the two scaling values: second whitespace-separated token of line 1 and of line 2.
# NOTE(review): assumes these are the mean / std used to standardize SalePrice - confirm.
with open('data/preprocessed/scaling_params.txt', 'r') as f:
    lines = f.readlines()
mean = float(lines[0].split()[1])
std = float(lines[1].split()[1])

# Undo the standardization
test_predictions = test_predictions * std + mean

# Save the results
output_dir = './data/output'
os.makedirs(output_dir, exist_ok=True)
output_filepath = os.path.join(output_dir, 'predictions.csv')
test_ids = test_data['Id']  # reuse the frame already in memory instead of re-reading the CSV
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv(output_filepath, index=False)

print(f"Predictions saved to {output_filepath}")

[0]	validation_0-rmse:1.10251
[100]	validation_0-rmse:1.02930
[200]	validation_0-rmse:0.96324
[300]	validation_0-rmse:0.90376
[400]	validation_0-rmse:0.84967
[500]	validation_0-rmse:0.80063
[600]	validation_0-rmse:0.75661
[700]	validation_0-rmse:0.71683
[800]	validation_0-rmse:0.68072
[900]	validation_0-rmse:0.64749
[1000]	validation_0-rmse:0.61845
[1100]	validation_0-rmse:0.59161
[1200]	validation_0-rmse:0.56742
[1300]	validation_0-rmse:0.54577
[1400]	validation_0-rmse:0.52620
[1500]	validation_0-rmse:0.50877
[1600]	validation_0-rmse:0.49280
[1700]	validation_0-rmse:0.47824
[1800]	validation_0-rmse:0.46513
[1900]	validation_0-rmse:0.45348
[2000]	validation_0-rmse:0.44281
[2100]	validation_0-rmse:0.43327
[2200]	validation_0-rmse:0.42462
[2300]	validation_0-rmse:0.41653
[2400]	validation_0-rmse:0.40908
[2500]	validation_0-rmse:0.40228
[2600]	validation_0-rmse:0.39624
[2700]	validation_0-rmse:0.39084
[2800]	validation_0-rmse:0.38557
[2900]	validation_0-rmse:0.38102
[3000]	validation_0-rm

In [5]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Train an XGBoost regressor on the PCA-preprocessed data, predict the test set and
# write the predictions to ./data/output/predictions.csv.

# Switch to the project root by removing '/notebooks' from the working directory.
os.chdir(os.getcwd().replace('/notebooks', ''))

# Load the data
train_filepath = './data/preprocessed/train_preprocessed_pca.csv'
test_filepath = './data/preprocessed/test_preprocessed_pca.csv'

train_data = pd.read_csv(train_filepath)
test_data = pd.read_csv(test_filepath)

# Separate features and target
X = train_data.drop(columns=['SalePrice'])
y = train_data['SalePrice']

# Train / validation split (80 / 20)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# XGBoost model
model = xgb.XGBRegressor(
    n_estimators=50000,          # upper bound only; early stopping decides the real count
    learning_rate=0.03,          # step size
    max_depth=7,                 # maximum tree depth
    subsample=0.85,              # fraction of rows sampled per tree
    colsample_bytree=0.85,       # fraction of features sampled per tree
    reg_lambda=2.0,              # L2 regularization
    reg_alpha=0.5,               # L1 regularization
    objective='reg:squarederror',
    eval_metric='rmse',
    early_stopping_rounds=200,   # stop once validation RMSE has not improved for 200 rounds
    random_state=42
)

# Train with early stopping monitored on the validation split.
# Previously no stopping criterion was configured, so training kept running long after
# the validation RMSE had flattened out.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

print("Training Completed!")
print(f"Best iteration: {model.best_iteration}, validation RMSE: {model.best_score:.5f}")

# Align test features with train features (same columns, same order).
# A missing column raises a KeyError here instead of a less obvious error inside predict.
test_features = test_data[X.columns]

# Predict on the test set
test_predictions = model.predict(test_features)

# Read the two scaling values: second whitespace-separated token of line 1 and of line 2.
# NOTE(review): assumes these are the mean / std used to standardize SalePrice - confirm.
with open('data/preprocessed/scaling_params.txt', 'r') as f:
    lines = f.readlines()
mean = float(lines[0].split()[1])
std = float(lines[1].split()[1])

# Undo the standardization
test_predictions = test_predictions * std + mean

# Save the results
output_dir = './data/output'
os.makedirs(output_dir, exist_ok=True)
output_filepath = os.path.join(output_dir, 'predictions.csv')
test_ids = test_data['Id']  # reuse the frame already in memory instead of re-reading the CSV
submission = pd.DataFrame({'Id': test_ids, 'SalePrice': test_predictions})
submission.to_csv(output_filepath, index=False)

print(f"Predictions saved to {output_filepath}")

[0]	validation_0-rmse:1.07923
[100]	validation_0-rmse:0.45906
[200]	validation_0-rmse:0.41944
[300]	validation_0-rmse:0.41366
[400]	validation_0-rmse:0.41103
[500]	validation_0-rmse:0.40997
[600]	validation_0-rmse:0.40960
[700]	validation_0-rmse:0.40958
[800]	validation_0-rmse:0.40940
[900]	validation_0-rmse:0.40934
[1000]	validation_0-rmse:0.40936
[1100]	validation_0-rmse:0.40934
[1200]	validation_0-rmse:0.40936
[1300]	validation_0-rmse:0.40938
[1400]	validation_0-rmse:0.40934
[1500]	validation_0-rmse:0.40935
[1600]	validation_0-rmse:0.40933
[1700]	validation_0-rmse:0.40932
[1800]	validation_0-rmse:0.40932
[1900]	validation_0-rmse:0.40934
[2000]	validation_0-rmse:0.40932
[2100]	validation_0-rmse:0.40933
[2200]	validation_0-rmse:0.40932
[2300]	validation_0-rmse:0.40932
[2400]	validation_0-rmse:0.40932
[2500]	validation_0-rmse:0.40934
[2600]	validation_0-rmse:0.40932
[2700]	validation_0-rmse:0.40932
[2800]	validation_0-rmse:0.40932
[2900]	validation_0-rmse:0.40931
[3000]	validation_0-rm