In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [3]:
class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        # Combine YrSold and MoSold into a datetime format
        X['Date'] = pd.to_datetime(X['YrSold'].astype(str) + X['MoSold'].astype(str).str.zfill(2), format='%Y%m')
        # Convert datetime to Unix timestamp in seconds
        X['Date'] = X['Date'].astype('int64') // 10**9
        # Drop the original 'MoSold' and 'YrSold' columns
        X.drop(['MoSold', 'YrSold'], axis=1, inplace=True) 
        return X

In [4]:
# Define a custom transformer for encoding categorical columns
class LEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, mappings):
        self.mappings = mappings

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for column_name, mapping_dict in self.mappings.items():
            if column_name in X.columns:
                # Handle null values by filling with 'NA'
                X[column_name] = X[column_name].fillna('NA')
                # Create the encoded column
                X[column_name + '_encoded'] = X[column_name].map(mapping_dict)
                # Drop the original column
                X = X.drop([column_name], axis=1)
        return X

In [5]:
# This custom OHEncoder ensure the return object to be dataframe
class OHEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, handle_unknown='ignore'):
        self.columns = columns
        self.handle_unknown = handle_unknown
        self.encoder = OneHotEncoder(handle_unknown=handle_unknown, sparse_output=False)

    def fit(self, X, y=None):
        # Fit the encoder on the specified columns
        if self.columns is None:
            self.columns = X.columns
        self.encoder.fit(X[self.columns])
        return self

    def transform(self, X):
        # Transform the specified columns
        X_transformed = X.copy()
        # Apply the encoder
        encoded_array = self.encoder.transform(X[self.columns])
        # Create DataFrame with appropriate column names
        encoded_df = pd.DataFrame(
            encoded_array, 
            columns=self.encoder.get_feature_names_out(self.columns),
            index=X.index
        )
        # Drop the original columns and concatenate the new DataFrame
        X_transformed = X_transformed.drop(columns=self.columns)
        X_transformed = pd.concat([X_transformed, encoded_df], axis=1)
        return X_transformed

In [6]:
import os

# Feature stripper 
class StripColumn(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_file = 'retained_columns.txt'

    def fit(self, X, y=None):
        # Ensure X is a DataFrame
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        X = X.copy()
        numeric_cols = X.select_dtypes(include=[int, float]).columns
        cols_to_keep = numeric_cols[X[numeric_cols].sum() >= 10]
        
        # Save column indices to a text file
        with open(self.columns_file, 'w') as f:
            for index in range(len(cols_to_keep)):
                f.write(f"{index}\n")
        
        # Save column names separately if needed for reference
        self.column_names = cols_to_keep.tolist()
        
        return self

    def transform(self, X, y=None):
        # Ensure X is a DataFrame
        if isinstance(X, np.ndarray):
            X = pd.DataFrame(X)
        
        X = X.copy()
        
        # Check if the columns file exists
        if os.path.exists(self.columns_file):
            # Load the indices to keep from the text file
            with open(self.columns_file, 'r') as f:
                indices_to_keep = [int(line.strip()) for line in f]
            
            # Use indices to select columns
            if hasattr(self, 'column_names'):
                # Use saved column names to select by indices
                cols_to_keep = [self.column_names[i] for i in indices_to_keep]
            else:
                # If names are not available, directly use indices
                cols_to_keep = X.columns[indices_to_keep]
            
            X = X[cols_to_keep]
        else:
            print("Column file not found. Transforming with all columns.")
        
        return X

In [7]:
# Log transform the SalePrice
class LogTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_name='SalePrice'):
        self.column_name = column_name

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.copy()
        # Check if the column exists in the DataFrame
        if self.column_name in X.columns:
            X[f'Log{self.column_name}'] = np.log(X[self.column_name])
            X = X.drop([self.column_name], axis=1)
        else:
            raise ValueError(f"Column {self.column_name} not found in the DataFrame")
        return X

In [8]:
class DropColumn(BaseEstimator, TransformerMixin):
    def __init__(self, columns: list):
        self.columns = columns

    def fit(self, X, y=None):
        # No fitting necessary for this transformer
        return self

    def transform(self, X, y=None):
        # Ensure X is a DataFrame
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X should be a pandas DataFrame")

        # Identify which columns to drop and which do not exist
        existing_columns = [col for col in self.columns if col in X.columns]
        non_existent_columns = [col for col in self.columns if col not in X.columns]

        # Print out which columns are not found
        if non_existent_columns:
            print(f"Columns not found in DataFrame: {non_existent_columns}")

        # Drop the existing columns
        X_transformed = X.drop(columns=existing_columns)

        return X_transformed

In [9]:
class CustomImputer(BaseEstimator, TransformerMixin):
    def __init__(self, strategy='median'):
        self.strategy = strategy
        self.imputer = SimpleImputer(strategy=strategy)

    def fit(self, X, y=None):
        self.imputer.fit(X)
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        # Transform and convert to DataFrame
        imputed_array = self.imputer.transform(X)
        return pd.DataFrame(imputed_array, columns=X.columns, index=X.index)
    
class CustomScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X)
        return self

    def transform(self, X, y=None):
        X_transformed = X.copy()
        # Transform and convert to DataFrame
        scaled_array = self.scaler.transform(X)
        return pd.DataFrame(scaled_array, columns=X.columns, index=X.index)

In [10]:
# Define mappings for categorical columns
categorical_mappings = {
    'PoolQC': {'NA': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Fence': {'NA': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4},
    'GarageCond': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageQual': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'FireplaceQu': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
    'KitchenQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'HeatingQC': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'BsmtFinType2': {'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType1': {'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtExposure': {'NA': 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtQual': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {'NA': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'ExterCond': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'ExterQual': {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'LandSlope': {'Sev': 0, 'Mod': 1, 'Gtl': 2},
    'LotShape': {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}
}

# Define categorical columns
categorical_columns = [
    'SaleCondition', 'SaleType', 'PavedDrive', 'Electrical', 'GarageType', 'CentralAir', 
    'MasVnrType', 'Foundation', 'Heating', 'MiscFeature', 'MSSubClass', 'MSZoning', 
    'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig', 'Neighborhood', 
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 
    'Exterior1st', 'Exterior2nd'
]

In [11]:
# Create the preprocessing pipeline
preprocessor = Pipeline(steps=[
    ('dropId', DropColumn(columns=['Id'])),
    ('date_converter', DateTransformer()),
    ('categorical_encoder', LEncoder(categorical_mappings)),
    ('onehot_encoder', OHEncoder(columns=categorical_columns)),
    ("ImputerMedian", CustomImputer(strategy='median')),
    ('strip_column', StripColumn()),
    ('standard_scaler', CustomScaler())
])

In [12]:
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [13]:
# y = df_train['SalePrice']
y = np.log(df_train['SalePrice'])
df_train = df_train.drop('SalePrice', axis=1)
combined_df = pd.concat([df_train, df_test])

In [14]:
train_size = df_train.shape[0]

# Preprocess the combined feature set | p = 'preprocessed'
p_train = preprocessor.fit_transform(combined_df)

# Get the unlabeled test feature set out
p_test = p_train.iloc[train_size:, :] 

# Get the labeled feature set
X = p_train.iloc[:train_size, :]

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=247)

In [16]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.metrics import mean_squared_error

In [17]:
# Initialize models
models = {
    'Random Forest Regressor': RandomForestRegressor(n_estimators=500, max_depth=2),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=500, max_depth=2),
    'XGBoost Regressor': XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=6, min_child_weight=1, gamma=0,
                                        subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror', eval_metric='rmse'),
    'LGBM Regressor': LGBMRegressor(n_estimators=900, learning_rate=0.1, max_depth=1, verbosity=-1)
}

# models = {
#     'Random Forest Regressor': RandomForestRegressor(n_estimators=500, max_depth=2),
#     'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=500, max_depth=2),
#     'XGBoost Regressor': XGBRegressor(learning_rate=0.1, max_depth=2, n_estimators=500),
#     'LGBM Regressor': LGBMRegressor(n_estimators=900, learning_rate=0.1, max_depth=1, verbosity=-1)
# }


def safe_log(x):
    # Ensure that x is a NumPy array for element-wise operations
    x = np.asarray(x)
    # Clip values to avoid log(0) and negative values
    x_clipped = np.clip(x, a_min=1e-10, a_max=None)
    return np.log(x_clipped)

# Calculate RMSE on the log scale
def calculate_rmse_log(y_true, y_pred):
    # Apply safe_log to avoid issues with log(0)
    y_true_log = safe_log(y_true)
    y_pred_log = safe_log(y_pred)
    
    # Check for NaN values
    if np.any(np.isnan(y_pred_log)):
        raise ValueError("Log transformation resulted in NaN values.")
    return np.sqrt(mean_squared_error(y_true_log, y_pred_log)) # Calculate RMSE

In [18]:
# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse_log = calculate_rmse_log(y_test, y_pred)
    results[name] = rmse_log
    print(f'{name} RMSE: {rmse_log:.8f}') 

Random Forest Regressor RMSE: 0.01660677
Gradient Boosting Regressor RMSE: 0.01088696
XGBoost Regressor RMSE: 0.01027249
LGBM Regressor RMSE: 0.01036735


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Define a custom scoring function for RMSE on log-transformed data
def rmse_log_transform(y_true, y_pred):
    # Calculate RMSE on the log-transformed scale
    return np.sqrt(mean_squared_error(np.log1p(y_true), np.log1p(y_pred)))

# Create a custom scorer
rmse_log_scorer = make_scorer(rmse_log_transform, greater_is_better=False)

# Parameter grid
param_grid = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.1, 0.2],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5],
    'gamma': [0, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

model = XGBRegressor(objective='reg:squarederror', eval_metric='rmse')

# Perform GridSearchCV with the custom scoring function
grid_search = GridSearchCV(model, param_grid, cv=5, scoring=rmse_log_scorer)
grid_search.fit(X, y)

# Output the best parameters
print("Best Parameters:", grid_search.best_params_)

KeyboardInterrupt: 

In [None]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
rmse_log = calculate_rmse_log(y_test, y_pred)
print(f'Ensemble Model RMSE: {rmse_log:.8f}')

In [195]:
# For ensemble models
ensemble_model = VotingRegressor(
    estimators=[
        # ('rf', RandomForestRegressor(n_estimators=500)),
        ('gb', GradientBoostingRegressor(n_estimators=500)),
        ('xgb', XGBRegressor(n_estimators=1000, learning_rate=0.01, max_depth=6, min_child_weight=1, gamma=0,
                                        subsample=0.8, colsample_bytree=0.8, objective='reg:squarederror', eval_metric='rmse')),
        ('lgbm', LGBMRegressor(n_estimators=500, learning_rate=0.1, max_depth=2, verbosity=-1))
    ]
)

ensemble_pipeline = Pipeline(steps=[
        ('model', ensemble_model)
    ])

ensemble_pipeline = ensemble_pipeline.fit(X_train, y_train)
y_pred = ensemble_pipeline.predict(X_test)
rmse_log = calculate_rmse_log(y_test, y_pred)
print(f'Ensemble Model RMSE: {rmse_log:.8f}')

Ensemble Model RMSE: 0.01008863


In [147]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [175]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [178]:
class HousePriceModel(nn.Module):
    def __init__(self, input_dim):
        super(HousePriceModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 16)
        self.fc7 = nn.Linear(16, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.relu(self.fc4(x))
        x = torch.relu(self.fc5(x))
        x = torch.relu(self.fc6(x))
        x = self.fc7(x)
        return x

model = HousePriceModel(X_train_tensor.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [179]:
def train_model(model, train_loader, criterion, optimizer, num_epochs=50):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for inputs, targets in train_loader:
            targets_log = torch.log(targets + 1)  # Apply log transformation
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets_log)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader)}')

train_model(model, train_loader, criterion, optimizer)

Epoch 1/50, Loss: 3.4910226887778233
Epoch 2/50, Loss: 0.3277937565979205
Epoch 3/50, Loss: 0.084050542037738
Epoch 4/50, Loss: 0.025240929846308734
Epoch 5/50, Loss: 0.013432219305908993
Epoch 6/50, Loss: 0.008623741763202767
Epoch 7/50, Loss: 0.006022345103127391
Epoch 8/50, Loss: 0.004446564413803189
Epoch 9/50, Loss: 0.003408696796548994
Epoch 10/50, Loss: 0.0026156002300252255
Epoch 11/50, Loss: 0.0020991578440819132
Epoch 12/50, Loss: 0.00175849802326411
Epoch 13/50, Loss: 0.0014099925603276414
Epoch 14/50, Loss: 0.0012027478200922672
Epoch 15/50, Loss: 0.0011398931107434787
Epoch 16/50, Loss: 0.0009477206438436712
Epoch 17/50, Loss: 0.0008236818738575829
Epoch 18/50, Loss: 0.000679960652058454
Epoch 19/50, Loss: 0.0005503415074934693
Epoch 20/50, Loss: 0.0004475177924369315
Epoch 21/50, Loss: 0.00041505947286312123
Epoch 22/50, Loss: 0.0003586512925961104
Epoch 23/50, Loss: 0.0003276999157510306
Epoch 24/50, Loss: 0.000296505961542655
Epoch 25/50, Loss: 0.000249366658911305
Epoc

In [180]:
def evaluate_model(model, test_loader):
    model.eval()
    with torch.no_grad():
        predictions = []
        targets = []
        for inputs, target in test_loader:
            output = model(inputs)
            predictions.append(output.numpy())
            targets.append(target.numpy())
        predictions = np.concatenate(predictions)
        targets = np.concatenate(targets)
        
        # Compute RMSE between log-transformed predictions and targets
        epsilon = 1e-8
        log_predictions = np.log(predictions + epsilon)  # Apply log transformation
        log_targets = np.log(targets + epsilon)
        mse = np.mean((log_predictions - log_targets) ** 2)
        rmse = np.sqrt(mse)
        print(f'Test RMSE: {rmse}')

evaluate_model(model, test_loader)

Test RMSE: 1.5558704137802124


In [163]:
from pytorch_tabnet.tab_model import TabNetRegressor

# Define the model
model = TabNetRegressor(
    n_d=8,
    n_a=8,
    n_steps=3,
    gamma=1.3,
    n_independent=2,
    n_shared=2,
    epsilon=1e-15,
    seed=42
)

# Train the model
model.fit(
    X_train=X_train_tensor.numpy(),
    y_train=y_train_tensor.numpy(),
    eval_set=[(X_test_tensor.numpy(), y_test_tensor.numpy())],
    batch_size=64,
    virtual_batch_size=16,
    num_workers=0,
    max_epochs=100,
    patience=10,  # Early stopping
    drop_last=False  # Drop last incomplete batch
)


epoch 0  | loss: 103.60258| val_0_mse: 63.58573913574219|  0:00:01s
epoch 1  | loss: 24.36324| val_0_mse: 23.579059600830078|  0:00:01s
epoch 2  | loss: 4.38717 | val_0_mse: 7.3284101486206055|  0:00:02s
epoch 3  | loss: 1.41206 | val_0_mse: 3.1465799808502197|  0:00:02s
epoch 4  | loss: 0.85694 | val_0_mse: 10.383910179138184|  0:00:03s
epoch 5  | loss: 0.43355 | val_0_mse: 4.817409992218018|  0:00:04s
epoch 6  | loss: 0.31852 | val_0_mse: 14.489990234375|  0:00:04s
epoch 7  | loss: 0.36035 | val_0_mse: 7.753880023956299|  0:00:05s
epoch 8  | loss: 0.20058 | val_0_mse: 5.5832600593566895|  0:00:06s
epoch 9  | loss: 0.17102 | val_0_mse: 4.623380184173584|  0:00:06s
epoch 10 | loss: 0.19933 | val_0_mse: 1.8931699991226196|  0:00:07s
epoch 11 | loss: 0.21299 | val_0_mse: 2.7609400749206543|  0:00:07s
epoch 12 | loss: 0.17574 | val_0_mse: 3.557490110397339|  0:00:08s
epoch 13 | loss: 0.16404 | val_0_mse: 1.771720051765442|  0:00:09s
epoch 14 | loss: 0.13178 | val_0_mse: 1.5937600135803223



In [168]:
from sklearn.metrics import mean_squared_error
import numpy as np

def evaluate_model(model, X_test_tensor, y_test_tensor):
    # Generate predictions
    y_pred = model.predict(X_test_tensor.numpy())
    
    # Convert tensors to numpy arrays for evaluation
    y_true = y_test_tensor.numpy()
    
    # Compute RMSE between log-transformed predictions and targets
    epsilon = 1e-8
    log_predictions = np.log(y_pred + epsilon)  # Apply log transformation
    log_targets = np.log(y_true + epsilon)
    mse = np.mean((log_predictions - log_targets) ** 2)
    rmse = np.sqrt(mse)
    print(f'Test RMSE: {rmse}')

evaluate_model(model, X_test_tensor, y_test_tensor)


Test RMSE: 0.05162693187594414
