# Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import missingno as msno

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import RidgeCV, LassoCV, HuberRegressor, RANSACRegressor
import statsmodels.api as sm

# Load Data

In [3]:
train_clean = pd.read_csv('../datasets/train_clean.csv')
test_df = pd.read_csv('../datasets/test_imputed.csv')

In [4]:
# check columns
train_clean.shape

(14103, 164)

In [5]:
# check columns
test_df.shape

(2500, 163)

# Modelling

#### Set Feature

In [12]:
# Get dummy station columns
dummy_station_columns = [col for col in train_clean.columns if col.startswith('station_name')]

# Feature set
# Define the feature columns
feature_cols1 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 'floor_level', 'facilities', 
                 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
feature_cols2 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'nearby_stations', 'year_built', 'total_units', 'floor_level', 'facilities', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House']
feature_cols3 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'Nonthaburi', 'Samut Prakan'] + dummy_station_columns
feature_cols4 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_stations', 
                 'year_built', 'nearby_shops', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns 
feature_cols5 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops','nearby_stations', 
                'latitude', 'longitude', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns 
feature_cols6 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'nearby_stations', 'year_built', 'total_units', 'floor_level', 'facilities', 
                 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
feature_cols6 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'year_built', 'total_units', 'floor_level', 'facilities', 
                 'nearby_stations','Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
feature_cols7 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'nearby_stations','total_units', 'floor_level', 'facilities', 
                 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
# Define feature set
feature_sets = [
    (feature_cols1, 'Model 1'),
    (feature_cols2, 'Model 2'),
    (feature_cols3, 'Model 3'),
    (feature_cols4, 'Model 4'),
    (feature_cols5, 'Model 5'),
    (feature_cols6, 'Model 6'),
    (feature_cols7, 'Model 7')
]


In [14]:
from sklearn.tree import DecisionTreeRegressor  # Import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler

# Function to train and evaluate a model
def run_models(X, y):
    # Apply scale to improve convergence 
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)  # 80:20

    # Initialize models
    models = {
        'RidgeCV': RidgeCV(),
        'LassoCV': LassoCV(),
        'Huber': HuberRegressor(max_iter=2000),  # Increased max_iter 
        'RANSAC': RANSACRegressor(),
        'Decision Tree': DecisionTreeRegressor()  # Add Decision Tree model. 
    }
    
    # Store results
    results = {}

    # Fit models and calculate RMSE and R^2
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)  # Calculate R^2 score
        
        results[model_name] = {'RMSE': rmse, 'R^2': r2}  # Store both metrics

    return results

# Create variables to track model that hase the best RMSE 
best_rmse = float('inf')  # Start with infinity as the best RMSE
best_model = None         # To store the name of the best model


# Iterate through each feature set and run the models
for feature_cols, model_name in feature_sets:
    X = train_clean[feature_cols]  # Select the features based on the current feature set
    y = train_clean['price']        # Target variable

    # Run models for the feature set
    results = run_models(X, y)

    # Print the results 
    print(f"Results for {model_name}:")
    for model, metrics in results.items():
        print(f"{model}: RMSE = {metrics['RMSE']:,.2f}, R² = {metrics['R^2']:.2f}")
        
        # Check which model has the lowest RMSE
        if metrics['RMSE'] < best_rmse:
            best_rmse = metrics['RMSE']
            best_model = f"{model_name} - {model}"  # Include the feature set name

    print("\n")  

# Print the model with the lowest RMSE
print(f"The model with the lowest RMSE is: {best_model} with an RMSE of {best_rmse:,.2f}")


Results for Model 1:
RidgeCV: RMSE = 1,287,667.20, R² = 0.65
LassoCV: RMSE = 1,288,812.11, R² = 0.65
Huber: RMSE = 1,295,719.51, R² = 0.64
RANSAC: RMSE = 27,992,916,530,805,092,352.00, R² = -166626880022627845398331392.00
Decision Tree: RMSE = 1,269,712.77, R² = 0.66


Results for Model 2:
RidgeCV: RMSE = 1,410,370.63, R² = 0.58
LassoCV: RMSE = 1,410,377.27, R² = 0.58
Huber: RMSE = 1,419,004.59, R² = 0.57
RANSAC: RMSE = 1,609,639.70, R² = 0.45
Decision Tree: RMSE = 1,289,160.99, R² = 0.65


Results for Model 3:
RidgeCV: RMSE = 1,456,909.99, R² = 0.55
LassoCV: RMSE = 1,454,383.44, R² = 0.55
Huber: RMSE = 1,469,918.21, R² = 0.54
RANSAC: RMSE = 136,708,234,936,948,654,080.00, R² = -3974093826225038843394916352.00
Decision Tree: RMSE = 1,314,711.61, R² = 0.63


Results for Model 4:
RidgeCV: RMSE = 1,276,577.54, R² = 0.65
LassoCV: RMSE = 1,276,417.79, R² = 0.65
Huber: RMSE = 1,289,523.44, R² = 0.65
RANSAC: RMSE = 47,574,958,663,483,179,008.00, R² = -481288631563725076697186304.00
Decision T

In [11]:
train_clean.shape

(14103, 164)

In [12]:
test_df.shape

(2500, 163)