# Imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import plotly.express as px
import missingno as msno

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, HuberRegressor
import statsmodels.api as sm

# Load Data

In [4]:
# Read the cleaned training data and the imputed test data, in that order
train_clean, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train_clean.csv', '../datasets/test_imputed.csv')
)

In [5]:
# Show the training frame's dimensions as (rows, columns)
train_clean.shape

(14103, 164)

In [6]:
# Show the test frame's dimensions as (rows, columns)
test_df.shape

(2500, 163)

# Modelling

#### Define Feature Sets

In [9]:
# Keep only the training columns whose name begins with the station dummy prefix
dummy_station_columns = list(
    filter(lambda name: name.startswith('station_name_'), train_clean.columns)
)
# Number of station dummy columns found
len(dummy_station_columns)

139

In [10]:
# Find out why the two frames do not have the same number of columns
train_columns, test_columns = set(train_clean.columns), set(test_df.columns)

# Column names present only in the training frame
train_not_in_test = train_columns.difference(test_columns)

# Column names present only in the test frame
test_not_in_train = test_columns.difference(train_columns)

# Report both one-sided differences
print(
    f"In train but not in test: {train_not_in_test}\n\n"
    f"In test but not in train: {test_not_in_train}"
)


In train but not in test: {'price'}

In test but not in train: set()


In [11]:
# Get dummy station columns
# NOTE(review): this prefix has no trailing underscore, unlike the
# 'station_name_' prefix used when this list was first built. It is kept
# unchanged so the selected columns stay the same; confirm both prefixes
# select the same columns.
dummy_station_columns = [col for col in train_clean.columns if col.startswith('station_name')]

# Candidate feature sets.
# Each list names the predictor columns for one model; most of them also
# append every station dummy column.
feature_cols1 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 'floor_level', 'facilities', 
                 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
# Model 2 is the only set without the station dummy columns
feature_cols2 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'nearby_stations', 'year_built', 'total_units', 'floor_level', 'facilities', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House']
feature_cols3 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'Nonthaburi', 'Samut Prakan'] + dummy_station_columns
feature_cols4 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_stations', 
                 'year_built', 'nearby_shops', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns 
feature_cols5 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_stations', 
                 'year_built', 'latitude', 'longitude', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns 
# feature_cols6 used to be assigned twice in a row with the same columns in a
# different order; the first assignment was dead code because the second one
# overwrote it immediately. Only the effective (second) definition is kept,
# with its column order unchanged.
feature_cols6 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'year_built', 'total_units', 'floor_level', 'facilities', 
                 'nearby_stations','Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
feature_cols7 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_supermarkets', 
                 'nearby_stations','total_units', 'floor_level', 'facilities', 
                 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns
# Pair every feature list with the label printed when it is evaluated
feature_sets = [
    (feature_cols1, 'Model 1'),
    (feature_cols2, 'Model 2'),
    (feature_cols3, 'Model 3'),
    (feature_cols4, 'Model 4'),
    (feature_cols5, 'Model 5'),
    (feature_cols6, 'Model 6'),
    (feature_cols7, 'Model 7')
]


In [12]:
# Function to train and evaluate a model
def train_and_evaluate(train_data, feature_set, target, model, train_size=0.8, random_state=42):
    """Fit ``model`` on a random split of ``train_data`` and score the held-out rows.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame holding both the predictor columns and the target column.
    feature_set : iterable of str
        Names of the predictor columns. Any iterable is accepted; it is
        copied into a list before it is used to index ``train_data``.
    target : str
        Name of the target column.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so the caller's object is fitted after the call.
    train_size : float or int, default 0.8
        Forwarded unchanged to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    dict
        ``{'RMSE': <float>, 'R-squared': <float>}`` computed on the
        held-out (validation) split.

    Raises
    ------
    KeyError
        Raised by the frame lookup when a requested column is missing.
    """
    # Copy into a list: indexing a frame with a tuple is treated as a single
    # key rather than a selection of several columns.
    feature_cols = list(feature_set)

    # Define features and target variable
    X = train_data[feature_cols]  # Predictor variables
    y = train_data[target]  # Target variable

    # Split the dataset into training and validation sets
    X_train, X_dev, y_train, y_dev = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    # Fit the model on the training part only
    model.fit(X_train, y_train)

    # Predict the held-out rows
    y_pred = model.predict(X_dev)

    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_dev, y_pred))
    r2 = r2_score(y_dev, y_pred)

    return {
        'RMSE': rmse,
        'R-squared': r2
    }


# Score a plain linear regression on every candidate feature set
for feature_set, model_name in feature_sets:
    print(f"\nEvaluating {model_name}")

    # Fresh, unfitted estimator for each feature set
    lr = LinearRegression()

    # Hold-out metrics for this feature set, with 'price' as the target
    metrics = train_and_evaluate(
        train_data=train_clean,
        feature_set=feature_set,
        target='price',
        model=lr,
    )

    # Report both scores to four decimal places
    print(f"RMSE: {metrics['RMSE']:.4f}, R-squared: {metrics['R-squared']:.4f}")



Evaluating Model 1
RMSE: 1287542.5676, R-squared: 0.6475

Evaluating Model 2
RMSE: 1410255.7794, R-squared: 0.5771

Evaluating Model 3
RMSE: 1456995.1504, R-squared: 0.5486

Evaluating Model 4
RMSE: 1276346.6992, R-squared: 0.6536

Evaluating Model 5
RMSE: 1301603.6017, R-squared: 0.6397

Evaluating Model 6
RMSE: 1243351.2399, R-squared: 0.6713

Evaluating Model 7
RMSE: 1260312.9358, R-squared: 0.6622


In [17]:
# Show the training frame's (rows, columns) shape again after the model runs
train_clean.shape

(14103, 164)

In [19]:
# Show the test frame's (rows, columns) shape again after the model runs
test_df.shape

(2500, 163)