# Import

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import plotly.express as px
import missingno as msno

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import RidgeCV, LassoCV, HuberRegressor, RANSACRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# Load Data

In [4]:
# Read the cleaned training data and the imputed test data from the datasets folder
train_clean, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/train_clean.csv', '../datasets/test_imputed.csv')
)

In [5]:
# Check the training frame's dimensions as (rows, columns)
train_clean.shape

(14103, 164)

In [6]:
# Check the test frame's dimensions as (rows, columns)
test_df.shape

(2500, 163)

# Modelling

#### Set Features

#### Predicted with test_df

In [18]:
# Collect the station dummy columns; they are located purely by their
# 'station_name' name prefix.
dummy_station_columns = [col for col in train_clean.columns if col.startswith('station_name')]
feature_cols5 = ['bedrooms', 'baths', 'land_area', 'floor_area', 'nearby_bus_stops', 'nearby_stations',
                 'latitude', 'longitude', 'Nonthaburi', 'Samut Prakan', 'Townhouse', 'Detached House'] + dummy_station_columns

X = train_clean[feature_cols5]
y = train_clean['price']

# Hold out 20% of the rows as a dev (validation) set; the seed fixes the split
X_train, X_dev, y_train, y_dev = train_test_split(X, y, train_size=0.8, random_state=42)

# Expanded parameter grid for hyperparameter tuning
param_grid = {
    'max_depth': [None, 3, 5, 7, 10, 15, 20, 25],
    'min_samples_split': [2, 4, 8, 12, 16],
    'min_samples_leaf': [1, 2, 3, 5, 7, 10],
}

# Seed the tree as well as the split: scikit-learn permutes candidate features
# at every split, so an unseeded tree can resolve ties differently between runs
# and change both the selected parameters and the reported scores.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters from Grid Search
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Instantiate the best model (same seed as the searched estimator) and fit it
# on the training split only, so the dev split stays unseen.
best_dt = DecisionTreeRegressor(random_state=42, **best_params)
best_dt.fit(X_train, y_train)

# R-squared on the training split
train_score = best_dt.score(X_train, y_train)
print(f'Train R-squared Score: {train_score:.2f}')

# R-squared on the held-out dev split (reported under the "Test" label)
test_score = best_dt.score(X_dev, y_dev)
print(f'Test R-squared Score: {test_score:.2f}')

# Predictions on the dev split. Named y_dev_pred so they are not confused with
# predictions for the real test set.
y_dev_pred = best_dt.predict(X_dev)

# Score the predictions produced just above. The previous code referenced an
# undefined name (y_pred) here, which fails on a fresh kernel.
rmse = np.sqrt(mean_squared_error(y_dev, y_dev_pred))
r2 = r2_score(y_dev, y_dev_pred)

# Print the evaluation metrics
print(f'RMSE: {rmse:.2f}, R-squared: {r2:.2f}')

Best parameters: {'max_depth': 20, 'min_samples_leaf': 5, 'min_samples_split': 16}
Train R-squared Score: 0.88
Test R-squared Score: 0.77
RMSE: 1031504.91, R-squared: 0.77


In [19]:
# Score the tuned tree with 5-fold cross-validation (R-squared) over the full X, y
cv_scores = cross_val_score(estimator=best_dt, X=X, y=y, scoring='r2', cv=5)
mean_cv_r2 = np.mean(cv_scores)

print(f'Cross-Validated R-squared Scores: {cv_scores}')
print(f'Average Cross-Validated R-squared: {mean_cv_r2:.2f}')

Cross-Validated R-squared Scores: [0.76835486 0.76954568 0.7706113  0.76735195 0.76781805]
Average Cross-Validated R-squared: 0.77


In [20]:
# Re-inspect the training frame's dimensions as (rows, columns)
train_clean.shape

(14103, 164)

In [21]:
# Re-inspect the test frame's dimensions as (rows, columns)
test_df.shape

(2500, 163)

# Prepare to submit

In [22]:
# Build the test feature matrix from the same columns the model was trained on;
# any missing values left in those columns are replaced with 0.
X_test = test_df[feature_cols5].fillna(0)

# Predict prices for the test rows with the fitted tree
y_test_pred = best_dt.predict(X_test)

# Attach the predictions to the test frame as a 'price' column
test_df['price'] = y_test_pred

# Write only the id and predicted price columns to the submission file
test_df.to_csv('../datasets/submission.csv', columns=['id', 'price'], index=False)