In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [2]:
# Read the already-cleaned CSV (path relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='Singapore_cleaned.csv')
data

Unnamed: 0,floor_area_sqm,lease_commence_date,resale_price,remaining_lease_months,town_encoded,flat_type_encoded,block_encoded,street_name_encoded,storey_range_encoded,flat_model_encoded
0,31.0,1977,9000.0,184.861588,0,0,833,13,5,7
1,31.0,1977,6000.0,184.861588,0,0,833,13,2,7
2,31.0,1977,8000.0,184.861588,0,0,833,13,5,7
3,31.0,1977,6000.0,184.861588,0,0,833,13,4,7
4,73.0,1976,47200.0,184.861588,0,2,450,13,2,20
...,...,...,...,...,...,...,...,...,...,...
940734,112.0,2015,700000.0,1080.000000,26,4,953,572,4,9
940735,113.0,2017,720000.0,1102.000000,26,4,1539,575,5,9
940736,122.0,1988,670000.0,753.000000,26,4,2430,579,0,9
940737,148.0,1988,855500.0,754.000000,26,5,1121,562,2,15


In [3]:
# Feature matrix: every column of `data` except the prediction target
x = data.drop(columns=['resale_price'])
x

Unnamed: 0,floor_area_sqm,lease_commence_date,remaining_lease_months,town_encoded,flat_type_encoded,block_encoded,street_name_encoded,storey_range_encoded,flat_model_encoded
0,31.0,1977,184.861588,0,0,833,13,5,7
1,31.0,1977,184.861588,0,0,833,13,2,7
2,31.0,1977,184.861588,0,0,833,13,5,7
3,31.0,1977,184.861588,0,0,833,13,4,7
4,73.0,1976,184.861588,0,2,450,13,2,20
...,...,...,...,...,...,...,...,...,...
940734,112.0,2015,1080.000000,26,4,953,572,4,9
940735,113.0,2017,1102.000000,26,4,1539,575,5,9
940736,122.0,1988,753.000000,26,4,2430,579,0,9
940737,148.0,1988,754.000000,26,5,1121,562,2,15


In [4]:
# Target vector: the resale price column
y = data.loc[:, 'resale_price']
y

0           9000.0
1           6000.0
2           8000.0
3           6000.0
4          47200.0
            ...   
940734    700000.0
940735    720000.0
940736    670000.0
940737    855500.0
940738    865000.0
Name: resale_price, Length: 940739, dtype: float64

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [6]:
# Linear baseline: standardise the features, then fit a linear regression on the training split
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LinearRegression()),
    ]
)
pipeline.fit(x_train, y_train)

In [7]:
# Score the linear baseline on the held-out split
y_pred = pipeline.predict(x_test)
lr_mse = mean_squared_error(y_test, y_pred)
lr_r2 = r2_score(y_test, y_pred)
print(f"Linear Regression MSE: {lr_mse:.2f}")
print(f"Linear Regression R2: {lr_r2:.2f}")

Linear Regression MSE: 11353192361.66
Linear Regression R2: 0.63


In [None]:
# Train a Random Forest model on the training split.
# n_jobs=-1 lets scikit-learn build (and later query) the trees on all available
# cores instead of the single-threaded default; the seed passed as random_state
# is unchanged.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(x_train, y_train)

# Evaluate the Random Forest model on the held-out split
y_pred_rf = rf_model.predict(x_test)
print(f"Random Forest MSE: {mean_squared_error(y_test, y_pred_rf):.2f}")
print(f"Random Forest R2: {r2_score(y_test, y_pred_rf):.2f}")

# Persist the fitted Random Forest to disk.
# NOTE: it is saved unconditionally -- this cell does not compare it against the
# linear regression pipeline before writing the file.
joblib.dump(rf_model, 'resale_predictor.joblib')

Random Forest MSE: 6338533749.29
Random Forest R2: 0.79
