In [2]:
import pandas as pd
# Load the real-estate transactions; the path is resolved relative to the
# current working directory.
csv_path = 'Real_Estate.csv'
data = pd.read_csv(csv_path)

# Peek at the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471


In [4]:
# Summarise the frame's structure: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414 entries, 0 to 413
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Transaction date                     414 non-null    object 
 1   House age                            414 non-null    float64
 2   Distance to the nearest MRT station  414 non-null    float64
 3   Number of convenience stores         414 non-null    int64  
 4   Latitude                             414 non-null    float64
 5   Longitude                            414 non-null    float64
 6   House price of unit area             414 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 22.8+ KB


In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows a minimum 'House price of unit area' of 0.0 —
# confirm those rows are genuine before modelling on them.
data.describe()

Unnamed: 0,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area
count,414.0,414.0,414.0,414.0,414.0,414.0
mean,18.405072,1064.468233,4.2657,24.973605,121.520268,29.102149
std,11.75767,1196.749385,2.880498,0.024178,0.026989,15.750935
min,0.0,23.38284,0.0,24.932075,121.473888,0.0
25%,9.9,289.3248,2.0,24.952422,121.496866,18.422493
50%,16.45,506.1144,5.0,24.974353,121.520912,30.39407
75%,30.375,1454.279,6.75,24.994947,121.544676,40.615184
max,42.7,6306.153,10.0,25.014578,121.565321,65.571716


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import datetime

# Parse the raw timestamp strings once, then reuse the parsed series to derive
# the calendar features instead of re-reading the column for each one.
transaction_ts = pd.to_datetime(data['Transaction date'])
data['Transaction date'] = transaction_ts
data['Transaction year'] = transaction_ts.dt.year
data['Transaction month'] = transaction_ts.dt.month

# Preview the frame with the two new columns appended on the right.
data.head(n=5)

Unnamed: 0,Transaction date,House age,Distance to the nearest MRT station,Number of convenience stores,Latitude,Longitude,House price of unit area,Transaction year,Transaction month
0,2012-09-02 16:42:30.519336,13.3,4082.015,8,25.007059,121.561694,6.488673,2012,9
1,2012-09-04 22:52:29.919544,35.5,274.0144,2,25.012148,121.54699,24.970725,2012,9
2,2012-09-05 01:10:52.349449,1.1,1978.671,10,25.00385,121.528336,26.694267,2012,9
3,2012-09-05 13:26:01.189083,22.2,1055.067,5,24.962887,121.482178,38.091638,2012,9
4,2012-09-06 08:29:47.910523,8.5,967.4,6,25.011037,121.479946,21.65471,2012,9


In [9]:
# Drop the raw timestamp now that the year/month features have been extracted.
# errors='ignore' keeps this cell re-runnable: `data` is rebound here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['Transaction date'], errors='ignore')

# Separate the predictors from the target variable.
X = data.drop('House price of unit area', axis=1)
y = data['House price of unit area']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to the test rows, so test-set statistics never leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity check: (rows, features) of the scaled training matrix.
X_train_scaled.shape


(331, 7)

In [11]:
# Inspect the standardised test matrix produced by the fitted scaler.
# NOTE(review): this renders the whole array; a slice such as X_test_scaled[:5]
# would keep the cell output short.
X_test_scaled

array([[-2.60865532e-01, -5.23525629e-01,  1.00053687e+00,
         6.57868263e-01,  1.62222330e+00,  7.24807300e-01,
        -5.06523035e-02],
       [-2.49461574e-02,  2.45603201e-01,  1.35895430e+00,
        -7.99567856e-01,  1.74069839e+00,  7.24807300e-01,
        -5.06523035e-02],
       [-1.06973196e+00, -6.72942804e-01, -1.14996770e+00,
         1.45816905e+00,  1.40793313e+00,  7.24807300e-01,
        -5.06523035e-02],
       [ 1.48325270e+00, -7.07913117e-01,  1.71737173e+00,
         1.55718472e+00,  1.72952866e+00,  7.24807300e-01,
         2.28779571e-01],
       [ 1.04511672e+00,  2.90886903e-01, -7.91550275e-01,
         1.12502065e+00,  3.61442874e-01,  7.24807300e-01,
        -5.06523035e-02],
       [-9.85475038e-01, -4.98697166e-01, -7.47154158e-02,
        -1.55786181e+00, -1.71263737e+00, -1.37967705e+00,
         1.06707519e+00],
       [ 1.60121239e+00,  3.19767702e-01, -1.14996770e+00,
        -4.22502481e-01,  1.94507237e-01,  7.24807300e-01,
        -8.8894792

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Candidate regressors, keyed by the label that will appear in the results table.
# The seeded estimators use a fixed random_state so repeated runs score the same.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

# Per-model evaluation metrics, filled in by the loop below.
results = {}

for name, model in models.items():
    # fit() returns the estimator itself, so training and test-set prediction chain.
    predictions = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    # Score the held-out predictions: mean absolute error first, then R².
    mae, r2 = (metric(y_test, predictions) for metric in (mean_absolute_error, r2_score))

    results[name] = {"MAE": mae, "R²": r2}

# Transpose so each model is a row and each metric a column, then show the table.
results_df = pd.DataFrame(results).transpose()
print(results_df)


                         MAE        R²
Linear Regression   9.748246  0.529615
Decision Tree      11.760342  0.204962
Random Forest       9.887601  0.509547
Gradient Boosting  10.000117  0.476071
