In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Import data

df = pd.read_csv('real_estate.csv', index_col = 'No')

In [3]:
df.columns

Index(['X1 transaction date', 'X2 house age', 'X3 distance to MRT',
       'X4 number of stores', 'X5 latitude', 'X6 longitude',
       'Y house price of unit area'],
      dtype='object')

In [4]:
# remove whitespaces from column names

for column in df.columns:
    new_column = column.replace(' ', '_')
    df = df.rename(columns = {column: new_column})

In [5]:
# Separate the features and target variables
X = df[['X1_transaction_date', 'X2_house_age', 'X3_distance_to_MRT', 'X4_number_of_stores', 'X5_latitude', 'X6_longitude']]
y = df['Y_house_price_of_unit_area']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
# Initialize the Random Forest regressor
rf_regressor = RandomForestRegressor()

# Train the model using the training data
rf_regressor.fit(X_train, y_train)

In [7]:
# Predict the target variable for the test set
y_pred = rf_regressor.predict(X_test)

In [8]:
# Calculate the root mean squared error (RMSE)

rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 5.470402347309792


In [9]:
# Calculate the R-squared score

score = rf_regressor.score(X_test, y_test)
print("R-squared score:", score)

R-squared score: 0.8216181007724395


SAME THING WITH REMOVED OUTLIER

In [10]:
index_max = df[df['Y_house_price_of_unit_area'] == max(df['Y_house_price_of_unit_area'])].index

df2 = df.drop(index = index_max)

In [12]:
# Separate the features and target variables
X2 = df2[['X1_transaction_date', 'X2_house_age', 'X3_distance_to_MRT', 'X4_number_of_stores', 'X5_latitude', 'X6_longitude']]
y2 = df2['Y_house_price_of_unit_area']

# Split the data into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [13]:
# Initialize the Random Forest regressor
rf_regressor_2 = RandomForestRegressor()

# Train the model using the training data
rf_regressor_2.fit(X2_train, y2_train)

In [15]:
# Predict the target variable for the test set
y2_pred = rf_regressor_2.predict(X2_test)

In [16]:
# Calculate the root mean squared error (RMSE)

rmse_2 = mean_squared_error(y2_test, y2_pred, squared=False)
print("Root Mean Squared Error:", rmse_2)

Root Mean Squared Error: 7.004954724230848


In [17]:
# Calculate the R-squared score

score_2 = rf_regressor_2.score(X2_test, y2_test)
print("R-squared score:", score_2)

R-squared score: 0.6828858940430949


HYPERPARAMETER TUNING TEST

In [21]:
# Initialize the Random Forest regressor with different criterion
rf_regressor_3 = RandomForestRegressor(criterion = 'absolute_error')

# Train the model using the training data
rf_regressor_3.fit(X_train, y_train)

In [23]:
# Predict the target variable for the test set
y3_pred = rf_regressor_3.predict(X_test)

In [24]:
# Calculate the root mean squared error (RMSE)

rmse_3 = mean_squared_error(y_test, y3_pred, squared=False)
print("Root Mean Squared Error:", rmse_3)

Root Mean Squared Error: 5.7266497867532


In [28]:
# Calculate the R-squared score

score_3 = rf_regressor_3.score(X_test, y_test)
print("R-squared score:", score_3)

R-squared score: 0.8045149754256931


In [25]:
# Initialize the Random Forest regressor with different number of estimators
rf_regressor_4 = RandomForestRegressor(n_estimators = 200)

# Train the model using the training data
rf_regressor_4.fit(X_train, y_train)

In [26]:
# Predict the target variable for the test set
y4_pred = rf_regressor_4.predict(X_test)

In [27]:
# Calculate the root mean squared error (RMSE)

rmse_4 = mean_squared_error(y_test, y4_pred, squared=False)
print("Root Mean Squared Error:", rmse_4)

Root Mean Squared Error: 5.6471067403828625


In [29]:
# Calculate the R-squared score

score_4 = rf_regressor_4.score(X_test, y_test)
print("R-squared score:", score_4)

R-squared score: 0.8099078260492448
