In [2]:
import os, sys
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules or "COLAB_RELEASE_TAG" in os.environ

if IN_COLAB:
    # Mount Google Drive (only in Colab)
    from google.colab import drive
    drive.mount("/content/drive")
    ROOT = Path("/content/drive/MyDrive/Colab Notebooks/Final Project")
else:
    # Local dev: assume repo structure
    ROOT = Path.cwd().parents[0] if (Path.cwd().name == "Notebooks") else Path.cwd()

DATA_DIR = ROOT / "Data"
REPORTS_DIR = ROOT / "Reports"
ARTIFACTS_DIR = ROOT / "artifacts"

REPORTS_DIR.mkdir(exist_ok=True, parents=True)
ARTIFACTS_DIR.mkdir(exist_ok=True, parents=True)

# Example usage
# df = pd.read_csv(DATA_DIR / "real_estate.csv")


Mounted at /content/drive


In [3]:
#RUN CELL ONLY WHEN GOOGLE COLAB USED
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive/Colab Notebooks/Final Project/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Final Project


In [4]:
#Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.neighbors import KNeighborsRegressor # KNN for regression
from sklearn.model_selection import GridSearchCV ,cross_val_score# For hyperparameter tuning
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score # For model evaluation
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression

!pip install dmba
from dmba import regressionSummary, backward_elimination, AIC_score

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl.metadata (1.9 kB)
Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m94.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dmba
Successfully installed dmba-0.2.4
Colab environment detected.


In [5]:
# STEP 1: LOAD AND PREVIEW DATA
houseprice_df = pd.read_csv('Real estate.csv')
houseprice_df.head()

#Note:
#No: Transaction ID
#X1 transaction date: Date of the house purchase
#X2 house age: The age of the house in months
#X3 distance to the nearest MRT station: Distance to nearest MRT station in meters
#X4 number of convenience stores: Number of convenience stores near the house
#X5 latitude: Latitude of the house location
#X6 longitude: Longitude of the house location
#Y house price of unit area: House price per unit area

FileNotFoundError: [Errno 2] No such file or directory: 'Real estate.csv'

In [None]:
#STEP 2: DATA CLEANING AND PREPARATION
#Display dimensions of data frame
houseprice_df.shape

In [None]:
# Fixing inconsistent formatting
houseprice_df.columns = houseprice_df.columns.str.strip().str.lower().str.replace(" ", "_")

# Rename 'no' column to 'transaction_id'
houseprice_df.rename(columns={'no': 'transaction_id'}, inplace=True)

# Display the updated DataFrame
houseprice_df.head(20)


In [None]:
# Checking for missing values in each column
houseprice_df.isnull().sum()

#Note: No missing values found

In [None]:
# Checking for duplicate rows
print("\nDuplicate rows found:")
houseprice_df.duplicated().sum()

#Note: No duplicate records found

In [None]:
# Checking for data types
houseprice_df.dtypes

#Note: All column's datatypes are appropriate

In [None]:
# Checking Outliers and Data Distribution
# Display descriptive statistics to get an overview of the data
# distribution and identify potential outliers (e.g., min/max values far from quartiles).
print("\n--- Descriptive Statistics of Numerical Columns ---")
print(houseprice_df.describe())

#Note: "x3_distance_to_the_nearest_mrt_station" and "y_house_price_of_unit_area" variables seemingly have outliers based on their min and max
#values respectively. 'transaction_id' and 'x1_transaction_date' columns are ignored because the former is not a potential predictor of the house
#price variable and the latter is a potential predictor but has notable inconsistent and inaccuracte data. Hence, instead of this varible we will
#consider 'x2_house_age' variable because it is saying the same story as transaction date variable but is more accurate and consistent.

In [None]:
# Graphical representation of outliers to confirm outlier detection for "x3_distance_to_the_nearest_mrt_station" and "y_house_price_of_unit_area" variables
#And checking for potential outliers in other variables
columns_to_plot = ['x2_house_age','x3_distance_to_the_nearest_mrt_station','x4_number_of_convenience_stores','x5_latitude','x6_longitude', 'y_house_price_of_unit_area']
plt.figure(figsize=(15, 8))

# Create one boxplot per column
for i, col in enumerate(columns_to_plot):
    ax = plt.subplot(2, 3, i + 1)
    sns.boxplot(y=houseprice_df[col], color="skyblue", ax=ax)
    ax.set_title(f'Boxplot: {col}', fontsize=10)
    ax.set_ylabel('')

plt.tight_layout()
plt.show()

#Note: 'x3_distance_to_the_nearest_mrt_station','x5_latitude','x6_longitude', 'y_house_price_of_unit_area' variables are confirmed to have outliers

In [None]:
#Count the outliers in each of the columns
# Function to count outliers using IQR method
# Updated function to return outlier values
def get_outliers(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = series[(series < lower_bound) | (series > upper_bound)]
    return outliers

# Apply to selected columns
for col in ['x3_distance_to_the_nearest_mrt_station', 'x5_latitude', 'x6_longitude', 'y_house_price_of_unit_area']:
    outliers = get_outliers(houseprice_df[col])
    print(f"\nNumber of outliers in {col}: {outliers.count()}")
    print(f"Outlier values in {col}:\n{outliers.values}")

#Outliers Handling:
#'x5_latitude' and 'x6_longitude' columns have outliers but they are valid values for location of the houses. Removing or altering
#them can distort information and analysis. Hence, the outliers of these two variables will remain in the dataset without any changes.
#Other 2 outliers will be handled in the upcoming 2 code cells:
#The outliers in 'x3_distance_to_the_nearest_mrt_station' column will be handled through winsorizing(capping) to reduce extreme impact but keep the
#data points valid and the outliers in 'y_house_price_of_unit_area' column will be eliminated from dataset since the outliers are very few(3)in number

In [None]:
# Function to cap values above a specified upper percentile for 'x3_distance_to_the_nearest_mrt_station' column
def cap_outliers(series, upper_percentile=0.97):
    upper_bound = series.quantile(upper_percentile)
    return series.clip(upper=upper_bound)

# Apply capping
houseprice_df['x3_distance_to_the_nearest_mrt_station'] = cap_outliers(houseprice_df['x3_distance_to_the_nearest_mrt_station'])
print(f"{'x3_distance_to_the_nearest_mrt_station'}: capped at 97th percentile = {houseprice_df['x3_distance_to_the_nearest_mrt_station'].max()}")

In [None]:
#Remove records with outliers in 'y_house_price_of_unit_area' column
# Define outlier values to remove
outlier_values = [78.3, 117.5, 78.0]

# Remove rows where 'y_house_price_of_unit_area' has these values
houseprice_df = houseprice_df[~houseprice_df['y_house_price_of_unit_area'].isin(outlier_values)]

# Confirm removal
print("Remaining rows after removing outliers:", len(houseprice_df))

In [None]:
# STEP 3: DATA ANALYSIS FOR DATA UNDERSTANDING
#Display descriptive statistics of the variables
houseprice_df.describe()

#Inference: We can understand various aspects about each of the variables such as count, mean, std, min, 25% quartile, 50% quartile, 75% quartile and max values

In [None]:
# Plotting the boxplot for house price per unit area
mean = np.mean(houseprice_df['y_house_price_of_unit_area'])
q1 = np.percentile(houseprice_df['y_house_price_of_unit_area'], 25)
q3 = np.percentile(houseprice_df['y_house_price_of_unit_area'], 75)
min_val = np.min(houseprice_df['y_house_price_of_unit_area'])
max_val = np.max(houseprice_df['y_house_price_of_unit_area'])

plt.figure(figsize=(8, 4))
sns.boxplot(x=houseprice_df['y_house_price_of_unit_area'], color='skyblue',)
plt.title('Boxplot of House Price per Unit Area')
plt.xlabel('House Price per Unit Area')
plt.grid(True)
plt.text(mean, 0.05, f'Mean: {mean:.2f}', ha='center', va='top', color='red', fontweight='bold')
plt.text(q1, 0.05, f'Q1: {q1:.2f}', ha='center', va='bottom', color='green')
plt.text(q3, 0.05, f'Q3: {q3:.2f}', ha='center', va='bottom', color='blue')
plt.text(min_val, -0.05, f'Min: {min_val:.2f}', ha='center', va='top', color='purple')
plt.text(max_val, -0.05, f'Max: {max_val:.2f}', ha='center', va='top', color='brown')
plt.show()

#Inference:The boxplot of 411 observations shows a mean of 37.98.
#A wide range from 7.6 to 117.5 indicates significant variability, while the interquartile range of 18.9 highlights diverse mid-range prices.
#In this case, the outliers are anything above upper quartile(75). The existence of outliers suggest right skewness, revealing a few highly
#priced properties that create disparities in the housing market and affect overall price distribution.

In [None]:
# Generating and plotting the correlation matrix
plt.figure(figsize=(10, 8))
corr_matrix = houseprice_df.corr(numeric_only=True)

sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title("Correlation Heatmap of Numeric Variables")
plt.show()

#Inference: "transaction_id" and "x1_transaction_date" have low correlation with outcome variable "y_house_price_of_unit_area". Hence, we will NOT
# be considering these two variables during variable selection for any of the model building process. "x2_house_age" have moderate correlation with outcome
#variable while all the other potential predictors which are "x3_distance_to_the_nearest_mrt_station", "x4_number_of_convenience_stores",
#"x5_latitude" and "x6_longitude" have moderate to high correlation with outcome variable. Hence, the remaining variables will be considered during
#variable selection in the model building processes.

In [None]:
#Removing "transaction_id" and "x1_transaction_date" columns from houseprice_df
houseprice_df = houseprice_df.drop(columns=['transaction_id', 'x1_transaction_date'])
houseprice_df.head()

In [None]:
#STEP 4: PREDICTIVE MODELLING
#Model 1: Linear Regression
#Defining predictors and outcome variables for variable selection through backward elimination
predictors = ['x2_house_age', 'x3_distance_to_the_nearest_mrt_station', 'x4_number_of_convenience_stores', 'x5_latitude', 'x6_longitude']
outcome = 'y_house_price_of_unit_area'

#Partition data into predictors (x) and output (y)
X = houseprice_df[predictors]
y = houseprice_df[outcome]

#Split the data into training and validation datasets. Validation dataset size is 40% of the input datasize
train_X, valid_X, train_y, valid_y = train_test_split(X,y,test_size=0.4,random_state=1)

#Backward elimination for variable selection
def train_model(variables):
  model = LinearRegression()
  model.fit(train_X[variables], train_y)
  return model
def score_model(model, variables):
  return AIC_score(train_y, model.predict(train_X[variables]), model)
best_model, best_variables = backward_elimination(train_X.columns, train_model, score_model, verbose=True)
print(best_variables)

#The optimal parameters are chosen by backward elimination process for this linear regression model/algorithm. The model stats with all available
#features and it iteratively removes the least significant features using AIC score (Akaike Information Criterion). Once every remaining features
#are significant, the model stops the process and gives the list of "best variables" where all features are significant in predicting the outcome
#variable which is "house price of unit area".

In [None]:
#Train model on "best_variables"
best_model.fit(train_X[best_variables], train_y)

#Print coefficients of the selected variables(best_variables)
for var, coef in zip(best_variables, best_model.coef_):
  print(f"{var}: {coef}")

In [None]:
#Training Data: Predicting using predictor variables selected in backward elimination process
pred_y = best_model.predict(train_X[best_variables])
regressionSummary(train_y, pred_y)

result = pd.DataFrame({'Predicted': pred_y,'Actual': train_y,'Residual': train_y - pred_y})

In [None]:
# Plot Actual vs Predicted for training data
plt.figure()
sns.scatterplot(x=train_y, y=pred_y)
plt.plot([train_y.min(), train_y.max()], [train_y.min(), train_y.max()], 'r--')
plt.title(" Training Data: Actual vs Predicted (Linear Regression)")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#Plot the actual, predicted and residuals of training data
fig, ax = plt.subplots()
ax = train_y.hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Actual Training Data Distribution - Linear Regression")

fig, ax = plt.subplots()
ax = result['Predicted'].hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Predicted Training Data Distribution - Linear Regression")

fig, ax = plt.subplots()
ax = result['Residual'].hist()
ax.set_xlabel('Residuals')
plt.title("Residuals Distribution - Linear Regression")
plt.show()

In [None]:
#Validation Data: Predicting using predictor variables selected in backward elimination process
pred_y = best_model.predict(valid_X[best_variables])
regressionSummary(valid_y, pred_y)

result = pd.DataFrame({'Predicted': pred_y,'Actual': valid_y,'Residual': valid_y - pred_y})

In [None]:
# Plot Actual vs Predicted for validation data
plt.figure()
sns.scatterplot(x=valid_y, y=pred_y)
plt.plot([valid_y.min(), valid_y.max()], [valid_y.min(), valid_y.max()], 'r--')
plt.title(" Validation Data: Actual vs Predicted (Linear Regression)")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#Plot the actual, predicted and residuals of validation data
fig, ax = plt.subplots()
ax = valid_y.hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Actual Validation Data Distribution - Linear Regression")

fig, ax = plt.subplots()
ax = result['Predicted'].hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Predicted Validation Data Distribution - Linear Regression")

fig, ax = plt.subplots()
ax = result['Residual'].hist()
ax.set_xlabel('Residuals')
plt.title("Residuals Distribution - Linear Regression")
plt.show()

In [None]:
#Model 2: Random Forest Regressor
#Defining predictors and outcome variables for variable selection through RFECV
predictors = ['x2_house_age', 'x3_distance_to_the_nearest_mrt_station', 'x4_number_of_convenience_stores', 'x5_latitude', 'x6_longitude']
outcome = 'y_house_price_of_unit_area'

#Partition data into predictors (x) and output (y)
X = houseprice_df[predictors]
y = houseprice_df[outcome]

#Split the data into training and validation datasets. Validation dataset size is 40% of the input datasize
train_X, valid_X, train_y, valid_y = train_test_split(X,y,test_size=0.4,random_state=1)

# Define the best parameters for the Random Forest model
best_params = {
    "n_estimators": 150,
    "max_depth": 10,
    "min_samples_split": 8,
    "min_samples_leaf": 2,
    "max_features": 'sqrt'
}

# Initialize Random Forest Regressor with reasonable params
rf = RandomForestRegressor(**best_params, random_state=42)

# RFECV: Recursive feature elimination with cross-validation for variable selection
rfecv = RFECV(
    estimator=rf,
    step=1,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1
)

# Fit RFECV on training data
rfecv.fit(train_X, train_y)

# Get the variables selected through RFECV
selected_features = train_X.columns[rfecv.support_]
print("Optimal number of features:", rfecv.n_features_)
print("Selected features:", list(selected_features))

# The optimal features are chosen by the RFECV process using a Random Forest model.
# The model starts with all available features and iteratively removes the least important features based on feature importance.
# At each step, the model performance is evaluated using 5-fold cross-validation with mean squared error as the metric.
# This process continues until the subset of features that gives the best cross-validated performance is found.
# The final list of selected features are those that contribute most to accurately predicting the outcome variable,
# which is the "house price of unit area".


In [None]:
#Train model on "selected_features"
rf.fit(train_X[selected_features], train_y)

# Create a DataFrame for feature importance
feature_importance = pd.DataFrame({
    "Feature": houseprice_df[predictors].columns,
    "Importance": rf.feature_importances_
})

feature_importance = feature_importance.sort_values(by="Importance", ascending=False)

# Plot bar chart for feature importance
plt.figure(figsize=(10, 5))
plt.barh(feature_importance["Feature"], feature_importance["Importance"], color="skyblue")
plt.xlabel("Feature Importance")
plt.ylabel("Features")
plt.title("Feature Importance in Random Forest Regressor Model")
plt.gca().invert_yaxis()

In [None]:
#Training Data: Predicting using "selected_features"
pred_y = rf.predict(train_X[selected_features])
regressionSummary(train_y, pred_y)

result = pd.DataFrame({'Predicted': pred_y,'Actual': train_y,'Residual': train_y - pred_y})

In [None]:
# Plot Actual vs Predicted for training data
plt.figure()
sns.scatterplot(x=train_y, y=pred_y)
plt.plot([train_y.min(), train_y.max()], [train_y.min(), train_y.max()], 'r--')
plt.title(" Training Data: Actual vs Predicted (Random Forest Regressor)")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#Plot the actual, predicted and residuals of training data
fig, ax = plt.subplots()
ax = train_y.hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Actual Training Data Distribution - Random Forest Regressor")

fig, ax = plt.subplots()
ax = result['Predicted'].hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Predicted Training Data Distribution - Random Forest Regressor")

fig, ax = plt.subplots()
ax = result['Residual'].hist()
ax.set_xlabel('Residuals')
plt.title("Residuals Distribution - Random Forest Regressor")
plt.show()

In [None]:
#Validation Data: Predicting using "selected_features"
pred_y = rf.predict(valid_X[selected_features])
regressionSummary(valid_y, pred_y)

result = pd.DataFrame({'Predicted': pred_y,'Actual': valid_y,'Residual': valid_y - pred_y})

In [None]:
# Plot Actual vs Predicted for validation data
plt.figure()
sns.scatterplot(x=valid_y, y=pred_y)
plt.plot([valid_y.min(), valid_y.max()], [valid_y.min(), valid_y.max()], 'r--', label = "Regression Line")
plt.title(" Validation Data: Actual vs Predicted (Random Forest Regressor)")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
#Plot the actual, predicted and residuals of validation data
fig, ax = plt.subplots()
ax = valid_y.hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Actual Validation Data Distribution - Random Forest Regressor")

fig, ax = plt.subplots()
ax = result['Predicted'].hist()
ax.set_xlabel('House Price Per Unit Area')
plt.title("Predicted Validation Data Distribution - Random Forest Regressor")

fig, ax = plt.subplots()
ax = result['Residual'].hist()
ax.set_xlabel('Residuals')
plt.title("Residuals Distribution - Random Forest Regressor")
plt.show()

In [None]:
#Model 3: K-Nearest Neighbors (KNN)
# Define predictors and outcome
predictors = ['x2_house_age', 'x3_distance_to_the_nearest_mrt_station',
              'x4_number_of_convenience_stores', 'x5_latitude', 'x6_longitude']
outcome = 'y_house_price_of_unit_area'

# Split original data into train and validation sets
train_data, valid_data = train_test_split(houseprice_df, test_size=0.4, random_state=1)

# Fit scalers separately for predictors and outcome on training data only
X_scaler = StandardScaler()
y_scaler = StandardScaler()

# Fit on training data
X_scaler.fit(train_data[predictors])
y_scaler.fit(train_data[[outcome]])

# Normalize full dataset (but based on training scalers only)
house_norm = pd.DataFrame(
    X_scaler.transform(houseprice_df[predictors]),
    columns=[f'z_{col}' for col in predictors],
    index=houseprice_df.index
)

# Add normalized outcome
house_norm['z_' + outcome] = y_scaler.transform(houseprice_df[[outcome]])

# Retrieve normalized train and validation sets using original split indices
trainNorm = house_norm.loc[train_data.index]
validNorm = house_norm.loc[valid_data.index]

# Define normalized predictors and target
normalized_predictors = [f'z_{col}' for col in predictors]
normalized_outcome = f'z_{outcome}'

# Partition the normalized data
train_X = trainNorm[normalized_predictors]
train_y = trainNorm[normalized_outcome]
valid_X = validNorm[normalized_predictors]
valid_y = validNorm[normalized_outcome]

print("Train shape:", train_X.shape, train_y.shape)
print("Validation shape:", valid_X.shape, valid_y.shape)

In [None]:
# Variable selection using GridSearchCV

pipe = Pipeline([
    ('select', SelectKBest(score_func=f_regression)),
    ('knn', KNeighborsRegressor())
])

param_grid = {
    'select__k': [2, 3, 4, 5],            # Try selecting 2 to 5 top features
    'knn__n_neighbors': [2, 3, 5, 7, 10]  # Try different k values for KNN
}

grid = GridSearchCV(pipe, param_grid, cv=5, scoring='neg_mean_squared_error')
grid.fit(train_X, train_y)

print("Best number of features:", grid.best_params_['select__k'])
print("Best K value for KNN:", grid.best_params_['knn__n_neighbors'])
print("Best CV score (neg MSE):", grid.best_score_)

In [None]:
# Refit with best parameters
best_k = grid.best_params_['select__k']
best_model = grid.best_estimator_

# Save feature names before passing into the pipeline
feature_names = train_X.columns.tolist()

# Fit the selector on the full training set to view the selected features
selector = best_model.named_steps['select']
feature_mask = selector.get_support()
selected_features = [feature for feature, keep in zip(feature_names, feature_mask) if keep]

print("Selected features:", list(selected_features))

In [None]:
#Train model on "selected_features"
best_model.fit(train_X[selected_features], train_y)

In [None]:
#Make Predictions on Training Data using "best_variables"
train_pred = best_model.predict(train_X[selected_features])
train_residuals = train_y - train_pred

result = pd.DataFrame({'Predicted':train_pred ,'Actual': train_y,'Residual': train_y - train_pred})

In [None]:
# Denormalize predictions and actuals
train_pred_orig = y_scaler.inverse_transform(train_pred.reshape(-1, 1))
train_y_orig = y_scaler.inverse_transform(train_y.values.reshape(-1, 1))


In [None]:
#Evaluate the Model
regressionSummary(train_y_orig ,train_pred_orig)

In [None]:
# Visualization – Training Data
train_y_orig = train_y_orig.ravel()
train_pred_orig = train_pred_orig.ravel()

# Plotting actual vs predicted values
plt.figure()
sns.scatterplot(x=train_y_orig, y=train_pred_orig, alpha=0.6, color='skyblue', edgecolor='k')

# Reference line (perfect prediction line)
min_val = min(train_y_orig.min(), train_pred_orig.min())
max_val = max(train_y_orig.max(), train_pred_orig.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect Fit')

# Labels and title
plt.title("Training Data: Actual vs Predicted (KNN)", fontsize=14)
plt.xlabel("Actual House Price", fontsize=12)
plt.ylabel("Predicted House Price", fontsize=12)

# Additional plot features
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Recalculate residuals in original scale
train_residuals_orig = train_y_orig - train_pred_orig

# Create a result DataFrame (optional, for plotting consistency)
result = pd.DataFrame({
    'Actual': train_y_orig,
    'Predicted': train_pred_orig,
    'Residual': train_residuals_orig
})

# Plot Actual Training Data Distribution
plt.figure()
sns.histplot(result['Actual'], kde=True, color='skyblue')
plt.xlabel('House Price per Unit Area', fontsize=12)
plt.title("Actual Training Data Distribution - KNN", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# Plot Predicted Training Data Distribution
plt.figure()
sns.histplot(result['Predicted'], kde=True, color='skyblue')
plt.xlabel('House Price per Unit Area', fontsize=12)
plt.title("Predicted Training Data Distribution - KNN", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# Plot Training Residuals Distribution
plt.figure()
sns.histplot(result['Residual'], kde=True, color='skyblue')
plt.xlabel('Residual', fontsize=12)
plt.title("Training Residuals Distribution - KNN", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

In [None]:
#Make Predictions on Validation Data using "best_variables"
valid_pred = best_model.predict(valid_X[selected_features])
valid_residuals = valid_y - valid_pred

result1 = pd.DataFrame({'Predicted':valid_pred ,'Actual': valid_y,'Residual': valid_y - valid_pred})

In [None]:
# Denormalize predictions and actuals
valid_pred_orig = y_scaler.inverse_transform(valid_pred.reshape(-1, 1))
valid_y_orig = y_scaler.inverse_transform(valid_y.values.reshape(-1, 1))

In [None]:
#Evaluate the Model
regressionSummary(valid_y_orig, valid_pred_orig )

In [None]:
# Ensure both arrays are reshaped to 1D if needed
valid_y_orig = valid_y_orig.ravel()
valid_pred_orig = valid_pred_orig.ravel()
# Recalculate residuals in original scale
valid_residuals_orig = valid_y_orig - valid_pred_orig

# Create a result DataFrame for validation
result1 = pd.DataFrame({
    'Actual': valid_y_orig,
    'Predicted': valid_pred_orig,
    'Residual': valid_residuals_orig
})

# Plot 1: Actual vs Predicted for Validation Data
plt.figure()
sns.scatterplot(x=valid_y_orig, y=valid_pred_orig, alpha=0.6, color='skyblue', edgecolor='k')

# Perfect prediction line
min_val = min(valid_y_orig.min(), valid_pred_orig.min())
max_val = max(valid_y_orig.max(), valid_pred_orig.max())
plt.plot([min_val, max_val], [min_val, max_val], 'r--', label='Perfect Fit')

plt.title("Validation Data: Actual vs Predicted (KNN)", fontsize=14)
plt.xlabel("Actual House Price", fontsize=12)
plt.ylabel("Predicted House Price", fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Plot Actual Validation Data Distribution
plt.figure()
sns.histplot(result1['Actual'], kde=True, color='skyblue')
plt.xlabel('House Price per Unit Area', fontsize=12)
plt.title("Actual Validation Data Distribution - KNN", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# Plot Predicted Validation Data Distribution
plt.figure()
sns.histplot(result1['Predicted'], kde=True, color='skyblue')
plt.xlabel('House Price per Unit Area', fontsize=12)
plt.title("Predicted Validation Data Distribution - KNN", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

# Plot Residuals Distribution (Validation)
plt.figure()
sns.histplot(result1['Residual'], kde=True, color='skyblue')
plt.xlabel('Residual', fontsize=12)
plt.title("Validation Residuals Distribution - KNN", fontsize=14)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()