In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Load Dataset

In [None]:

# Location of the raw Housing CSV (remote file, fetched over HTTP on every run)
data = (
    'https://raw.githubusercontent.com/hamant-jagwan/Machine_Learning_UPES_2nd_sem/refs/heads/main/Dataset/Housing.csv'
)

# Parse the CSV into a DataFrame, decoding the bytes as ISO-8859-1
df = pd.read_csv(
    data,
    encoding='ISO-8859-1',
)

In [5]:
# Show the frame transposed: one row per column of df, one column per record
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,535,536,537,538,539,540,541,542,543,544
price,13300000,12250000,12250000,12215000,11410000,10850000,10150000,10150000,9870000,9800000,...,2100000,1960000,1890000,1890000,1855000,1820000,1767150,1750000,1750000,1750000
area,7420,8960,9960,7500,7420,7500,8580,16200,8100,5750,...,3360,3420,1700,3649,2990,3000,2400,3620,2910,3850
bedrooms,4,4,3,4,4,3,4,5,4,3,...,2,5,3,2,2,2,3,2,3,3
bathrooms,2,4,2,2,1,3,3,3,1,2,...,1,1,1,1,1,1,1,1,1,1
stories,3,4,2,2,2,1,4,2,2,4,...,1,2,2,1,1,1,1,1,1,2
mainroad,yes,yes,yes,yes,yes,yes,yes,yes,yes,yes,...,yes,no,yes,yes,no,yes,no,yes,no,yes
guestroom,no,no,no,no,yes,no,no,no,yes,yes,...,no,no,no,no,no,no,no,no,no,no
basement,no,no,yes,yes,yes,yes,no,no,yes,no,...,no,no,no,no,no,yes,no,no,no,no
hotwaterheating,no,no,no,no,no,no,no,no,no,no,...,no,no,no,no,no,no,no,no,no,no
airconditioning,yes,yes,no,yes,yes,yes,yes,no,yes,yes,...,no,no,no,no,no,no,no,no,no,no


### Feature Encoding

In [3]:
# Convert the binary yes/no columns into 1/0 integers.
# Values other than "yes"/"no" are passed through unchanged, so re-running this
# cell on already-encoded data is a no-op. A plain dict-based .map() would turn
# every value that is not a key of the dict (including the 1/0 written by a
# previous run) into NaN.
categorical_cols = ["mainroad", "guestroom", "basement", "hotwaterheating", "airconditioning", "prefarea"]
yes_no_codes = {"yes": 1, "no": 0}
for col in categorical_cols:
    df[col] = df[col].map(lambda value: yes_no_codes.get(value, value))

In [4]:
# Replace the 'furnishingstatus' labels with integer codes.
# The fitted encoder is kept in `label_encoder` after this cell.
label_encoder = LabelEncoder()
label_encoder.fit(df["furnishingstatus"])
df["furnishingstatus"] = label_encoder.transform(df["furnishingstatus"])

In [5]:
# Target (y): the 'price' column; features (X): every remaining column
y = df["price"]
X = df.drop("price", axis=1)


In [6]:
# Standardize every feature column: learn the per-column statistics from X,
# then apply the transform to the same data
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

## Train-Test Split

In [7]:
# Split data into training and testing sets (80% train, 20% test).
# The split is made on the unscaled features so that the scaler can be fitted
# on the training rows only; fitting it on all rows would let the test set's
# mean/std leak into preprocessing. For a given random_state the row
# assignment depends only on the number of samples.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows, then apply that same transform to both
# splits. `scaler` is rebound so it matches the data the models are trained on.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## Linear Regression

In [8]:
# --- Baseline Model: Linear Regression ---
# fit() returns the estimator itself, so construction and training are chained
linear_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred_linear = linear_model.predict(X_test)

In [9]:
# Evaluate Linear Regression: apply each metric to the same (truth, prediction) pair
mae_linear, mse_linear, r2_linear = (
    metric(y_test, y_pred_linear)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"Linear Regression - MAE: {mae_linear:.2f}, MSE: {mse_linear:.2f}, R²: {r2_linear:.4f}")

Linear Regression - MAE: 979679.69, MSE: 1771751116594.04, R²: 0.6495


## Random Forest

In [10]:
# Random Forest Regression: 100 trees with a fixed random_state
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest Model (the three metrics are independent of each other)
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print(f"Random Forest - MAE: {mae_rf:.2f}, MSE: {mse_rf:.2f}, R²: {r2_rf:.4f}")


Random Forest - MAE: 1024279.18, MSE: 1970052920941.18, R²: 0.6102
