In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Location of the housing CSV that the following cells read.
# Point this at your own copy of the data, e.g. 'C:/datasets/housing_data.csv'.
file_path = "/content/BostonHousing.csv"


In [None]:
# Read the CSV at `file_path` into `df`, the frame the following cells work on.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the bad path, then re-raise instead of calling exit(): inside a
    # notebook exit() shuts the kernel down (losing all state), and in a plain
    # script it would end the process with a success status despite the failure.
    print(f"Error: The file at {file_path} was not found.")
    raise
else:
    print("Dataset loaded successfully!")

    # Preview the first five rows of the loaded frame.
    print(df.head())


In [None]:
# The target column is assumed to be 'medv' (change it here if your dataset differs).
# A missing target raises instead of calling exit(), so a notebook run stops at this
# cell with a traceback while the kernel and the loaded data stay available.
if 'medv' not in df.columns:
    raise ValueError("Error: The target column 'medv' does not exist in the dataset.")

In [None]:
# List the columns available in the loaded frame.
print("Column names in the dataset:", df.columns, sep="\n")


In [None]:
# Separate the target vector (y) from the feature matrix (X):
# y is the 'medv' column, X is every other column.
y = df['medv']
X = df.drop('medv', axis="columns")


In [None]:
# Check for missing values and drop incomplete rows.
# Dropping rows from `df` alone does not change an X / y that were already derived
# from it, so both are rebuilt here from the cleaned frame; otherwise the NaN rows
# would still reach the train/test split and the model fitting.
if df.isnull().values.any():
    print("Warning: Missing values detected. Dropping missing values...")
    df = df.dropna()
    X = df.drop(columns=['medv'])
    y = df['medv']

In [None]:
# Sanity check: the feature matrix and the target vector must have one row per sample.
print("X shape:", X.shape)
print("y shape:", y.shape)
if X.shape[0] != y.shape[0]:
    # Raise rather than exit(): the run stops here with a traceback, and the
    # notebook kernel (with everything loaded so far) is kept alive.
    raise ValueError(
        f"Error: X and y have incompatible shapes. X has {X.shape[0]} rows, but y has {y.shape[0]} rows."
    )

In [None]:
# Hold out 40% of the rows as the test set; the fixed random_state makes the
# split repeatable from run to run.
split_options = {"test_size": 0.4, "random_state": 1}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Standardize the features (optional).
# The scaler is fitted on the training rows only, and that same fitted scaler is
# then applied to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Prepend a column of ones to each scaled feature matrix so that the first
# coefficient of the model acts as the intercept (bias) term.
X_train_b = np.hstack([np.ones((X_train_scaled.shape[0], 1)), X_train_scaled])
X_test_b = np.hstack([np.ones((X_test_scaled.shape[0], 1)), X_test_scaled])


In [None]:
# 1. Analytical Solution (least squares)
# Solve min ||X_train_b @ theta - y_train||^2 with a least-squares solver rather than
# forming inv(X^T X) explicitly. The explicit inverse loses precision when features
# are strongly correlated and raises LinAlgError when X^T X is singular; for a
# full-rank design matrix the solver returns the normal-equation solution.
theta_analytic = np.linalg.lstsq(
    X_train_b, np.asarray(y_train, dtype=float), rcond=None
)[0]
# Predictions on the held-out rows (intercept column already included in X_test_b).
y_pred_analytic = X_test_b @ theta_analytic


In [None]:
# Score the closed-form model on the held-out data and report its coefficients.
mse_analytic = mean_squared_error(y_true=y_test, y_pred=y_pred_analytic)
r2_analytic = r2_score(y_true=y_test, y_pred=y_pred_analytic)

print(
    "\nAnalytic Solution:",
    f"Theta (coefficients): {theta_analytic}",
    f"Mean Squared Error: {mse_analytic}",
    f"R²: {r2_analytic}",
    sep="\n",
)

In [None]:
# 2. Gradient Descent Implementation (Full-Batch)
# Start from all-zero coefficients and take `epochs` steps of size `alpha`, each
# step using the gradient of the mean squared error over the whole training set.
theta_gd = np.zeros(X_train_b.shape[1])
alpha = 0.01
epochs = 1000

for epoch in range(epochs):
    # Prediction error of the current coefficients on every training row.
    residuals = X_train_b @ theta_gd - y_train
    gradients = 2 / len(X_train_b) * X_train_b.T @ residuals
    theta_gd = theta_gd - alpha * gradients

# Predictions of the fitted coefficients on the held-out rows.
y_pred_gd = X_test_b @ theta_gd


In [None]:
# Score the gradient-descent model on the held-out data and report its coefficients.
mse_gd = mean_squared_error(y_true=y_test, y_pred=y_pred_gd)
r2_gd = r2_score(y_true=y_test, y_pred=y_pred_gd)

print(
    "\nGradient Descent Solution (Full-Batch):",
    f"Theta (coefficients): {theta_gd}",
    f"Mean Squared Error: {mse_gd}",
    f"R²: {r2_gd}",
    sep="\n",
)


In [None]:
# Scatter each model's predictions against the actual values; points lying on the
# black diagonal are predictions that match the actual value exactly.
for series_label, predictions in (
    ('Analytic', y_pred_analytic),
    ('Gradient Descent', y_pred_gd),
):
    plt.scatter(y_test, predictions, label=series_label, alpha=0.5)

# Reference line spanning the range of the actual values.
lowest, highest = min(y_test), max(y_test)
plt.plot([lowest, highest], [lowest, highest], color='black', linewidth=2)

plt.title('Comparison of Predicted Prices vs Actual Prices')
plt.xlabel('Actual Prices')
plt.ylabel('Predicted Prices')
plt.legend()
plt.show()