In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold


In [2]:

file_path = "/content/USA_Housing.csv"  # replace with your local file path
data = pd.read_csv(file_path)




In [3]:
X = data.drop('Price', axis=1).values
y = data['Price'].values


In [4]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
r2_scores = []
beta_list = []

In [6]:
for train_idx, test_idx in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]


    X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
    X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))


    beta = np.linalg.inv(X_train.T @ X_train) @ (X_train.T @ y_train)
    beta_list.append(beta)


    y_pred = X_test @ beta
    score = r2_score(y_test, y_pred)
    r2_scores.append(score)
    print(f"Fold R²: {score:.4f}")

Fold R²: 0.9180
Fold R²: 0.9146
Fold R²: 0.9116
Fold R²: 0.9193
Fold R²: 0.9244


In [11]:
best_beta = beta_list[np.argmax(r2_scores)]
X_train70, X_test30, y_train70, y_test30 = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42
)


In [10]:
from sklearn.model_selection import train_test_split



In [12]:
X_train70 = np.hstack((np.ones((X_train70.shape[0], 1)), X_train70))
X_test30 = np.hstack((np.ones((X_test30.shape[0], 1)), X_test30))

y_pred_final = X_test30 @ best_beta
final_r2 = r2_score(y_test30, y_pred_final)
print("\nFinal R² on 30% Test Set:", round(final_r2, 4))


Final R² on 30% Test Set: 0.9147


In [16]:
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(30/44), random_state=42)

# Add bias
# Ensure y is reshaped to (n_samples, 1)
y_train = y_train.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)


In [17]:
def gradient_descent(X, y, lr, iterations=1000):
    n_samples, n_features = X.shape
    beta = np.zeros((n_features, 1))

    for _ in range(iterations):
        y_pred = X @ beta
        gradient = (2/n_samples) * (X.T @ (y_pred - y))
        beta -= lr * gradient
    return beta

In [18]:
learning_rates = [0.001, 0.01, 0.1, 1]
results = {}

for lr in learning_rates:
    beta_gd = gradient_descent(X_train, y_train, lr)
    val_pred = X_val @ beta_gd
    test_pred = X_test @ beta_gd

    val_r2 = r2_score(y_val, val_pred)
    test_r2 = r2_score(y_test, test_pred)

    results[lr] = {"val_r2": val_r2, "test_r2": test_r2}
    print(f"LR={lr}: Validation R²={val_r2:.4f}, Test R²={test_r2:.4f}")

LR=0.001: Validation R²=-11.9301, Test R²=-11.8173
LR=0.01: Validation R²=-11.8249, Test R²=-11.7898
LR=0.1: Validation R²=-11.8249, Test R²=-11.7898
LR=1: Validation R²=-inf, Test R²=-inf


  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)
  numerator = xp.sum(weight * (y_true - y_pred) ** 2, axis=0)


In [19]:
best_lr = max(results, key=lambda k: results[k]["val_r2"])
print("\nBest Learning Rate:", best_lr)


Best Learning Rate: 0.1


In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [21]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors",
           "body_style", "drive_wheels", "engine_location", "wheel_base", "length", "width",
           "height", "curb_weight", "engine_type", "num_cylinders", "engine_size",
           "fuel_system", "bore", "stroke", "compression_ratio", "horsepower",
           "peak_rpm", "city_mpg", "highway_mpg", "price"]

data = pd.read_csv(url, names=columns, na_values="?")

In [23]:
for col in data.columns:
    if data[col].dtype == "object":
        # Fill categorical NaN with mode
        data[col] = data[col].fillna(data[col].mode()[0])
    else:
        # Fill numeric NaN with median
        data[col] = data[col].fillna(data[col].median())

# Drop rows where price is still missing
data = data.dropna(subset=["price"])
data["price"] = data["price"].astype(float)


In [24]:
from sklearn.preprocessing import LabelEncoder

# Convert words to numbers for num_doors and num_cylinders
word_to_num = {
    "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "eight": 8, "twelve": 12
}
data["num_doors"] = data["num_doors"].replace(word_to_num)
data["num_cylinders"] = data["num_cylinders"].replace(word_to_num)

# One-hot encoding for body_style, drive_wheels
data = pd.get_dummies(data, columns=["body_style", "drive_wheels"], drop_first=True)


  data["num_doors"] = data["num_doors"].replace(word_to_num)
  data["num_cylinders"] = data["num_cylinders"].replace(word_to_num)


In [25]:
for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Binary encoding for special conditions
data["fuel_system"] = data["fuel_system"].apply(lambda x: 1 if "pfi" in str(x) else 0)
data["engine_type"] = data["engine_type"].apply(lambda x: 1 if "ohc" in str(x) else 0)

print("Preprocessing complete. Data sample:")
print(data.head())

Preprocessing complete. Data sample:
   symboling  normalized_losses  make  fuel_type  aspiration  num_doors  \
0          3              115.0     0          1           0          2   
1          3              115.0     0          1           0          2   
2          1              115.0     0          1           0          2   
3          2              164.0     1          1           0          4   
4          2              164.0     1          1           0          4   

   engine_location  wheel_base  length  width  ...  peak_rpm  city_mpg  \
0                0        88.6   168.8   64.1  ...    5000.0        21   
1                0        88.6   168.8   64.1  ...    5000.0        21   
2                0        94.5   171.2   65.5  ...    5000.0        19   
3                0        99.8   176.6   66.2  ...    5500.0        24   
4                0        99.4   176.6   66.4  ...    5500.0        18   

   highway_mpg    price  body_style_hardtop  body_style_hatchback  