In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split

url = "USA_Housing.csv"   
data = pd.read_csv(url)

X = data.drop(columns=["Price"])   
y = data["Price"].values.reshape(-1, 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_beta = None
best_r2 = -np.inf

fold = 1
for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

    beta = np.linalg.inv(X_train_bias.T @ X_train_bias) @ X_train_bias.T @ y_train

    y_pred = X_test_bias @ beta

    r2 = r2_score(y_test, y_pred)
    print(f"Fold {fold} - R2 Score: {r2:.4f}")

    if r2 > best_r2:
        best_r2 = r2
        best_beta = beta

    fold += 1

print("\nBest R2 Score:", best_r2)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

y_pred_final = X_test_bias @ best_beta

final_r2 = r2_score(y_test, y_pred_final)
print("Final R2 Score on 30% test set:", final_r2)

Fold 1 - R2 Score: 0.9180
Fold 2 - R2 Score: 0.9146
Fold 3 - R2 Score: 0.9116
Fold 4 - R2 Score: 0.9193
Fold 5 - R2 Score: 0.9244

Best R2 Score: 0.9243869413350316
Final R2 Score on 30% test set: 0.9147458156636434


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

url = "USA_Housing.csv"  
data = pd.read_csv(url)

X = data.drop(columns=["Price"])
y = data["Price"].values.reshape(-1, 1)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.44, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(30/44), random_state=42)

X_train_bias = np.c_[np.ones((X_train.shape[0], 1)), X_train]
X_val_bias = np.c_[np.ones((X_val.shape[0], 1)), X_val]
X_test_bias = np.c_[np.ones((X_test.shape[0], 1)), X_test]

def gradient_descent(X, y, alpha, iterations):
    m, n = X.shape
    beta = np.zeros((n, 1))

    for i in range(iterations):
        gradients = (1/m) * X.T @ (X @ beta - y)
        beta = beta - alpha * gradients
    
    return beta

learning_rates = [0.001, 0.01, 0.1, 1]
best_beta = None
best_r2_val = -np.inf

for alpha in learning_rates:
    beta = gradient_descent(X_train_bias, y_train, alpha, iterations=1000)

    y_val_pred = X_val_bias @ beta
    y_test_pred = X_test_bias @ beta

    r2_val = r2_score(y_val, y_val_pred)
    r2_test = r2_score(y_test, y_test_pred)

    print(f"Learning Rate: {alpha}")
    print(f"Validation R2: {r2_val:.4f}, Test R2: {r2_test:.4f}\n")

    if r2_val > best_r2_val:
        best_r2_val = r2_val
        best_beta = beta

print("Best Validation R2:", best_r2_val)
print("Best Beta Coefficients:\n", best_beta)

Learning Rate: 0.001
Validation R2: -1.0428, Test R2: -0.9601

Learning Rate: 0.01
Validation R2: 0.9199, Test R2: 0.9134

Learning Rate: 0.1
Validation R2: 0.9200, Test R2: 0.9134

Learning Rate: 1
Validation R2: 0.9200, Test R2: 0.9134

Best Validation R2: 0.9199649194854793
Best Beta Coefficients:
 [[1232180.27200919]
 [ 230645.88389435]
 [ 165328.94019375]
 [ 120045.00851908]
 [   2945.02108903]
 [ 151375.22971285]]


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
columns = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
           "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
           "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
           "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]

data = pd.read_csv(url, names=columns)
data = data.replace("?", np.nan)


data = data.dropna(subset=["price"]).reset_index(drop=True)

for col in data.columns:
    if data[col].dtype == "object":
        data[col] = data[col].fillna(data[col].mode()[0])
    else:
        data[col] = pd.to_numeric(data[col], errors="coerce")
        data[col] = data[col].fillna(data[col].median())


num_map = {"two": 2, "three": 3, "four": 4, "five": 5,
           "six": 6, "eight": 8, "twelve": 12}

for col in ["num_doors", "num_cylinders"]:
    if col in data.columns:
        data[col] = data[col].astype(str).str.lower().replace(num_map)
        data[col] = pd.to_numeric(data[col], errors="coerce").fillna(0)


if "body_style" in data.columns:
    data = pd.get_dummies(data, columns=["body_style"], drop_first=True)
if "drive_wheels" in data.columns:
    data = pd.get_dummies(data, columns=["drive_wheels"], drop_first=True)


for col in ["make", "aspiration", "engine_location", "fuel_type"]:
    if col in data.columns:
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))


if "fuel_system" in data.columns:
    data["fuel_system"] = data["fuel_system"].astype(str).str.lower().apply(lambda x: 1 if "pfi" in x else 0)


if "engine_type" in data.columns:
    data["engine_type"] = data["engine_type"].astype(str).str.lower().apply(lambda x: 1 if "ohc" in x else 0)

X = data.drop(columns=["price"])
y = pd.to_numeric(data["price"], errors="coerce")

X = X.apply(lambda col: pd.to_numeric(col, errors="coerce")).fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Feature matrix shape:", X_scaled.shape)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
r2_original = r2_score(y_test, y_pred)
print(f"R² Score (Original features): {r2_original:.4f}")

pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)
r2_pca = r2_score(y_test, y_pred_pca)

print(f"R² Score (After PCA): {r2_pca:.4f}")

if r2_pca > r2_original:
    print("PCA improved performance.")
else:
    print("PCA did not improve performance.")


Feature matrix shape: (201, 29)
R² Score (Original features): 0.8680
R² Score (After PCA): 0.8596
PCA did not improve performance.


  data[col] = data[col].astype(str).str.lower().replace(num_map)
  data[col] = data[col].astype(str).str.lower().replace(num_map)
