# Linear Regression

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers
import numpy as np
import pandas as pd #I normally always import Pandas. I am not sure why you are not requiring it, since we have to read in the file right
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt

2025-09-21 17:09:59.251939: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-09-21 17:09:59.252034: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-09-21 17:09:59.536060: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-09-21 17:10:02.062725: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## **Problem 1**
In this problem, we will explore the basic linear regression: $y_n=w_0 +w_1x_n$, where $n=1,\dots, N$ is the index of the data sample. Your task is to determine the appropriate values of $w_0$ and $w_1$ for the given data samples in Lab1_1.csv.

Requirements:
*   You are required to use the gradient descent algorithm to complete this problem.
*   You need to include the following four components in your lab report: (1) the code, (2) the values obtained for $w_0$ and $w_1$, (3) the obtained training error, and (4) the obtained testing error.

In [None]:
# Read the Problem 1 samples; the first two numeric columns are used as (x, y).
df = pd.read_csv("Lab1_1.csv")
xcol, ycol = df.select_dtypes(include=[np.number]).columns.tolist()[:2]

In [None]:
def split(df, test_ratio=0.2, seed=1):
    """Shuffle ``df`` reproducibly and cut it into train and test frames.

    The rows are permuted with ``DataFrame.sample`` seeded by ``seed``. The
    first ``int(len(df) * test_ratio)`` shuffled rows form the test set and the
    remaining rows form the training set. Both frames get a fresh 0..n-1 index.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    shuffled = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    cut = int(len(shuffled) * test_ratio)
    test_part = shuffled.iloc[:cut].reset_index(drop=True)
    train_part = shuffled.iloc[cut:].reset_index(drop=True)
    return train_part, test_part

# Hold out 20% of the (x, y) rows for testing; the fixed seed keeps the partition reproducible.
train, test = split(
    df.loc[:, [xcol, ycol]],
    test_ratio=0.2,
    seed=1,
)

In [None]:
# Design matrices [1, x]: the leading column of ones lets w[0] act as the intercept.
X_tr = np.column_stack((np.ones(len(train)), train[xcol].to_numpy()))
X_te = np.column_stack((np.ones(len(test)), test[xcol].to_numpy()))

# Targets as (N, 1) columns so they line up with X @ w when forming residuals.
y_tr = train[ycol].to_numpy()[:, np.newaxis]
y_te = test[ycol].to_numpy()[:, np.newaxis]

In [None]:
def mse(y, yhat):
    """Return the mean squared error between ``y`` and ``yhat`` as a Python float."""
    residual = y - yhat
    return float(np.mean(residual ** 2))

def gd(X, y, lr=0.01, epochs=5000):
    """Fit linear-regression weights by full-batch gradient descent on the MSE.

    Args:
        X: (N, D) design matrix; include a column of ones to learn an intercept.
        y: N targets, given either as an (N, 1) column or as a flat length-N array.
        lr: learning rate (step size) applied to every update.
        epochs: number of full-batch updates to perform.

    Returns:
        ``(w, history)``. ``w`` is the (D, 1) weight vector after the last
        update. ``history`` holds the training MSE sampled every
        ``max(1, epochs // 100)`` iterations plus the final iteration; each
        value is measured with the weights *before* that iteration's update.

    Raises:
        ValueError: if ``X`` is not 2-D, has no rows, or ``y`` does not hold
            exactly one target per row of ``X``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (N, D); got shape {X.shape}")
    N, D = X.shape
    if N == 0:
        raise ValueError("X has no rows; cannot run gradient descent on empty data")

    # Force a column vector. A flat (N,) target would otherwise broadcast against
    # the (N, 1) predictions into an (N, N) matrix instead of N residuals.
    y = np.asarray(y).reshape(-1, 1)
    if y.shape[0] != N:
        raise ValueError(f"y holds {y.shape[0]} values but X has {N} rows")

    w = np.zeros((D, 1))
    history = []
    step = max(1, epochs // 100)
    for t in range(epochs):
        residual = X @ w - y
        if t % step == 0 or t == epochs - 1:
            # Reuse the residual already needed for the gradient.
            history.append(float(np.mean(residual ** 2)))
        grad = (2.0 / N) * (X.T @ residual)
        w -= lr * grad
    return w, history

# Run gradient descent on the training split, then pull the two fitted
# coefficients out of the (2, 1) weight vector as plain floats.
w, hist = gd(X_tr, y_tr, epochs=5000, lr=0.01)
w0 = float(w[0, 0])
w1 = float(w[1, 0])

In [None]:
# Predictions and errors on both partitions using the fitted weights.
tr_pred = X_tr @ w
te_pred = X_te @ w
tr_mse = mse(y_tr, tr_pred)
te_mse = mse(y_te, te_pred)

# One line per reported quantity.
print(
    "Results",
    f"Columns used: x='{xcol}', y='{ycol}'",
    f"w0 (intercept): {w0:.6f}",
    f"w1 (slope):     {w1:.6f}",
    f"Training MSE:   {tr_mse:.6f}",
    f"Testing MSE:    {te_mse:.6f}",
    sep="\n",
)

In [None]:
# Training MSE at each recorded checkpoint.
plt.figure()
plt.plot(range(len(hist)), hist)
plt.title("GD Convergence (MSE)")
plt.xlabel("Checkpoint")
plt.ylabel("MSE")
plt.show()

# All samples together with the fitted line, evaluated across the observed x-range.
plt.figure()
plt.scatter(df[xcol], df[ycol], s=14, label="data")
xs = np.linspace(df[xcol].min(), df[xcol].max(), 300)
ys = w0 + w1 * xs
plt.plot(xs, ys, label="fit")
plt.title("Problem 1: Data & Learned Line")
plt.xlabel(xcol)
plt.ylabel(ycol)
plt.legend()
plt.show()



## **Problem 2**
In this problem, we will explore an extended linear regression: $y_n=w_0 +w_1x_n+w_2x_n^2$, where $n=1,\dots, N$ is the index of the data sample. Your task is to determine the appropriate values of $w_0$, $w_1$, and $w_2$ for the given data samples in Lab1_2.csv.

Requirements:
*   You are required to use the gradient descent algorithm to complete this problem.
*   You need to include the following four components in your lab report: (1) the code, (2) the values obtained for $w_0$, $w_1$, and $w_2$, (3) the obtained training error, and (4) the obtained testing error.

In [None]:
# Read the Problem 2 samples; the first two numeric columns are used as (x, y).
df2 = pd.read_csv("Lab1_2.csv")
xcol, ycol = df2.select_dtypes(include=[np.number]).columns.tolist()[:2]
print(f"Using columns -> x: '{xcol}', y: '{ycol}'")

In [None]:
def split(df, test_ratio=0.2, seed=2):
    """Return ``(train, test)`` frames from a seeded shuffle of ``df``.

    After shuffling with ``random_state=seed``, the leading
    ``int(len(df) * test_ratio)`` rows are the test set and the rest are the
    training set. Each returned frame is re-indexed from zero.
    """
    shuffled = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n_holdout = int(len(shuffled) * test_ratio)
    holdout = shuffled.iloc[:n_holdout].reset_index(drop=True)
    remainder = shuffled.iloc[n_holdout:].reset_index(drop=True)
    return remainder, holdout

# Hold out 20% of the (x, y) rows for testing; seed 2 fixes the partition.
train2, test2 = split(
    df2.loc[:, [xcol, ycol]],
    test_ratio=0.2,
    seed=2,
)

In [None]:
# Raw inputs for each partition; reused below to build the squared feature.
x_tr = train2[xcol].to_numpy()
x_te = test2[xcol].to_numpy()

# Quadratic design matrices [1, x, x^2].
X_tr = np.column_stack((np.ones(len(x_tr)), x_tr, np.square(x_tr)))
X_te = np.column_stack((np.ones(len(x_te)), x_te, np.square(x_te)))

# Targets as (N, 1) columns to match the shape of X @ w.
y_tr = train2[ycol].to_numpy()[:, np.newaxis]
y_te = test2[ycol].to_numpy()[:, np.newaxis]

In [None]:
def mse(y, yhat):
    """Mean of the squared element-wise differences ``y - yhat``, as a Python float."""
    squared_error = (y - yhat) ** 2
    return float(np.mean(squared_error))

def gd(X, y, lr=0.01, epochs=8000):
    """Fit linear-regression weights by full-batch gradient descent on the MSE.

    The features in ``X`` are used exactly as given; no scaling is applied here.

    Args:
        X: (N, D) design matrix; include a column of ones to learn an intercept.
        y: N targets, given either as an (N, 1) column or as a flat length-N array.
        lr: learning rate (step size) applied to every update.
        epochs: number of full-batch updates to perform.

    Returns:
        ``(w, history)``. ``w`` is the (D, 1) weight vector after the last
        update. ``history`` holds the training MSE sampled every
        ``max(1, epochs // 100)`` iterations plus the final iteration; each
        value is measured with the weights *before* that iteration's update.

    Raises:
        ValueError: if ``X`` is not 2-D, has no rows, or ``y`` does not hold
            exactly one target per row of ``X``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (N, D); got shape {X.shape}")
    N, D = X.shape
    if N == 0:
        raise ValueError("X has no rows; cannot run gradient descent on empty data")

    # Force a column vector. A flat (N,) target would otherwise broadcast against
    # the (N, 1) predictions into an (N, N) matrix instead of N residuals.
    y = np.asarray(y).reshape(-1, 1)
    if y.shape[0] != N:
        raise ValueError(f"y holds {y.shape[0]} values but X has {N} rows")

    w = np.zeros((D, 1))
    history = []
    step = max(1, epochs // 100)
    for t in range(epochs):
        residual = X @ w - y
        if t % step == 0 or t == epochs - 1:
            # Reuse the residual already needed for the gradient.
            history.append(float(np.mean(residual ** 2)))
        grad = (2.0 / N) * (X.T @ residual)
        w -= lr * grad
    return w, history

# Fit the quadratic model on the training split and pull the three
# coefficients out of the (3, 1) weight vector as plain floats.
w, hist = gd(X_tr, y_tr, epochs=8000, lr=0.01)
w0, w1, w2 = float(w[0, 0]), float(w[1, 0]), float(w[2, 0])

In [None]:
# Predictions and errors on both partitions using the fitted weights.
tr_pred = X_tr @ w
te_pred = X_te @ w
tr_mse = mse(y_tr, tr_pred)
te_mse = mse(y_te, te_pred)

# One line per reported quantity.
print(
    "=== Problem 2 Results ===",
    f"w0 (bias):  {w0:.6f}",
    f"w1 (x):     {w1:.6f}",
    f"w2 (x^2):   {w2:.6f}",
    f"Training MSE: {tr_mse:.6f}",
    f"Testing MSE:  {te_mse:.6f}",
    sep="\n",
)

In [None]:
# Training MSE at each recorded checkpoint.
plt.figure()
plt.plot(range(len(hist)), hist)
plt.title("Problem 2: GD Convergence (MSE)")
plt.xlabel("Checkpoint")
plt.ylabel("MSE")
plt.show()

# All samples together with the fitted parabola, evaluated across the observed x-range.
plt.figure()
plt.scatter(df2[xcol], df2[ycol], s=14, label="data")
xs = np.linspace(df2[xcol].min(), df2[xcol].max(), 400)
ys = w0 + w1 * xs + w2 * (xs ** 2)
plt.plot(xs, ys, label="quadratic fit")
plt.title("Problem 2: Data & Learned Quadratic")
plt.xlabel(xcol)
plt.ylabel(ycol)
plt.legend()
plt.show()

In [None]:
# Persist the fitted coefficients and both errors as a (parameter, value) table.
pd.DataFrame(
    {
        "parameter": ["w0", "w1", "w2", "train_MSE", "test_MSE"],
        "value": [w0, w1, w2, tr_mse, te_mse],
    }
).to_csv("problem2_summary.csv", index=False)

## **Problem 3**
In this problem, we will use extended linear regression, $y_n=w_0 +\sum_{k=1}^Kw_kx_{n,k}$, to solve a real-world stock-forecasting problem. Your task is to predict the Close value based on the Open, High, and Low values given in Lab1_3.csv.  
Requirements:
*   You are required to use the gradient descent algorithm to complete this problem.
*   You need to include the following four components in your lab report: (1) the code, (2) the values obtained for $w_0$, $w_1$, ..., $w_K$, (3) the obtained training error, and (4) the obtained testing error.

In [None]:
# Read the Problem 3 data; feature and target columns are chosen in the next cell.
df3 = pd.read_csv("Lab1_3.csv")

In [None]:
# Preferred layout: predict Close from Open/High/Low when all four columns exist.
expected = ["Open", "High", "Low", "Close"]
if set(expected).issubset(df3.columns):
    feature_cols = expected[:3]
    target_col = expected[3]
else:
    # Fallback: use "Close" as the target if present, otherwise the last numeric
    # column; every other numeric column becomes a feature.
    num_cols = df3.select_dtypes(include=[np.number]).columns.tolist()
    if "Close" in df3.columns:
        target_col = "Close"
    else:
        target_col = num_cols[-1]
    feature_cols = [name for name in num_cols if name != target_col]
print("Features:", feature_cols, "| Target:", target_col)


In [None]:
def split(d, test_ratio=0.2, seed=3):
    """Partition ``d`` into ``(train, test)`` after a seeded row shuffle.

    The shuffle uses ``random_state=seed``. The first
    ``int(len(d) * test_ratio)`` shuffled rows are returned as the test frame
    and the remaining rows as the training frame, each re-indexed from zero.
    """
    shuffled = d.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    boundary = int(len(shuffled) * test_ratio)
    test_rows = shuffled.iloc[:boundary].reset_index(drop=True)
    train_rows = shuffled.iloc[boundary:].reset_index(drop=True)
    return train_rows, test_rows

# Keep only the modelling columns, then hold out 20% of the rows (seed 3) for testing.
train3, test3 = split(
    df3[feature_cols + [target_col]],
    seed=3,
    test_ratio=0.2,
)

In [None]:
# Standardization statistics come from the training split only and are reused for the test split.
mu = train3[feature_cols].mean()
sd = train3[feature_cols].std(ddof=0).replace(0, 1.0)  # zero spread -> divide by 1 instead of 0

# Targets as (N, 1) column vectors.
ytr = train3[target_col].to_numpy().reshape(-1, 1)
yte = test3[target_col].to_numpy().reshape(-1, 1)

# Scaled features with a leading column of ones for the intercept.
Xtr = np.column_stack((np.ones(len(train3)), train3[feature_cols].sub(mu).div(sd).to_numpy()))
Xte = np.column_stack((np.ones(len(test3)), test3[feature_cols].sub(mu).div(sd).to_numpy()))

In [None]:
def mse(y, yhat):
    """Average squared difference between ``y`` and ``yhat``, returned as a Python float."""
    diff = y - yhat
    mean_sq = np.mean(diff ** 2)
    return float(mean_sq)

def gd(X, y, lr=0.01, epochs=20000):
    """Fit linear-regression weights by full-batch gradient descent on the MSE.

    Args:
        X: (N, D) design matrix; include a column of ones to learn an intercept.
        y: N targets, given either as an (N, 1) column or as a flat length-N array.
        lr: learning rate (step size) applied to every update.
        epochs: number of full-batch updates to perform.

    Returns:
        ``(w, history)``. ``w`` is the (D, 1) weight vector after the last
        update. ``history`` holds the training MSE sampled every
        ``max(1, epochs // 200)`` iterations plus the final iteration; each
        value is measured with the weights *before* that iteration's update.

    Raises:
        ValueError: if ``X`` is not 2-D, has no rows, or ``y`` does not hold
            exactly one target per row of ``X``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (N, D); got shape {X.shape}")
    N, D = X.shape
    if N == 0:
        raise ValueError("X has no rows; cannot run gradient descent on empty data")

    # Force a column vector. A flat (N,) target would otherwise broadcast against
    # the (N, 1) predictions into an (N, N) matrix instead of N residuals.
    y = np.asarray(y).reshape(-1, 1)
    if y.shape[0] != N:
        raise ValueError(f"y holds {y.shape[0]} values but X has {N} rows")

    w = np.zeros((D, 1))
    history = []
    step = max(1, epochs // 200)
    for t in range(epochs):
        residual = X @ w - y
        if t % step == 0 or t == epochs - 1:
            # Reuse the residual already needed for the gradient.
            history.append(float(np.mean(residual ** 2)))
        grad = (2.0 / N) * (X.T @ residual)
        w -= lr * grad
    return w, history

# Weights are learned in standardized-feature space; they are mapped back to the
# original units in a later cell.
w_scaled, hist = gd(
    Xtr,
    ytr,
    epochs=20000,
    lr=0.01,
)

In [None]:
# Predictions and errors on both partitions; the targets were never scaled,
# so the MSE values are in squared target units.
tr_pred, te_pred = Xtr @ w_scaled, Xte @ w_scaled
tr_mse, te_mse = mse(ytr, tr_pred), mse(yte, te_pred)

In [None]:
# Split the learned vector into the intercept and the per-feature weights.
w0_s = float(w_scaled[0, 0])
wk_s = w_scaled[1:, 0]

# Undo the standardization: with z = (x - mu) / sd,
#   w0_s + sum_k wk_s * z_k == (w0_s - sum_k wk_s * mu_k / sd_k) + sum_k (wk_s / sd_k) * x_k
wk_orig = wk_s / sd.to_numpy()
w0_orig = w0_s - float((wk_s * (mu.to_numpy() / sd.to_numpy())).sum())

print("\nResults")
print(f"w0 (intercept, original scale): {w0_orig:.6f}")
for i, col in enumerate(feature_cols):
    print(f"w({col}) : {wk_orig[i]:.6f}")
print(f"Training MSE: {tr_mse:.6f}")
print(f"Testing  MSE: {te_mse:.6f}")

In [None]:
# Training MSE at each recorded checkpoint.
plt.figure()
plt.plot(range(len(hist)), hist)
plt.title("Problem 3: GD Convergence (MSE)")
plt.xlabel("Checkpoint")
plt.ylabel("MSE")
plt.show()

# Predicted vs. actual on the test split; the straight line marks predicted == actual.
plt.figure()
plt.scatter(yte, te_pred, s=14)
plt.plot([yte.min(), yte.max()], [yte.min(), yte.max()])
plt.title("Problem 3: Test Predictions vs Actual (Close)")
plt.xlabel("Actual Close")
plt.ylabel("Predicted Close")
plt.show()

In [None]:
# One record per reported quantity: intercept, per-feature weights (original
# units), then both errors. Written as a (parameter, value) table.
out_rows = (
    [{"parameter": "w0", "value": w0_orig}]
    + [
        {"parameter": f"w({c})", "value": float(wk_orig[i])}
        for i, c in enumerate(feature_cols)
    ]
    + [
        {"parameter": "train_MSE", "value": tr_mse},
        {"parameter": "test_MSE", "value": te_mse},
    ]
)
pd.DataFrame(out_rows).to_csv("problem3_summary.csv", index=False)