In [None]:
import pdb
import numpy as np
from sklearn.preprocessing import StandardScaler
def preprocess_data(data):
    """Standardize the columns of ``data`` and rescale the result to unit norm.

    The input is passed through a fresh ``StandardScaler`` and the scaled
    matrix is then divided by its overall norm (``np.linalg.norm`` with
    default arguments, i.e. the Frobenius norm for a 2-D array).

    Parameters
    ----------
    data : array-like
        Numeric samples accepted by ``StandardScaler.fit_transform``.
        Missing values are not imputed here: a NaN that survives scaling
        makes the norm NaN and therefore every output entry NaN, so impute
        before calling.

    Returns
    -------
    numpy.ndarray
        The scaled data divided by its norm. If the scaled data is entirely
        zero, it is returned unchanged (all zeros) instead of dividing 0 by 0.
    """
    scaled_data = StandardScaler().fit_transform(data)
    total_norm = np.linalg.norm(scaled_data)
    # A zero norm means every scaled entry is 0 (e.g. all columns constant);
    # dividing would yield 0/0 = NaN, so hand back the zeros as they are.
    # A NaN norm compares unequal to 0 and still propagates as before.
    if total_norm == 0:
        return scaled_data
    return scaled_data / total_norm
# Example data: a 3x3 matrix with a single missing entry.
data = np.array([
    [1, 2, 3],
    [4, 5, np.nan],
    [7, 8, 9],
])
# Per-column means computed while ignoring NaNs; these serve as fill values.
col_means = np.nanmean(data, axis=0)
# Boolean mask that is True exactly where a value is missing.
inds = np.isnan(data)
# Overwrite each missing entry in place with the mean of its own column
# (the second array returned by np.nonzero holds the column indices).
data[inds] = col_means[np.nonzero(inds)[1]]
processed_data = preprocess_data(data)
# NOTE: `data` was imputed in place above, so "Initial Data" shows the filled array.
print("Initial Data:\n", data)
print("Processed Data:\n", processed_data)


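# (Pdb) pp np.linalg.norm(scaled_data)  ->  nan
# This is the point of failure: the norm is NaN as soon as any entry of scaled_data is NaN.
# (Pdb) pp normalized_data  ->  array([[nan, nan, nan], [nan, nan, nan], [nan, nan, nan]])
# Dividing by a NaN norm turns every element into NaN, which is why the NaN is replaced
# by its column mean before preprocess_data is called.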



Initial Data:
 [[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]
Processed Data:
 [[-0.40824829 -0.40824829 -0.40824829]
 [ 0.          0.          0.        ]
 [ 0.40824829  0.40824829  0.40824829]]


In [22]:
from sklearn.linear_model import LinearRegression
import numpy as np
import pdb
def train_model(X, y):
    """Fit a linear regression on ``X`` extended with its element-wise square.

    The feature matrix handed to the regressor is ``X`` horizontally stacked
    with ``X ** 2``; it is printed before fitting.

    Parameters
    ----------
    X : numpy.ndarray
        Feature array; it is stacked with its own square via ``np.hstack``.
    y : array-like
        Targets passed straight to ``LinearRegression.fit``.

    Returns
    -------
    LinearRegression
        The fitted estimator (trained on the expanded features, so callers
        must expand new inputs the same way before predicting).
    """
    quadratic_features = np.hstack((X, X ** 2))
    print(quadratic_features)
    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return LinearRegression().fit(quadratic_features, y)
# Example data: four samples with a single feature each.
X = np.array([1, 2, 3, 4]).reshape(-1, 1)
# Deliberately one target too many (5 values for 4 samples) to provoke the shape mismatch.
y = np.array([1, 4, 9, 16, 25])
# Keep only as many targets as there are rows in X (the first 4 elements).
y = y[:len(X)]

trained_model = train_model(X, y)
print("Trained Model Coefficients:", trained_model.coef_)

# (Pdb) X.shape = (4, 1) implies that we have 4 samples, each with 1 feature.
# (Pdb) y.shape = (5,) implies that we have 5 target values, but only 4 samples in X, causing the mismatch.
# Use y = y[:X.shape[0]]  # Slice y to keep the first 4 elements


# (Pdb) X.shape
# (4, 1)
# (Pdb) y.shape
# (4,)

# Trained Model Coefficients: [5.]   (earlier run fitted on X alone, before the X**2 column was added)

[[ 1  1]
 [ 2  4]
 [ 3  9]
 [ 4 16]]
Trained Model Coefficients: [-1.06450587e-15  1.00000000e+00]


## TASK 3
The core issue arises because the relationship between the **features ($X$)** and the target variable ($y$) is **non-linear (quadratic)**, but the **Linear Regression model** assumes a linear relationship.

---

## Proposed Solutions

### **1. Polynomial Model**
The polynomial model transforms the original feature space by including additional terms representing higher-degree powers of the original features (e.g., quadratic, cubic). By introducing these terms, the model can approximate non-linear relationships while still using a linear algorithm.

#### Implementation:
- Add quadratic terms to $X$ to create a new feature matrix: 
$$
  X_{\text{poly}} = [X, X^2]
  $$

#### Purpose:
This approach allows the **Linear Regression** model to approximate the true quadratic relationship by leveraging these additional polynomial features.

---

### **2. Using `np.hstack`**
The `np.hstack` function is used to horizontally stack arrays. It is particularly useful for creating the expanded feature matrix by combining the original features ($X$) with their polynomial terms ($X^2$).

#### Implementation:
```python
X_poly = np.hstack([X, X**2])
```


Trained Model Coefficients:
$[-1.06450587 \times 10^{-15}, 1.00000000]$

Mean Squared Error:
$1.262177448353619 \times 10^{-28}$

Prediction:
$[25.0, 36.0, 49.0, 64.0]$

$y_{\text{test}}$:
$[25, 36, 49, 64]$


In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np
def evaluate_model(model, X_test, y_test):
    """Score ``model`` on test data using mean squared error.

    The test features are expanded to ``[X_test, X_test ** 2]`` before
    prediction, and the predictions are printed before the score is computed.

    Parameters
    ----------
    model : object
        Anything exposing ``predict``; it receives the expanded feature matrix.
    X_test : numpy.ndarray
        Test features; stacked with their own square via ``np.hstack``.
    y_test : array-like
        True targets compared against the predictions.

    Returns
    -------
    The value returned by ``mean_squared_error(y_test, predictions)``.
    """
    y_pred = model.predict(np.hstack((X_test, X_test ** 2)))
    print(y_pred)
    return mean_squared_error(y_test, y_pred)
# Held-out inputs 5..8 with their squares as the expected targets.
X_test = np.array([[5], [6], [7], [8]])
y_test = np.array([25, 36, 49, 64])
# Using trained_model from previous task.
# Evaluate exactly once: the copy-pasted second call only repeated the same
# prediction printout and MSE line.
mse_score = evaluate_model(trained_model, X_test, y_test)
print("Mean Squared Error:", mse_score)

# > <ipython-input-5-b9bb8dc4aa8d>(7)evaluate_model()
#       5     # pdb.set_trace() # Debugger for evaluation
#       6     breakpoint()
# ----> 7     mse = mean_squared_error(y_test, predictions)
#       8     return mse
#       9 X_test = np.array([[5], [6], [7], [8]])

# ipdb> pp predictions
# array([20., 25., 30., 35.])   <- from the earlier linear-only model (5*x - 5), before X**2 was added
# ipdb> pp y_test
# array([25, 36, 49, 64])

[25. 36. 49. 64.]
Mean Squared Error: 1.262177448353619e-28
