# Problem 2

In [None]:
# Setup:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

import warnings

# Silence every warning category for the rest of the session so that library
# warnings do not clutter the cell outputs.
# NOTE: this also hides any conditioning warning np.polyfit may emit.
warnings.simplefilter("ignore")

# Seed NumPy's global random number generator so the random noise drawn in the
# examples is identical on every run.
np.random.seed(42)

## Example code using the `polyfit` function and the `KFold` class

Note: This section is not part of the homework problem, but it provides some potentially helpful example code showing how to use `numpy.polyfit`, `numpy.polyval`, and `sklearn.model_selection.KFold`.

First, let's generate some synthetic data: a quadratic function plus some Gaussian noise.

In [None]:
# Ground-truth quadratic y(x) = a*x^2 + b*x + c that the samples scatter around:
a, b, c = 2, 5, 7

N = 200  # Sample count
x = np.linspace(-10, 10, num=N)  # N evenly spaced inputs covering [-10, 10]

# Standard-normal draws scaled by 15 perturb each clean function value.
noise = 15 * np.random.randn(N)
y = a * x**2 + b * x + c + noise

# Scatter the noisy samples; linewidth=0 leaves only the markers visible.
fig, ax = plt.subplots()
ax.plot(x, y, marker="o", linewidth=0)
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("Synthetic data")

plt.show()

Next, we'll use the `numpy.polyfit` function to fit a quadratic polynomial to this data. 
We can evaluate the resulting polynomial at arbitrary points.

In [None]:
# Least-squares fit of a degree-2 polynomial. np.polyfit returns the
# coefficients ordered from the highest power down to the constant term.
degree = 2
coefficients = np.polyfit(x, y, degree)

# Report the fitted quadratic, naming each coefficient by the term it scales:
quad_coef, lin_coef, const_coef = coefficients
print(f"We fit the following quadratic function: f(x) = {quad_coef}x^2 + {lin_coef}*x + {const_coef}")

# np.polyval evaluates the fitted polynomial at any point; here, x = 4:
x_test = 4
f_eval = np.polyval(coefficients, x_test)
print(f"\nf({x_test}) = {f_eval}")

# Overlay the fitted curve (red line) on the noisy samples (markers only):
fig, ax = plt.subplots()
ax.plot(x, y, marker="o", linewidth=0, label="Data")
ax.plot(x, np.polyval(coefficients, x), color="red", linewidth=3, label="Fitted quadratic")

ax.legend(loc="best")
ax.set_xlabel("x")
ax.set_ylabel("y")
ax.set_title("Synthetic data with Fitted Quadratic")

plt.show()

Finally, assume that we'd like to perform 10-fold cross-validation with this dataset.
Let's divide it into training and test sets for each fold, and print out the indices of each test set.
To limit the amount of text that we are printing out, we'll modify the dataset to make it smaller.

In [None]:
# Coefficients of the quadratic function, y = ax^2 + bx + c:
a = 2
b = 5
c = 7

N = 80  # Number of points--fewer this time, to keep the printed output short
x = np.linspace(-10, 10, num=N)  # x ranges from -10 to 10
# y is the quadratic function of x specified by a, b, and c, plus noise
y = a * x**2 + b * x + c + 15 * np.random.randn(N)

# Initialize the k-fold cross-validation splitter with 10 folds. No shuffling
# is requested, so KFold's default of consecutive folds applies.
num_folds = 10
kf = KFold(n_splits=num_folds)

# Iterate through the cross-validation folds. enumerate(..., start=1) supplies
# the 1-based fold number, so no hand-maintained counter is needed.
for i, (train_index, test_index) in enumerate(kf.split(x), start=1):

    # Print out the indices held out for testing in this fold:
    print("Fold ", i, " of ", num_folds, " test indices:", test_index)

    # Training and testing data points for this fold. They are not used any
    # further here; the lines show how to slice the arrays with the indices.
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

## Loading the Data for Problem 2

This code loads the data from `bv_data.csv` using the `load_data` helper function. Note that `data[:, 0]` is an array of all the $x$ values in the data, and `data[:, 1]` is an array of the corresponding $y$ values.

In [None]:
def load_data(filename):
    """
    Read a comma-separated text file into a numpy ndarray.

    The first line of the file is skipped (skiprows=1); every remaining line
    is parsed by np.loadtxt as comma-delimited values.

    Input:
        filename: path of the file to read, given as a string.
    Output:
        The parsed contents of the file, returned as a numpy ndarray.
    """
    contents = np.loadtxt(filename, delimiter=",", skiprows=1)
    return contents

In [None]:
# Read the dataset, then split it by column: column 0 holds the x values and
# column 1 holds the corresponding y values.
data = load_data("../data/bv_data.csv")
x, y = data[:, 0], data[:, 1]

Write your code below for solving problem 2 part B: