# Task a) and b) of Project 1 - FYS-STK4155

In [3]:
from functions import *

# Global matplotlib text settings applied to every figure in this notebook.
# `plt` is not imported in this cell; it is presumably re-exported by
# `from functions import *` -- TODO confirm.
plt.rcParams.update({
    "text.usetex": True,       # render text through LaTeX (needs a working LaTeX install)
    "font.family": "serif",    # serif font family for all text
    "font.size": 10, 
})

# Apply the 'ggplot' style, then the 'fast' style on top of it.
import matplotlib.style as mplstyle
mplstyle.use(['ggplot', 'fast'])

# IPython autoreload extension in mode 2, so edits to imported modules
# (e.g. the local `functions` module) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
# Fraction of the data held out as the test set
test_size = 0.2

# Name of the matplotlib colormap shared by the figures
colormap = 'plasma'

# Seed NumPy's global random number generator so reruns are reproducible
np.random.seed(42)

# a) Ordinary Least Squares (OLS)
* Uses OLS to predict the Runge function.
* The fit is repeated for every number of data points in `n_vals` and every polynomial degree in `p_vals`.
* Uses scikit-learn to build the model and to compute the MSE and $R^2$ score.

In [5]:
# Dataset sizes to sweep: 50, 100, ..., 1000 (20 values)
n_vals = np.arange(50, 1000 + 1, 50)
# Polynomial degrees to sweep: 2, 3, ..., 15
p_vals = np.arange(2, 15 + 1)

In [None]:
# Build one dataset and unpack the three parts returned by make_dataset:
# a train pair, a test pair and a triple describing the full data.
# NOTE(review): `n_points` is not assigned in any cell visible above; unless
# `from functions import *` provides it, this cell raises NameError on a
# fresh kernel -- confirm, or define n_points before this cell.
(train, test, full) = make_dataset(n_points)
x_train, y_train = train
x_test, y_test = test
x_all, y_all, y_all_clean = full

In [None]:
# Sweep OLS over every dataset size in n_vals and polynomial degree in p_vals,
# collecting the test-set MSE, R^2 and fitted coefficients in df_OLS.
#
# Fix: the previous version reshaped and re-split variables `x` and `y` that
# are never assigned in this notebook, overwriting the split unpacked from
# make_dataset. The split returned by make_dataset is now used directly.
results = []

for n in n_vals:
    train, test, full = make_dataset(n)  # making a dataset with size n
    x_train, y_train = train
    x_test, y_test = test
    x_all, y_all, y_all_clean = full

    # scikit-learn expects a 2-D (n_samples, n_features) feature array.
    # NOTE(review): assumes make_dataset returns x as 1-D (or (n, 1)) arrays
    # and already performs the train/test split -- confirm in functions.py.
    # The notebook-level `test_size` is therefore not used in this cell.
    x_train = np.asarray(x_train).reshape(-1, 1)
    x_test = np.asarray(x_test).reshape(-1, 1)

    # making an OLS model for a given polynomial degree, p
    for p in p_vals:
        # NOTE(review): include_bias=False, with_mean=False and
        # fit_intercept=False together mean the model has no constant term,
        # so it always predicts 0 at x = 0 -- confirm this is intended.
        model = make_pipeline(
            PolynomialFeatures(degree=p, include_bias=False),
            StandardScaler(with_mean=False),
            LinearRegression(fit_intercept=False)
        )

        # using the training data to train the model
        model.fit(x_train, y_train)

        # using the test data to make a prediction, unseen data for the model
        y_pred = model.predict(x_test)

        # assessing the model with scores
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # extracting the fitted coefficients (one per polynomial feature)
        theta = model.named_steps['linearregression'].coef_

        # one record per (n, p); turned into a DataFrame after the sweep
        results.append({
            'n': n,
            'p': p,
            'theta': theta,
            'MSE': mse,
            'R2': r2
        })

df_OLS = pd.DataFrame(results)

ValueError: too many values to unpack (expected 2)

In [None]:
# OLS test scores versus polynomial degree: MSE on the left panel, R^2 on the
# right, with one curve per dataset size n.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Discrete colormap with one color per dataset size
num_colors = len(n_vals)
cmap = plt.get_cmap(colormap, num_colors)

for i, en in enumerate(n_vals):
    n_df = df_OLS[df_OLS['n'] == en]
    # Shared line styling; sizes are passed as numbers rather than strings
    line_kwargs = dict(marker='o', markersize=3, linewidth=2, color=cmap(i), label=f'n: {en}')
    ax[0].plot(n_df['p'], n_df['MSE'], **line_kwargs)
    ax[1].plot(n_df['p'], n_df['R2'], **line_kwargs)

ax[0].set_title('MSE as a function of polynomial degree')
ax[0].legend(loc='upper left')
ax[0].set_xlabel('Polynomial degree')
ax[0].set_ylabel('MSE')

ax[1].set_title(r'$R^2$ as a function of polynomial degree')
ax[1].legend(loc='upper left')
ax[1].set_xlabel('Polynomial degree')
ax[1].set_ylabel(r'$R^2$')

# Trailing semicolon suppresses the Text(...) repr in the cell output
fig.suptitle('OLS');

In [None]:
# OLS test scores versus number of datapoints: MSE on the left panel, R^2 on
# the right, with one curve per polynomial degree p.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Discrete colormap with one color per polynomial degree
num_colors = len(p_vals)
cmap = plt.get_cmap(colormap, num_colors)

for i, pe in enumerate(p_vals):
    p_df = df_OLS[df_OLS['p'] == pe]
    # Shared line styling; sizes are passed as numbers rather than strings
    line_kwargs = dict(marker='o', markersize=3, linewidth=2, color=cmap(i), label=f'p: {pe}')
    ax[0].plot(p_df['n'], p_df['MSE'], **line_kwargs)
    ax[1].plot(p_df['n'], p_df['R2'], **line_kwargs)

ax[0].set_title('MSE as a function of number of datapoints')
ax[0].legend(loc='upper right')
ax[0].set_xlabel('Number of datapoints')
ax[0].set_ylabel('MSE')

ax[1].set_title(r'$R^2$ as a function of number of datapoints')
ax[1].legend(loc='upper right')
ax[1].set_xlabel('Number of datapoints')
ax[1].set_ylabel(r'$R^2$')

# Trailing semicolon suppresses the Text(...) repr in the cell output
fig.suptitle('OLS');

In [None]:
# First two fitted OLS coefficients versus polynomial degree, one curve per
# dataset size n. Left panel: theta[0]; right panel: theta[1].
fig, ax = plt.subplots(1, 2, figsize=(15,5))

# Discrete colormap with one color per dataset size
num_colors = len(n_vals)
cmap = plt.get_cmap(colormap, num_colors)

for i, en in enumerate(n_vals):
    n_df = df_OLS[df_OLS['n'] == en]
    # Shared line styling; sizes are passed as numbers rather than strings
    line_kwargs = dict(marker='o', markersize=3, linewidth=2, color=cmap(i), label=f'N: {en}')
    for k, panel in enumerate(ax):
        # `k=k` binds the current index so each panel picks its own coefficient
        panel.plot(n_df['p'], n_df['theta'].apply(lambda coefs, k=k: coefs[k]), **line_kwargs)

ax[0].set_title(r'$\theta_1$')
ax[1].set_title(r'$\theta_2$')

# Plain string: the previous f-string had no placeholders
fig.suptitle('Features as a function of polynomial degree \n OLS', y=1.05)

for axs in ax:
    axs.legend(loc='upper left')
    axs.set_xlabel('Polynomial degree')
    axs.set_ylabel(r'$\theta$')

# b) Ridge regression
* Uses Ridge regression to predict the Runge function.
* The fit is repeated for every number of data points in `n_vals`, every polynomial degree in `p_vals`, and every value of the penalization parameter $\lambda$ in `lambdas`.
* Uses scikit-learn to build the model and to compute the MSE and $R^2$ score.

In [None]:
# 15 penalty strengths, logarithmically spaced from 1e-8 to 1e2
lambdas = np.logspace(start=-8, stop=2, num=15)

In [None]:
# Sweep Ridge regression over dataset size, polynomial degree and penalty
# strength; one record per (n, p, lambda) combination ends up in df_Ridge.
# NOTE(review): this cell builds its data with make_data and its own
# train_test_split (random_state=2018), whereas the OLS cell calls
# make_dataset -- confirm both helpers exist in `functions` and that the
# difference is intended.
results = []

for n in n_vals:
    x, y = make_data(n)
    x = x.reshape(-1, 1)  # column vector, as the sklearn pipeline expects 2-D input

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=2018
    )

    for p in p_vals:
        for lam in lambdas:
            model = make_pipeline(
                PolynomialFeatures(degree=p, include_bias=False),
                StandardScaler(with_mean=False),
                Ridge(alpha=lam, fit_intercept=False)
            )

            # fit on the training split, then predict the held-out split
            y_pred = model.fit(x_train, y_train).predict(x_test)

            results.append({
                'n': n,
                'p': p,
                'lambda': lam,
                'theta': model.named_steps['ridge'].coef_,
                'MSE': mean_squared_error(y_test, y_pred),
                'R2': r2_score(y_test, y_pred)
            })

df_Ridge = pd.DataFrame(results)

In [None]:
# Ridge test scores versus polynomial degree for the largest dataset size,
# with one curve per penalty strength lambda.
n_val = n_vals[-1]

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

colormap='plasma'
num_colors = len(lambdas)
cmap = plt.get_cmap(colormap, num_colors)

# Left panel shows the MSE column, right panel the R2 column
panel_columns = ('MSE', 'R2')

for i, l in enumerate(lambdas):
    selection = (df_Ridge['lambda'] == l) & (df_Ridge['n'] == n_val)
    n_df = df_Ridge[selection]
    color = cmap(i)
    for panel, column in zip(ax, panel_columns):
        panel.plot(n_df['p'], n_df[column], marker='o', markersize='3', linewidth='2', color=color, label=f'L: {l:.2e}')

panel_text = (
    ('MSE as a function of polynomial degree', 'MSE'),
    (r'$R^2$ as a function of polynomial degree', r'$R^2$'),
)
for panel, (title, ylabel) in zip(ax, panel_text):
    panel.set_title(title, fontsize=10)
    panel.legend(loc='upper right', fontsize=8)
    panel.set_xlabel('Polynomial degree')
    panel.set_ylabel(ylabel)

fig.suptitle(f'Ridge \n Datapoints: {n_val}')

In [None]:
# Ridge test scores versus penalty strength lambda for the largest dataset
# size, with one curve per polynomial degree p.
n_val = n_vals[-1]

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Discrete colormap with one color per polynomial degree
num_colors = len(p_vals)
cmap = plt.get_cmap(colormap, num_colors)

for i, pe in enumerate(p_vals):
    # Fix: removed the doubled assignment `n_df = n_df = ...`
    n_df = df_Ridge[(df_Ridge['p'] == pe) & (df_Ridge['n'] == n_val)]
    # Shared line styling; sizes are passed as numbers rather than strings
    line_kwargs = dict(marker='o', markersize=3, linewidth=2, color=cmap(i), label=f'p: {pe:.0f}')
    ax[0].plot(n_df['lambda'], n_df['MSE'], **line_kwargs)
    ax[1].plot(n_df['lambda'], n_df['R2'], **line_kwargs)

ax[0].set_title('MSE as a function of hyperparameter', fontsize=10)
ax[0].legend(loc='upper right', fontsize=8)
ax[0].set_xlabel(r'$\lambda$')
ax[0].set_ylabel('MSE')

ax[1].set_title(r'$R^2$ as a function of hyperparameter', fontsize=10)
ax[1].legend(loc='upper right', fontsize=8)
ax[1].set_xlabel(r'$\lambda$')
ax[1].set_ylabel(r'$R^2$')

# lambdas are log-spaced over ten decades; on a linear axis all but the
# largest values collapse onto the left edge, so use a log x-axis.
for panel in ax:
    panel.set_xscale('log')

# Trailing semicolon suppresses the Text(...) repr in the cell output
fig.suptitle(f'Ridge \n Datapoints: {n_val}');

In [None]:
# First two Ridge coefficients versus polynomial degree. Each row of the grid
# is one selected lambda; the left column shows theta[0], the right column
# theta[1]; each curve is one dataset size n (colored via the colorbar).

# Selecting specific lambda values to plot
lambda_subset = [lambdas[3], lambdas[6], lambdas[9], lambdas[13]]

fig, ax = plt.subplots(4, 2, figsize=(12, 8), sharex='col')
num_colors = len(n_vals)
cmap = plt.get_cmap(colormap, num_colors)

for j, l_val in enumerate(lambda_subset):
    for i, en in enumerate(n_vals):
        n_df = df_Ridge[(df_Ridge['n'] == en) & (df_Ridge['lambda'] == l_val)]
        color = cmap(i)
        for k in range(2):
            # `k=k` binds the current column so each panel picks its own coefficient
            ax[j, k].plot(
                n_df['p'], n_df['theta'].apply(lambda coefs, k=k: coefs[k]),
                marker='o', markersize=3, linewidth=1.5, color=color
            )

    # Row annotation. Fix: scientific notation is used because fixed-point
    # '.4f' printed the smallest selected lambdas as 0.0000.
    for k in range(2):
        ax[j, k].text(
            0.02, 0.95, rf'$\lambda$={l_val:.2e}',
            transform=ax[j, k].transAxes,
            fontsize=8, va='top', ha='left',
            bbox=dict(facecolor='white', edgecolor='none', alpha=0.7, pad=1)
        )

fig.suptitle('Features as a function of Polynomial Degree \n Ridge regression', y=0.95)

for axs in ax[:, 0]:
    axs.set_ylabel(r'$\theta$')

ax[0, 0].set_title(r'$\theta_1$')
ax[0, 1].set_title(r'$\theta_2$')

ax[3, 0].set_xlabel('Polynomial Degree')
ax[3, 1].set_xlabel('Polynomial Degree')

# Colorbar mapping curve color to dataset size; the mappable is built by hand
# because the lines were colored individually rather than through a mappable.
sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=min(n_vals), vmax=max(n_vals)))
sm.set_array([])

cbar = fig.colorbar(
    sm, ax=ax.ravel().tolist(), orientation='vertical',
    fraction=0.05, pad=0.05, location='right'
)
cbar.set_label('Number of datapoints values')

#plt.tight_layout(rect=[0, 0.05, 1, 0.96])
