In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Lasso
import numpy as np
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.metrics import r2_score

In [8]:
# Read the wine dataset from the working directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer='wine_agg.csv')


In [9]:
# Every column except the response is used as a predictor.
x = data.drop('LogAuctionIndex', axis=1)
y = data['LogAuctionIndex']

# Hold out 20% of the rows for out-of-sample evaluation; the fixed seed keeps
# the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
#2A
# Pick the Lasso penalty strength with 5-fold cross-validation, using only the
# training split so the test split stays untouched for evaluation.
lasso_cv = LassoCV(cv=5, random_state=42)
lasso_cv.fit(x_train, y_train)

best_alpha = lasso_cv.alpha_
print(f"Best alpha: {best_alpha}")

Best alpha: 0.03273531874228395


In [13]:
# Refit a plain Lasso at the cross-validated penalty, then predict the held-out rows.
lasso = Lasso(alpha=best_alpha).fit(x_train, y_train)
y_pred = lasso.predict(x_test)

In [14]:
#OSR2
# Out-of-sample R^2: the model's squared error on the test split relative to a
# baseline that always predicts the training-set mean of the response.
model_sq_err = (y_test - y_pred) ** 2
baseline_sq_err = (y_test - y_train.mean()) ** 2

numerator = np.sum(model_sq_err)
denominator = np.sum(baseline_sq_err)
osr2 = 1 - numerator/denominator
print(f"OSR2: {osr2}")

OSR2: 0.3862533319966974


In [15]:
# Label each fitted Lasso coefficient with its feature name, then list the
# features whose coefficient is exactly zero.
coeffs = pd.Series(data=lasso.coef_, index=x.columns)
is_zero = coeffs.eq(0)
zero_coeffs = coeffs.loc[is_zero]

print(f"Number of zero coefficients: {len(zero_coeffs)}")
print(zero_coeffs.index.tolist())

Number of zero coefficients: 2
['Year', 'USAlcConsump']


In [17]:
#2B
# Candidate ridge penalties: 100 values spaced evenly on a log scale from 1e-4 to 1e4.
alphas = np.logspace(-4, 4, 100)

# Select the penalty by cross-validation on the training split. No `cv` is
# passed, so RidgeCV uses its default scheme. Per-sample CV results are not
# stored: nothing in this notebook reads them, and the `store_cv_values` flag
# is not accepted by recent scikit-learn releases.
ridge_cv = RidgeCV(alphas=alphas).fit(x_train, y_train)
best_beta = ridge_cv.alpha_
print(f"Best beta: {best_beta}")

Best beta: 1.592282793341094




In [18]:
# Refit a plain Ridge at the cross-validated penalty, then predict the held-out rows.
ridge = Ridge(alpha=best_beta).fit(x_train, y_train)
y_pred = ridge.predict(x_test)


In [19]:
# Out-of-sample R^2 for the ridge predictions: squared test error relative to a
# baseline that always predicts the training-set mean of the response.
model_sq_err = (y_test - y_pred) ** 2
baseline_sq_err = (y_test - y_train.mean()) ** 2

numerator = np.sum(model_sq_err)
denominator = np.sum(baseline_sq_err)
osr2 = 1 - numerator/denominator
print(f"OSR2: {osr2}")

OSR2: 0.32856394760953944


In [20]:
# Label each fitted Ridge coefficient with its feature name, then list the
# features whose coefficient is exactly zero.
coeffs = pd.Series(data=ridge.coef_, index=x.columns)
is_zero = coeffs.eq(0)
zero_coeffs = coeffs.loc[is_zero]

print(f"Number of zero coefficients: {len(zero_coeffs)}")
print(zero_coeffs.index.tolist())

Number of zero coefficients: 0
[]
