# Assignment 2

In [33]:
import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, LassoCV, ElasticNetCV, ElasticNet
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 1) Read data

In [34]:
# Load the training and test tables from CSV; both paths are relative to the
# current working directory.
train_df = pd.read_csv(filepath_or_buffer=r"Assignment 1/data/train_data.csv")
test_df = pd.read_csv(filepath_or_buffer=r"Assignment 1/data/test_data.csv")

# Quick look at the first five training rows.
print(train_df.head(n=5))

   ID         Y        X1        X2        X3        X4        X5        X6  \
0   0 -1.399091  1.174139  1.413109  0.164693 -1.067338  0.015324 -1.280970   
1   1  3.097990  0.208922  0.931231  0.838779  0.893483 -0.510555  0.900289   
2   2 -1.707346 -0.744982  0.962118  0.615392 -0.427943 -0.014912  1.138781   
3   3  0.610625 -0.170428 -1.361771  0.206042  0.623124  0.907441 -0.873814   
4   4 -0.689196 -0.858792  0.321308 -0.415649  1.014056 -0.522858  0.926634   

         X7        X8  ...       X41       X42       X43       X44       X45  \
0  0.489681 -0.371982  ... -0.115044 -2.580043 -0.812428  0.772820 -0.460444   
1 -0.042490  0.839400  ...  1.155635  0.673035 -0.438152 -0.001316 -0.761800   
2  1.159491  0.055467  ...  0.299277  1.387495 -0.007519 -0.464825  0.830986   
3  1.287383  0.901191  ...  1.209247  0.095866 -0.287905 -1.110714 -1.660352   
4 -0.390663  0.790054  ... -1.191989 -1.127448  0.246358  0.407769  1.132454   

        X46       X47       X48       X49   

# 2) Correlations with Y

In [35]:
# Correlate every column whose name starts with "X" against the target column Y,
# then print the result as a one-column table sorted from highest to lowest.
feature_frame = train_df.filter(regex='^X')
correlations = feature_frame.corrwith(train_df['Y'])
correlation_table = correlations.to_frame(name='Correlation with Y')
ranked_table = correlation_table.sort_values(by='Correlation with Y', ascending=False)
print(ranked_table)

     Correlation with Y
X34            0.330143
X41            0.205901
X48            0.180286
X46            0.149633
X16            0.149480
X15            0.147115
X26            0.144194
X10            0.141671
X1             0.135257
X12            0.132727
X30            0.126337
X22            0.125427
X14            0.123509
X11            0.110485
X5             0.110160
X43            0.109876
X20            0.104147
X32            0.087633
X36            0.083855
X21            0.075033
X31            0.038369
X3             0.034430
X33            0.027626
X18            0.024968
X25            0.022851
X37            0.021334
X39            0.019053
X27            0.016782
X7             0.014777
X38            0.014546
X17            0.013900
X2             0.010244
X6             0.008458
X13            0.008407
X50            0.007810
X4             0.007688
X24            0.005755
X19            0.004383
X8             0.002121
X35            0.002038
X47           -0

# 3) Scaler and K-Fold CV

In [36]:
# Feature columns: every column whose name starts with "X" (the same selection
# the correlation cell uses). Defined here explicitly so the notebook runs on a
# fresh kernel instead of relying on a variable left over from an earlier session.
x_cols = train_df.filter(regex='^X').columns.tolist()

# Plain NumPy arrays for the estimators; the test set uses the same columns.
X_train = train_df[x_cols].values
y_train = train_df["Y"].values
X_test  = test_df[x_cols].values

# 10-fold splitter, shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Standardise the features: fit the scaler on the training data only, then apply
# the same transformation to the test data.
# NOTE(review): the scaler is fit on the full training set before cross-validation,
# so each validation fold contributes to the scaling statistics. Wrapping the
# scaler and estimator in a Pipeline would keep the folds strictly separated.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std  = scaler.transform(X_test)

# 4) LassoCV

In [37]:
# Candidate regularisation strengths: 50 log-spaced values between 1e-4 and 1e1.
alphas = np.logspace(start=-4, stop=1, num=50)

# Cross-validated Lasso over the alpha grid, using the shared fold splitter.
lasso_cv = LassoCV(alphas=alphas, cv=kf, random_state=42, max_iter=10000)
lasso_cv.fit(X_train_std, y_train)

# Average the stored MSE path over axis 1, take the smallest mean and convert
# it to an RMSE.
lasso_mse_path = lasso_cv.mse_path_
lasso_mean_mse = np.mean(lasso_mse_path, axis=1)
lasso_cv_rmse = float(np.sqrt(np.min(lasso_mean_mse)))

print(f"\nLassoCV best alpha: {lasso_cv.alpha_:.6f}")
print(f"LassoCV RMSE: {lasso_cv_rmse:.6f}")


LassoCV best alpha: 0.035565
LassoCV RMSE: 2.724556


# 5) ElasticNetCV

In [38]:
# Candidate L1/L2 mixing ratios: nine evenly spaced values from 0.1 to 0.9.
l1_ratios = np.linspace(start=0.1, stop=0.9, num=9)

# Cross-validated Elastic Net over the alpha grid and the l1_ratio grid, with
# the same fold splitter as the Lasso search.
enet_cv = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=kf, random_state=42, max_iter=10000)
enet_cv.fit(X_train_std, y_train)

# Average the stored MSE path over axis 2, take the smallest mean and convert
# it to an RMSE.
enet_mse_path = enet_cv.mse_path_
enet_mean_mse = np.mean(enet_mse_path, axis=2)
enet_cv_rmse = float(np.sqrt(np.min(enet_mean_mse)))

print(f"ElasticNetCV best alpha: {enet_cv.alpha_:.6f}")
print(f"ElasticNetCV best l1_ratio: {enet_cv.l1_ratio_:.2f}")
print(f"ElasticNetCV CV RMSE: {enet_cv_rmse:.6f}")

ElasticNetCV best alpha: 0.044984
ElasticNetCV best l1_ratio: 0.90
ElasticNetCV CV RMSE: 2.724970


# 6) Choose the best model by CV RMSE

In [39]:
# Keep whichever fitted search has the lower CV RMSE; a tie goes to Lasso.
chosen_name, chosen_model = (
    ("Lasso", lasso_cv) if lasso_cv_rmse <= enet_cv_rmse else ("Elastic Net", enet_cv)
)

print(f"\nChosen model: {chosen_name}")


Chosen model: Lasso


# 7) Predict on test set

In [40]:
# Predict the target for the standardised test features with the selected model.
test_pred = chosen_model.predict(X_test_std)

# Pair each test-set ID with its prediction.
submission_df = test_df[["ID"]].assign(Y=test_pred)

print(submission_df.head(n=5))

# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf=r"Assignment 2/kc/submission_assignment2.csv", index=False)

     ID         Y
0  2400  0.565183
1  2401 -0.190046
2  2402 -1.126926
3  2403 -2.092995
4  2404  2.912655
