In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LassoLarsCV
from sklearn.metrics import mean_squared_error

In [2]:
# Load the competition files. The training CSV carries an "Unnamed: 0" column,
# which is dropped right after reading so it is never used as a feature.
train = pd.read_csv(
    '/kaggle/input/gdsc-ml-nsut-recruitment-2023/train (7).csv'
).drop(columns="Unnamed: 0")
test = pd.read_csv(
    "/kaggle/input/gdsc-ml-nsut-recruitment-2023/test (4).csv"
)
# Template frame; its 'target' column is filled with predictions further down.
sample_submission = pd.read_csv(
    "/kaggle/input/gdsc-ml-nsut-recruitment-2023/sample.csv"
)

In [3]:
# Split the training frame into features and target.
train_x = train.drop(columns="target")
train_y = train["target"]

# Select the test features by the training feature names rather than a
# hard-coded `C_0..C_199` range, so both matrices are guaranteed to share the
# same columns in the same order when they go through the fitted scaler/model.
# A missing column raises KeyError here instead of silently misaligning later.
test_x = test[train_x.columns]

In [4]:
# Learn per-feature min/max from the training features only, then apply the
# same fitted transform to both the training and the test features.
# NOTE(review): the scaler is fit on every training row before the K-fold
# split below, so validation folds have already influenced the scaling.
scaler = MinMaxScaler()
scaler.fit(train_x)
x_train_sc = scaler.transform(train_x)
test_sc = scaler.transform(test_x)

In [5]:
# Repeated K-fold evaluation of LassoLarsCV on the scaled training features:
# 50 splits x 10 repeats = 500 train/validation folds, seeded for repeatability.
rkf = RepeatedKFold(n_splits=50, n_repeats=10, random_state=42)

best_rmse = float('inf')
best_model = None
fold_rmses = []  # validation RMSE of every fold, used for the summary below

for train_idx, val_idx in rkf.split(x_train_sc):
    x_train_fold, x_val_fold = x_train_sc[train_idx], x_train_sc[val_idx]
    y_train_fold, y_val_fold = train_y.iloc[train_idx], train_y.iloc[val_idx]

    # A fresh estimator per fold; the inner cv=5 picks alpha on the fold's
    # training part only.
    # NOTE(review): scikit-learn documents `eps` as the machine-precision
    # regularization of the Cholesky diagonal factors (default
    # np.finfo(float).eps); 8 is far from that default -- confirm intentional.
    las = LassoLarsCV(eps=8, precompute=True, max_iter=500, cv=5, max_n_alphas=8, n_jobs=-1)
    las.fit(x_train_fold, y_train_fold)
    val_fold_preds = las.predict(x_val_fold)

    mse_fold = mean_squared_error(y_val_fold, val_fold_preds)
    rmse_fold = np.sqrt(mse_fold)
    fold_rmses.append(rmse_fold)

    # Kept for downstream cells. Every fold uses identical hyperparameters, so
    # this only tracks which fold happened to be easiest, not a better model.
    if rmse_fold < best_rmse:
        best_rmse = rmse_fold
        best_model = las

# The minimum over many noisy folds is an optimistic number; the mean (with its
# spread) is the actual cross-validation estimate of generalization error.
print("Best RMSE from repeated k-fold:", best_rmse)
print(
    f"Mean RMSE from repeated k-fold: {np.mean(fold_rmses):.4f} "
    f"(std {np.std(fold_rmses):.4f}, {len(fold_rmses)} folds)"
)

Best RMSE from repeated k-fold: 1.6670506064110575


In [6]:
# Refit the retained estimator on the full scaled training set (this replaces
# whatever it learned on its single fold), then predict the scaled test set.
preds = best_model.fit(x_train_sc, train_y).predict(test_sc)

# Write the predictions into the submission template and save without the index.
sample_submission['target'] = preds
sample_submission.to_csv(
    "submission_repeated.csv",
    index=False,
)