In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Lasso, ElasticNet, Ridge
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

In [2]:
# Read the raw table and discard the row identifier, which is not a feature.
data = pd.read_csv("data.csv").drop(columns="ID")

# Column "y" is the regression target; every remaining column is a feature.
target = data["y"]
features = data.drop(columns="y")

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [3]:
# Categorical columns: those stored with the generic "object" dtype.
categorical_cols = [
    cname
    for cname, col_dtype in X_train.dtypes.items()
    if col_dtype == "object"
]

# Numerical columns: 64-bit integer or floating-point dtype.
numerical_cols = [
    cname
    for cname, col_dtype in X_train.dtypes.items()
    if col_dtype in ('int64', 'float64')
]

In [4]:
# Categorical branch: impute with the most frequent value, then one-hot encode
# into a dense array (handle_unknown='ignore' keeps unknown categories from
# raising at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Numerical branch: impute with the column median.
numerical_transformer = SimpleImputer(strategy='median')

# Send each column group through its own transformer and request pandas output
# so the generated feature names ("cat__...", "num__...") are visible.
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols),
]).set_output(transform='pandas')

# Preview the transformed features on the first few training rows.
preprocessor.fit_transform(X_train.head())

Unnamed: 0,cat__X0_aj,cat__X0_ak,cat__X0_ap,cat__X0_ay,cat__X0_h,cat__X1_a,cat__X1_l,cat__X1_r,cat__X1_v,cat__X2_ak,...,num__X375,num__X376,num__X377,num__X378,num__X379,num__X380,num__X382,num__X383,num__X384,num__X385
3540,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3748,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1287,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2856,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1380,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Grid search and prediction
def pipeline_All(pipeline, model_name=None):
    """Tune ``alpha`` for a pipeline's estimator and print the results.

    Runs a 5-fold grid search over ``alpha`` in ``[0.01, 0.1, 1]`` on the
    notebook-level ``X_train`` / ``y_train``, prints the mean and standard
    deviation of the cross-validation score for every candidate, and finally
    prints the mean squared error of the grid's predictions on the
    notebook-level ``X_test`` / ``y_test``.

    Parameters
    ----------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline containing a step that exposes an ``alpha`` parameter.
    model_name : str, optional
        Name of the pipeline step whose ``alpha`` is tuned. When omitted, the
        name of the pipeline's last step is used, so the function no longer
        depends on a global ``model`` variable being set by the caller's loop.

    Returns
    -------
    None
        Results are only printed.
    """
    if model_name is None:
        # The estimator is the final step; its step name prefixes the parameter.
        model_name = pipeline.steps[-1][0]

    param_grid = {f"{model_name}__alpha": [0.01, 0.1, 1]}
    # Scores are negated MSE (greater is better), so every value printed below
    # under the "accuracy" labels is <= 0 and closer to zero means a better fit.
    grid = GridSearchCV(pipeline, cv=5, param_grid=param_grid,
                        scoring="neg_mean_squared_error")
    grid.fit(X_train, y_train)

    # Prediction and result
    y_preds = grid.predict(X_test)
    means = grid.cv_results_['mean_test_score']
    stds = grid.cv_results_['std_test_score']
    params = grid.cv_results_['params']

    print(f"Model Name: {model_name}")
    for mean, stdev, param in zip(means, stds, params):
        print(f"平均準確率: {mean}, 標準差: {stdev}, 參數組合: {param}")
    print(f"最佳準確率: {grid.best_score_}，最佳參數組合：{grid.best_params_}")
    print(f"MSE: {mean_squared_error(y_test, y_preds)}", "\n")

In [6]:
# Compare several regularised linear models behind the same preprocessing.
model_all = ['Lasso', 'ElasticNet', 'Ridge']

# Step name -> unfitted estimator; the step name is also the prefix of the
# "<name>__alpha" parameter that the grid search tunes.
estimators = {
    'Lasso': Lasso(max_iter=10000),
    'ElasticNet': ElasticNet(),
    'Ridge': Ridge(),
}

for model in model_all:
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        (model, estimators[model]),
    ])
    pipeline_All(pipeline)

Model Name: Lasso
平均準確率: -64.98740289905666, 標準差: 5.136347852809586, 參數組合: {'Lasso__alpha': 0.01}
平均準確率: -66.462815529369, 標準差: 5.452079050846119, 參數組合: {'Lasso__alpha': 0.1}
平均準確率: -91.7362542263196, 標準差: 4.247349227181149, 參數組合: {'Lasso__alpha': 1}
最佳準確率: -64.98740289905666，最佳參數組合：{'Lasso__alpha': 0.01}
MSE: 97.71311095102683 

Model Name: ElasticNet
平均準確率: -65.42221025327328, 標準差: 5.600548427899211, 參數組合: {'ElasticNet__alpha': 0.01}
平均準確率: -67.39460322631605, 標準差: 5.602832617561342, 參數組合: {'ElasticNet__alpha': 0.1}
平均準確率: -96.54802738423814, 標準差: 5.887853871026864, 參數組合: {'ElasticNet__alpha': 1}
最佳準確率: -65.42221025327328，最佳參數組合：{'ElasticNet__alpha': 0.01}
MSE: 98.43141155328058 

Model Name: Ridge
平均準確率: -73.01969276876399, 標準差: 7.29921795767681, 參數組合: {'Ridge__alpha': 0.01}
平均準確率: -72.2230680272748, 標準差: 7.018533350196532, 參數組合: {'Ridge__alpha': 0.1}
平均準確率: -69.82265609859135, 標準差: 6.312500911527025, 參數組合: {'Ridge__alpha': 1}
最佳準確率: -69.82265609859135，最佳參數組合：{'Ridge__alpha': 1}
MSE