In [1]:
# 라이브러리 import
import numpy as np
import pandas as pd

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# Load the dataset from the CSV file and display it
csv_path = 'Violent_crimes.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X92,X93,X94,X95,X96,X97,X98,X99,X100,Violent_Crimes
0,0.19,0.33,0.02,0.90,0.12,0.17,0.34,0.47,0.29,0.32,...,0.12,0.42,0.50,0.51,0.64,0.12,0.26,0.20,0.32,0.20
1,0.00,0.16,0.12,0.74,0.45,0.07,0.26,0.59,0.35,0.27,...,0.21,0.50,0.34,0.60,0.52,0.02,0.12,0.45,0.00,0.67
2,0.00,0.42,0.49,0.56,0.17,0.04,0.39,0.47,0.28,0.32,...,0.14,0.49,0.54,0.67,0.56,0.01,0.21,0.02,0.00,0.43
3,0.04,0.77,1.00,0.08,0.12,0.10,0.51,0.50,0.34,0.21,...,0.19,0.30,0.73,0.64,0.65,0.02,0.39,0.28,0.00,0.12
4,0.01,0.55,0.02,0.95,0.09,0.05,0.38,0.38,0.23,0.36,...,0.11,0.72,0.64,0.61,0.53,0.04,0.09,0.02,0.00,0.03
5,0.02,0.28,0.06,0.54,1.00,0.25,0.31,0.48,0.27,0.37,...,0.70,0.42,0.49,0.73,0.64,0.01,0.58,0.10,0.00,0.14
6,0.01,0.39,0.00,0.98,0.06,0.02,0.30,0.37,0.23,0.60,...,0.15,0.81,0.77,0.91,0.84,0.05,0.08,0.06,0.00,0.03
7,0.01,0.74,0.03,0.46,0.20,1.00,0.52,0.55,0.36,0.35,...,0.59,0.58,0.52,0.79,0.78,0.01,0.33,0.00,0.00,0.55
8,0.03,0.34,0.20,0.84,0.02,0.00,0.38,0.45,0.28,0.48,...,0.01,0.78,0.48,0.79,0.75,0.04,0.17,0.04,0.00,0.53
9,0.01,0.40,0.06,0.87,0.30,0.03,0.90,0.82,0.80,0.39,...,0.22,0.42,0.34,0.23,0.09,0.00,0.47,0.11,0.00,0.15


In [3]:
# Split the frame into predictors and response as numpy arrays:
# every column except the last goes to X, the last column becomes y.
values = df.to_numpy()
X, y = values[:, :-1], values[:, -1]

In [4]:
# Cross-validation splitter: 5 folds, rows shuffled before being partitioned.
# random_state is pinned so the folds are reproducible across kernel restarts
# and so that every call to cv.split() yields the same partition; without it
# each call reshuffles and models end up being compared on different folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [25]:
# Candidate Lasso penalty strengths, listed from largest to smallest
alpha_list = [0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001]

# Draw the folds once, up front. `cv` shuffles, and unless it was built with a
# fixed random_state each call to cv.split() returns a different partition, so
# splitting inside the alpha loop would score every alpha on different folds
# and make the comparison between alphas noisier than it needs to be.
folds = list(cv.split(X))

# One record per alpha, so the sweep can be inspected/sorted after the loop
# instead of being read off the printed output.
cv_records = []
for a in alpha_list:
    print('alpha:', a)
    # Per-fold scores for the current alpha (reset on every alpha, so after
    # the loop these hold the scores of the last alpha in alpha_list)
    mse_train_list = []
    mse_test_list = []
    for train_idx, test_idx in folds:
        X_train, y_train = X[train_idx, :], y[train_idx]
        X_test, y_test = X[test_idx, :], y[test_idx]

        # Fit on the training folds only
        lasso_reg = Lasso(alpha=a)
        lasso_reg.fit(X_train, y_train)

        y_pred_train = lasso_reg.predict(X_train)
        y_pred_test = lasso_reg.predict(X_test)

        mse_train = mean_squared_error(y_train, y_pred_train)
        mse_test = mean_squared_error(y_test, y_pred_test)

        mse_train_list.append(mse_train)
        mse_test_list.append(mse_test)

    mean_mse_train = np.mean(mse_train_list)
    mean_mse_test = np.mean(mse_test_list)

    # Printed in this order: mean training MSE, then mean held-out MSE
    print(mean_mse_train)
    print(mean_mse_test)

    cv_records.append(
        {'alpha': a, 'mse_train': mean_mse_train, 'mse_test': mean_mse_test}
    )

# Summary table of the sweep: one row per alpha, in the order of alpha_list
cv_results = pd.DataFrame(cv_records)

alpha: 0.5
0.054252854485206704
0.05427052894248716
alpha: 0.1
0.05424008185912399
0.054379870012529107
alpha: 0.05
0.054249009057914875
0.054306822001160326
alpha: 0.01
0.023818123685333554
0.024016140369333177
alpha: 0.005
0.020859908230403262
0.021272394774881773
alpha: 0.001
0.01838337674150941
0.018952132084961083
alpha: 0.0005
0.01799074335704004
0.01863814779983037
alpha: 0.0001
0.016853532559531346
0.018565236448176953


In [23]:
# Mean training MSE, then mean test MSE, over the per-fold score lists.
# NOTE: both lists are leftovers from the alpha-sweep cell, where they are
# reset for every alpha, so this reports the last alpha that cell evaluated.
for fold_scores in (mse_train_list, mse_test_list):
    print(np.mean(fold_scores))

0.016839734106994485
0.01858767448809782


In [6]:
# Build train/test row indices: 20% of the row positions are held out.
# random_state is pinned so the same hold-out split (and therefore the same
# fitted coefficients and selected features below) is produced on every run.
idx = np.arange(X.shape[0])
train_idx, test_idx = train_test_split(idx, test_size=0.2, random_state=42)

In [7]:
# Slice predictors and response into train/test parts using the index arrays
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [17]:
# Create the Lasso regressor and fit it on the training split.
# alpha=0.0001 is the smallest value in the alpha sweep above.
alpha_final = 1e-4
lasso_reg = Lasso(alpha=alpha_final)
lasso_reg.fit(X_train, y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [18]:
# Predictions of the fitted model on the training split and on the test split
y_pred_train, y_pred_test = (
    lasso_reg.predict(features) for features in (X_train, X_test)
)

In [19]:
# Inspect the fitted coefficient values (one entry per column of X_train)
lasso_reg.coef_

array([-0.00000000e+00,  0.00000000e+00,  1.55537866e-01, -4.16754354e-02,
       -0.00000000e+00,  5.43786532e-02,  0.00000000e+00, -1.57302805e-01,
        0.00000000e+00,  0.00000000e+00, -1.14513457e-01,  4.07825173e-02,
        0.00000000e+00, -1.45391709e-01,  3.54439293e-02, -8.92571362e-02,
        0.00000000e+00, -8.22936747e-03, -6.46933308e-02,  0.00000000e+00,
        0.00000000e+00, -1.07017425e-01, -2.33125255e-02, -1.80013609e-02,
        2.28839917e-02,  4.81560627e-02,  1.35614902e-02, -0.00000000e+00,
       -1.77429746e-01, -2.22007929e-02,  0.00000000e+00,  6.29633759e-02,
       -7.85949182e-03,  8.18951678e-02, -5.13762081e-02,  0.00000000e+00,
        4.26225611e-02,  0.00000000e+00,  1.33219601e-01,  1.34828245e-01,
       -5.83584425e-02, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
       -2.79955534e-01, -4.13117570e-02,  2.46099259e-02,  0.00000000e+00,
       -1.26404329e-01, -0.00000000e+00,  1.93369101e-01, -5.31480327e-02,
        2.60711226e-03, -

In [20]:
# Positions of the coefficients that are not zero, i.e. the features the
# fitted model actually uses; returned as a 1-tuple holding the index array.
np.nonzero(lasso_reg.coef_)

(array([ 2,  3,  5,  7, 10, 11, 13, 14, 15, 17, 18, 21, 22, 23, 24, 25, 26,
        28, 29, 31, 32, 33, 34, 36, 38, 39, 40, 44, 45, 46, 48, 50, 51, 52,
        60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 74, 75, 77, 78,
        79, 82, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 98, 99],
       dtype=int64),)