In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Mount Google Drive at /content/drive so the dataset can be read by file path.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


1) 데이터 준비

In [8]:
!head -n 5 /content/drive/MyDrive/Colab Notebooks/ML2024_A/balance.csv

head: cannot open '/content/drive/MyDrive/Colab' for reading: No such file or directory
head: cannot open 'Notebooks/ML2024_A/balance.csv' for reading: No such file or directory


In [9]:
# Load the balance dataset; index_col=0 uses the first CSV column as the row index.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML2024_A/balance.csv', index_col=0)

In [10]:
# Display the loaded frame (pandas abbreviates the middle rows in the rendered output).
df

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
...,...,...,...,...,...,...,...,...,...,...,...
396,12.096,4100,307,3,32,13,Male,No,Yes,Caucasian,560
397,13.364,3838,296,5,65,17,Male,No,No,African American,480
398,57.872,4171,321,5,67,12,Female,No,Yes,Caucasian,138
399,37.728,2525,192,1,44,13,Male,No,Yes,Caucasian,0


In [11]:
# Separate the regression target (Balance) from the remaining predictor columns.
y = df['Balance']
X = df.drop(columns=['Balance'])

In [12]:
# Hold out 25% of the rows for testing (the library default, written out explicitly);
# random_state=1 keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

2) 상관계수 분석

방법1 : 연속형 특성 간의 상관계수와, 특성과 레이블의 상관계수를 따로 구하기

In [13]:
# Pairwise Pearson correlations among the numeric predictors of the training set.
# Columns are selected by dtype rather than by a hard-coded position (`:6`), so the
# cell keeps working if the column order changes or a numeric column is dropped.
X_train.select_dtypes(include=np.number).corr()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education
Income,1.0,0.781176,0.780746,0.021098,0.159934,-0.003064
Limit,0.781176,1.0,0.996763,0.047589,0.107293,-0.003193
Rating,0.780746,0.996763,1.0,0.093048,0.109355,-0.009736
Cards,0.021098,0.047589,0.093048,1.0,0.034392,-0.039842
Age,0.159934,0.107293,0.109355,0.034392,1.0,0.026998
Education,-0.003064,-0.003193,-0.009736,-0.039842,0.026998,1.0


In [14]:
# 2x2 correlation matrix of Rating vs. the label; the off-diagonal entry is their Pearson r.
np.corrcoef(X_train['Rating'], y_train)

array([[1.        , 0.87271449],
       [0.87271449, 1.        ]])

In [15]:
# 2x2 correlation matrix of Limit vs. the label; the off-diagonal entry is their Pearson r.
np.corrcoef(X_train['Limit'], y_train)

array([[1.        , 0.86997087],
       [0.86997087, 1.        ]])

방법2 : 연속형 특성과 레이블의 상관계수 한꺼번에 구하기

In [16]:
# Correlation matrix of the numeric predictors together with the label, in one table.
# Numeric columns are picked by dtype instead of by position (`:6`) so a change in
# column order or count cannot silently pull a categorical column into the matrix.
pd.concat([X_train.select_dtypes(include=np.number), y_train.to_frame()], axis=1).corr()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Balance
Income,1.0,0.781176,0.780746,0.021098,0.159934,-0.003064,0.469061
Limit,0.781176,1.0,0.996763,0.047589,0.107293,-0.003193,0.869971
Rating,0.780746,0.996763,1.0,0.093048,0.109355,-0.009736,0.872714
Cards,0.021098,0.047589,0.093048,1.0,0.034392,-0.039842,0.121864
Age,0.159934,0.107293,0.109355,0.034392,1.0,0.026998,0.022783
Education,-0.003064,-0.003193,-0.009736,-0.039842,0.026998,1.0,0.007401
Balance,0.469061,0.869971,0.872714,0.121864,0.022783,0.007401,1.0


연속형 특성 중 서로 상관관계가 가장 강한 쌍은 Limit와 Rating(상관계수 ≈ 0.997)이다. 두 특성은 사실상 같은 정보를 담고 있으므로, Balance와의 상관계수가 더 큰 Rating(≈ 0.873)을 남기고 Limit(≈ 0.870) 특성을 삭제!

In [17]:
# Remove Limit from both splits (it is almost perfectly correlated with Rating).
# errors='ignore' makes the cell idempotent: because it rebinds X_train / X_test,
# running it a second time would otherwise raise KeyError on the missing column.
X_train = X_train.drop(columns='Limit', errors='ignore')
X_test = X_test.drop(columns='Limit', errors='ignore')

3) 특성 전처리(인코딩+스케일링)

방법1 : 직접 코드로 처리

In [18]:
# Inspect the remaining predictor columns before splitting them into numeric / categorical.
X_train.columns

Index(['Income', 'Rating', 'Cards', 'Age', 'Education', 'Gender', 'Student',
       'Married', 'Ethnicity'],
      dtype='object')

In [19]:
# Column-name lists for the two preprocessing branches, chosen by dtype rather than
# by position so they stay correct if columns are added, dropped, or reordered.
num = list(X_train.select_dtypes(include=np.number).columns)   # ['Income', 'Rating', 'Cards', 'Age', 'Education']
cat = list(X_train.select_dtypes(include=object).columns)      # ['Gender', 'Student', 'Married', 'Ethnicity']

In [20]:
# Numeric-only views of each split; these are concatenated with the encoded categoricals later.
X_train_num = X_train.loc[:, num]
X_test_num = X_test.loc[:, num]

In [21]:
def _encoded_frame(encoder, frame):
    """Return `frame` one-hot encoded by the fitted `encoder` as a DataFrame.

    The result keeps the row index of `frame` and uses the encoder's output
    feature names as column labels.
    """
    return pd.DataFrame(
        encoder.transform(frame),
        columns=encoder.get_feature_names_out(),
        index=frame.index,
    )


# Categorical subsets of each split.
X_train_cat = X_train[cat]
X_test_cat = X_test[cat]

# Fit on the training split only; drop='first' removes one dummy per feature and
# handle_unknown='ignore' avoids an error on categories unseen during fit.
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='first',
    feature_name_combiner='concat',
)
one_hot_encoder.fit(X_train_cat)

X_train_cat_encoded = _encoded_frame(one_hot_encoder, X_train_cat)
X_test_cat_encoded = _encoded_frame(one_hot_encoder, X_test_cat)

In [22]:
# Put the numeric columns and the one-hot dummies side by side (rows aligned on the index).
X_train_encoded = pd.concat((X_train_num, X_train_cat_encoded), axis='columns')
X_test_encoded = pd.concat((X_test_num, X_test_cat_encoded), axis='columns')

In [23]:
# Learn the per-column mean / standard deviation from the training split only,
# then apply that same scaling to both splits.
scaler = StandardScaler().fit(X_train_encoded)
X_train_std = scaler.transform(X_train_encoded)
X_test_std = scaler.transform(X_test_encoded)

In [24]:
# Show the standardized training matrix produced by the manual preprocessing steps.
X_train_std

array([[-0.5885416 , -0.0268033 , -0.01426293, ..., -1.28641644,
        -0.59788466,  1.04780824],
       [-0.58496003, -0.56375316, -0.72740949, ..., -1.28641644,
        -0.59788466, -0.95437311],
       [ 0.42832821,  1.405063  ,  1.41203019, ..., -1.28641644,
        -0.59788466,  1.04780824],
       ...,
       [-0.89365582, -0.34499581,  1.41203019, ..., -1.28641644,
        -0.59788466, -0.95437311],
       [-0.9783407 , -0.76925249, -0.01426293, ...,  0.77735325,
        -0.59788466, -0.95437311],
       [-0.40102718,  0.75541995, -0.72740949, ...,  0.77735325,
        -0.59788466,  1.04780824]])

방법2 : ColumnTransformer 사용하기

In [25]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# Numeric branch: standardize only.
num_pipeline = make_pipeline(StandardScaler())

# Categorical branch: one-hot encode (first level dropped), then standardize the dummies.
cat_pipeline = make_pipeline(
    OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first'),
    StandardScaler(),
)

# Route columns to a branch by dtype.
_numeric_cols = make_column_selector(dtype_include=np.number)
_object_cols = make_column_selector(dtype_include=object)
preprocessing = make_column_transformer(
    (num_pipeline, _numeric_cols),
    (cat_pipeline, _object_cols),
)

# Fit on the training split; reuse the fitted transformer for the test split.
X_train_preprocessed = preprocessing.fit_transform(X_train)
X_test_preprocessed = preprocessing.transform(X_test)

In [26]:
# Show the ColumnTransformer output for comparison with the manually preprocessed matrix.
X_train_preprocessed

array([[-0.5885416 , -0.0268033 , -0.01426293, ..., -1.28641644,
        -0.59788466,  1.04780824],
       [-0.58496003, -0.56375316, -0.72740949, ..., -1.28641644,
        -0.59788466, -0.95437311],
       [ 0.42832821,  1.405063  ,  1.41203019, ..., -1.28641644,
        -0.59788466,  1.04780824],
       ...,
       [-0.89365582, -0.34499581,  1.41203019, ..., -1.28641644,
        -0.59788466, -0.95437311],
       [-0.9783407 , -0.76925249, -0.01426293, ...,  0.77735325,
        -0.59788466, -0.95437311],
       [-0.40102718,  0.75541995, -0.72740949, ...,  0.77735325,
        -0.59788466,  1.04780824]])

4) 선형회귀

In [27]:
# Ordinary least squares on the standardized training features;
# report R^2 on the held-out test split.
reg = LinearRegression().fit(X_train_std, y_train)
print('결정계수(R^2) =', reg.score(X_test_std, y_test))

결정계수(R^2) = 0.9418621110308901


In [28]:
# Fitted coefficients, one per column of X_train_std (features were standardized before fitting).
reg.coef_

array([-2.59243856e+02,  5.93404414e+02,  4.99217660e+00, -1.37514356e+01,
       -5.86051096e-01, -1.48895412e+00,  1.23124198e+02, -3.23142528e+00,
        1.45324156e+01,  1.04758759e+01])

5) 회귀트리

In [29]:
# Tune the tree depth (1..29) with 10-fold cross-validation, scored by R^2.
depth_grid = {'max_depth': range(1, 30)}
tree_cv = GridSearchCV(
    DecisionTreeRegressor(random_state=1),
    depth_grid,
    scoring='r2',
    cv=10,
    refit=True,   # refit the best depth on the whole training split
    n_jobs=-1,
)
tree_cv.fit(X_train_std, y_train)

# Best mean CV score, the depth that achieved it, and R^2 on the test split.
print(
    tree_cv.best_score_,
    tree_cv.best_params_,
    tree_cv.score(X_test_std, y_test),
    sep='\n',
)

0.8972235528785537
{'max_depth': 9}
0.9027012490097133


6) 랜덤 포레스트

In [30]:
# Tune the forest's tree depth (1..9) with 10-fold cross-validation, scored by R^2.
# NOTE(review): both the forest and the grid search request n_jobs=-1; this nested
# parallelism may oversubscribe CPU cores — consider parallelizing at one level only.
depth_grid = {'max_depth': range(1, 10)}
forest_cv = GridSearchCV(
    RandomForestRegressor(n_estimators=200, random_state=1, n_jobs=-1),
    depth_grid,
    scoring='r2',
    cv=10,
    refit=True,   # refit the best depth on the whole training split
    n_jobs=-1,
)
forest_cv.fit(X_train_std, y_train)

# Best mean CV score, the depth that achieved it, and R^2 on the test split.
print(
    forest_cv.best_score_,
    forest_cv.best_params_,
    forest_cv.score(X_test_std, y_test),
    sep='\n',
)

0.9231796568748152
{'max_depth': 8}
0.9312983225953684


7) 모형 비교

테스트 데이터의 결정계수로 성능을 비교하면 선형회귀(≈ 0.942) > 랜덤 포레스트(≈ 0.931) > 회귀트리(≈ 0.903) 순으로, 선형회귀가 가장 뛰어나다. 뿐만 아니라 계산량을 비교하면 선형회귀 < 회귀트리 < 랜덤 포레스트이므로, 계산 효율 면에서도 선형회귀가 베스트 모형이다.