In [1]:
# 차원축소의 필요성 검증 방안

In [2]:
import pandas as pd

In [3]:
# Load the sample dataset from the working directory and preview its first rows.
df = pd.read_csv('./sample_data.csv')
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [6]:
# Separate the 'target' column (y) from the remaining columns (X).
# Both are copies, so later changes to them do not touch df.
y = df['target'].copy()
X = df.drop(columns=['target']).copy()

In [7]:
from sklearn.preprocessing import StandardScaler
ssc = StandardScaler()

# Fit the scaler on X and wrap the transformed array back into a DataFrame
# that carries X's original index and column labels.
scaled_X = pd.DataFrame(ssc.fit_transform(X), index=X.index, columns=X.columns)

In [23]:
# 01 상관계수 기반의 검증


# 1) Build the correlation matrix
def get_corr_mat(data):
    """Return the pairwise correlation matrix of the columns of ``data``.

    Delegates to ``data.corr()`` with its default arguments, so the result
    is a square frame labelled by the column names on both axes.
    """
    correlation_matrix = data.corr()
    return correlation_matrix


# 2) Filter the correlation matrix
def filter_corr_mat_by_criterion(corr_mat, criterion):
    """Keep only the strong correlations between *different* features.

    Parameters
    ----------
    corr_mat : pandas.DataFrame
        Correlation matrix whose row and column labels are feature names.
    criterion : float
        Threshold; an entry is kept when its absolute value is strictly
        greater than this.

    Returns
    -------
    pandas.DataFrame
        Same shape and labels as ``corr_mat``. Entries that are not above
        the threshold, and every feature-with-itself entry, are NaN.
    """
    # Identify self-correlations by label instead of by value. Comparing
    # against 1.0 would also throw away perfectly correlated feature pairs,
    # and would miss a diagonal that is off from 1.0 by float rounding.
    row_labels = corr_mat.index.to_numpy()[:, None]
    col_labels = corr_mat.columns.to_numpy()[None, :]
    is_other_feature = pd.DataFrame(
        row_labels != col_labels,
        index=corr_mat.index,
        columns=corr_mat.columns,
    )

    is_strong = corr_mat.abs() > criterion
    high_corr_mat = corr_mat.where(is_strong & is_other_feature)
    return high_corr_mat


# 3) Reshape the correlation matrix
def format_corr_mat(corr_mat):
    """Turn a (filtered) correlation matrix into a table of unique pairs.

    Parameters
    ----------
    corr_mat : pandas.DataFrame
        Correlation matrix in which uninteresting entries are NaN.

    Returns
    -------
    pandas.DataFrame
        Columns ``between`` (sorted two-element list of feature labels) and
        ``corr`` (the correlation value). Each unordered pair appears once,
        keeping its first occurrence; the row index is the position of that
        occurrence among the non-NaN entries in row-major order. The frame is
        empty when the matrix holds no non-NaN entry.
    """
    # Drop NaN explicitly so the result does not depend on whether stack()
    # itself removes missing values in the installed pandas version.
    stacked = corr_mat.stack().dropna()

    # Read the pair labels from the index levels by position; this works
    # whether or not the matrix axes carry names.
    row_features = stacked.index.get_level_values(0)
    col_features = stacked.index.get_level_values(1)
    pair_keys = [tuple(sorted(pair)) for pair in zip(row_features, col_features)]

    corr_pairs = pd.DataFrame({
        'between': [list(key) for key in pair_keys],
        'corr': stacked.to_numpy(),
    })

    # De-duplicate on hashable tuple keys (lists cannot be hashed reliably),
    # keeping the first occurrence of every unordered pair.
    seen = set()
    is_first = []
    for key in pair_keys:
        is_first.append(key not in seen)
        seen.add(key)
    keep = pd.Series(is_first, index=corr_pairs.index, dtype=bool)

    return corr_pairs.loc[keep, ['between', 'corr']]


# Work on a copy so scaled_X itself is left untouched.
data = scaled_X.copy()

# Build the correlation matrix, keep only entries above the 0.7 threshold,
# then reshape the survivors into a table with one row per feature pair.
corr_mat = get_corr_mat(data)
corr_mat = filter_corr_mat_by_criterion(corr_mat, 0.7)
corr_pairs = format_corr_mat(corr_mat)
corr_pairs

Unnamed: 0,between,corr
0,"[AveBedrms, AveRooms]",0.847621
2,"[Latitude, Longitude]",-0.924664


In [36]:
# 02 VIF(분산팽창계수) 기반의 검증
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


# Compute the VIF
def calc_vif(data):
    """Return a table with the variance inflation factor of each column.

    ``variance_inflation_factor`` is evaluated once per column position of
    ``data``. The row labelled 'const' (if any) is removed from the result,
    so an intercept column does not show up as a feature.

    Returns a DataFrame with the columns 'Feature' and 'VIF'.
    """
    design = data.values
    scores = []
    for position in range(data.shape[1]):
        scores.append(variance_inflation_factor(design, position))

    table = pd.DataFrame({'Feature': data.columns, 'VIF': scores})
    not_intercept = table['Feature'] != 'const'
    return table[not_intercept]


data = scaled_X.copy()
# Add the column produced by add_constant before computing VIFs;
# calc_vif filters the 'const' row out of its result.
data = add_constant(data)

vif_df = calc_vif(data)
# Display features from highest to lowest VIF.
vif_df.sort_values('VIF',ascending=False)

Unnamed: 0,Feature,VIF
7,Latitude,9.297624
8,Longitude,8.962263
3,AveRooms,8.342786
4,AveBedrms,6.994995
1,MedInc,2.501295
2,HouseAge,1.241254
5,Population,1.138125
6,AveOccup,1.008324


In [47]:
# 03 Based on PCA
import numpy as np
from sklearn.decomposition import PCA

# Work on a copy of the scaled features so scaled_X itself is left untouched.
data = scaled_X.copy()


def fit_pca(data):
    """Fit a PCA with default settings on ``data`` and return the fitted estimator."""
    return PCA().fit(data)


def get_n_components(pca, criterion):
    """Return how many leading components are needed to reach ``criterion``.

    Parameters
    ----------
    pca : object
        Fitted estimator exposing ``explained_variance_ratio_``.
    criterion : float
        Target cumulative explained-variance ratio.

    Returns
    -------
    int
        The smallest number of leading components whose cumulative
        explained-variance ratio is at least ``criterion``. When even all
        components together stay below ``criterion``, the total number of
        components is returned.
    """
    cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
    reached = cumulative_ratio >= criterion

    # np.argmax on an all-False mask returns 0, which would wrongly report
    # that a single component is enough; fall back to "all components".
    if not reached.any():
        return len(cumulative_ratio)

    # argmax yields the index of the first True; +1 converts it to a count.
    n_components = int(np.argmax(reached)) + 1
    return n_components


pca = fit_pca(data)
# Number of leading components needed for the cumulative explained-variance
# ratio to reach 0.95.
n_components = get_n_components(pca, 0.95)
n_components

6