In [55]:
# 차원축소의 필요성 검증 방안 (How to verify whether dimensionality reduction is needed)

In [56]:
import pandas as pd
import warnings

# Install a blanket "ignore" filter: no warning of any category is shown for
# the rest of the session.
# NOTE(review): this presumably also hides the numerical warnings emitted while
# the VIF values that come out as `inf` are computed further down — consider
# narrowing the filter to specific categories.
warnings.filterwarnings(action='ignore')

In [57]:
# Read the dataset from a CSV file in the current working directory.
df = pd.read_csv('./dataset.csv')
# Last expression of the cell: renders the first five rows as a preview.
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_92,feature_93,feature_94,feature_95,feature_96,feature_97,feature_98,feature_99,feature_100,target
0,0.171125,-0.725577,-0.158922,0.128995,-0.179413,-2.471234,0.012505,0.675658,0.275776,0.158804,...,-0.873112,-0.151305,1.931169,0.996258,0.97943,2.10173,1.350916,0.659288,-0.745382,0
1,0.220779,-0.965366,-0.542392,1.991368,0.423319,-0.660269,0.664292,0.571453,0.61517,0.874445,...,3.300695,0.221097,3.024699,-0.907124,-1.15385,-2.261886,1.393726,-3.27012,0.137104,1
2,-0.945804,-3.603032,0.358725,1.027859,1.243222,0.475499,1.215004,-0.304598,1.342799,-0.349501,...,-2.634325,-0.950468,-1.676426,0.692519,-0.210483,2.141457,-0.265394,-3.509862,1.319618,1
3,0.70259,3.496643,-0.019218,-0.233541,-0.41648,1.759895,-1.57912,0.153862,0.4384,1.383703,...,-0.467512,0.171151,-3.173007,-0.904805,0.258243,2.237245,-0.850129,0.13394,0.820632,0
4,-0.147966,-0.25413,-0.947523,-0.330187,-0.364925,2.597718,-0.757183,-3.102468,-0.144755,0.872969,...,4.750582,0.425558,0.178075,-0.167888,1.466826,1.119306,2.45797,-3.693263,0.396422,0


In [58]:
# Split the frame into the label column (`y`) and everything else (`X`).
# Both are copies, so later changes to them do not write back into `df`.
y = df['target'].copy()
X = df.drop(columns=['target']).copy()

In [59]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the predictors and transform them in one step.
ssc = StandardScaler()
# fit_transform's result is wrapped back into a DataFrame that reuses X's row
# index and column names, so the scaled frame stays label-aligned with X.
scaled_X = pd.DataFrame(ssc.fit_transform(X), index=X.index, columns=X.columns)

In [60]:
# IDR stands for "Is Decomposition Required"; the checker classes below use this abbreviation.

In [61]:
from abc import ABC, abstractmethod


class IDR(ABC):
    """Common interface for "Is Decomposition Required" (IDR) checks.

    The base class itself keeps no state: the constructor ignores its
    argument and calling an instance yields ``None``. Only ``show_reason``
    is abstract, so a concrete check has to implement at least that method
    before it can be instantiated.
    """

    def __init__(self, data):
        """Accept the data to inspect; the base class does not store it."""

    @abstractmethod
    def show_reason(self):
        """Return the evidence behind the verdict (provided by subclasses)."""

    def __call__(self, threshold):
        """Placeholder entry point: returns ``None`` unless overridden."""
        return None

In [62]:
import numpy as np
import pandas as pd

class IDRCorr(IDR):
    """Decide whether dimensionality reduction is warranted from pairwise correlation.

    The check looks at the strict upper triangle of ``data.corr()`` (each
    unordered feature pair once, diagonal excluded) and reports a problem
    when any absolute correlation exceeds the threshold.

    Attributes set along the way:
        data:      the frame handed to the constructor.
        corr_mat:  cached result of ``data.corr()`` (created on first use).
        threshold: the cut-off used by the most recent evaluation.
    """

    # Cut-off used by show_reason() when no evaluation has happened yet;
    # mirrors the default of __call__.
    _DEFAULT_THRESHOLD = 0.8

    def __init__(self, data):
        """Store the frame whose columns are to be compared."""
        self.data = data

    def get_corr_mat(self):
        """Compute, cache and return the correlation matrix of ``data``."""
        self.corr_mat = self.data.corr()
        return self.corr_mat

    def is_decomposition_required(self, threshold):
        """Return True when any feature pair has ``|corr| > threshold``.

        The threshold is remembered so that ``show_reason`` reports against
        the same cut-off even when this method is called directly rather
        than through ``__call__``.
        """
        self.threshold = threshold
        if not hasattr(self, 'corr_mat'):
            self.get_corr_mat()
        # k=1 zeroes the diagonal and the lower triangle, so self-correlation
        # and mirrored duplicates never trigger the check.
        upper_triangle = np.triu(self.corr_mat, k=1)
        return bool(np.any(np.abs(upper_triangle) > threshold))

    def show_reason(self):
        """List the feature pairs whose absolute correlation exceeds the threshold.

        Returns:
            A DataFrame with the columns ``between`` (sorted two-element list
            of column labels) and ``corr`` (their correlation), one row per
            unordered pair, indexed 0..n-1. The frame is empty when no pair
            qualifies.

        The pairs are taken from the same strict upper triangle that
        ``is_decomposition_required`` inspects. This keeps both methods in
        agreement for perfectly correlated features (corr == 1.0), avoids
        relying on the diagonal being exactly 1.0, and needs no
        de-duplication of unhashable list values.
        """
        if not hasattr(self, 'corr_mat'):
            self.get_corr_mat()
        # Fall back to the __call__ default when nothing was evaluated yet.
        threshold = getattr(self, 'threshold', self._DEFAULT_THRESHOLD)

        corr_mat = self.corr_mat
        corr_values = corr_mat.values
        # Boolean mask of the strict upper triangle, narrowed to strong pairs.
        # NaN entries compare as False and are therefore never reported.
        pair_mask = np.triu(np.ones(corr_values.shape, dtype=bool), k=1)
        pair_mask &= np.abs(corr_values) > threshold
        rows, cols = np.nonzero(pair_mask)

        labels = corr_mat.columns
        corr_pairs = pd.DataFrame({
            'between': [sorted((labels[r], labels[c])) for r, c in zip(rows, cols)],
            'corr': corr_values[rows, cols],
        })
        return corr_pairs

    def __call__(self, threshold=0.8):
        """Evaluate the check; shorthand for ``is_decomposition_required``."""
        return self.is_decomposition_required(threshold)

# Work on a copy of the standardised features; the name `data` is reused by
# the PCA check in a later cell.
data = scaled_X.copy()
idr_corr = IDRCorr(data)

# Calling the checker with 0.7 flags any feature pair whose absolute
# correlation is above 0.7.
if idr_corr(0.7):
    print('Decomposition required by [Corr]')
    # `display` is the notebook's rich renderer; it shows the offending pairs.
    display(idr_corr.show_reason())
else:
    print("Pass: No decomposition required.")

Decomposition required by [Corr]


Unnamed: 0,between,corr
0,"[feature_31, feature_64]",0.748642
2,"[feature_81, feature_94]",-0.712607


In [63]:
# 02 VIF(분산팽창계수) 기반의 검증 (Check 02: verification based on the variance inflation factor, VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant


class IDRVif(IDR):
    """Decide whether dimensionality reduction is warranted from VIF scores.

    Attributes set along the way:
        data:        the frame handed to the constructor.
        vif_df:      cached table with one ``Feature`` / ``VIF`` row per column.
        high_vif_df: rows of ``vif_df`` above the most recent threshold.
    """

    def __init__(self, data):
        """Store the frame whose columns are to be scored."""
        self.data = data

    def get_vif_df(self):
        """Compute, cache and return the VIF table for every column of ``data``."""
        # The intercept column is added before scoring and dropped again from
        # the resulting table.
        design = add_constant(self.data)
        matrix = design.values

        scores = []
        for col_idx in range(design.shape[1]):
            scores.append(variance_inflation_factor(matrix, col_idx))

        table = pd.DataFrame({'Feature': design.columns, 'VIF': scores})
        self.vif_df = table[table['Feature'] != 'const']
        return self.vif_df

    def is_decomposition_required(self, threshold):
        """Return True when at least one feature has a VIF above ``threshold``."""
        if not hasattr(self, 'vif_df'):
            self.get_vif_df()
        above_threshold = self.vif_df['VIF'] > threshold
        self.high_vif_df = self.vif_df[above_threshold]
        return not self.high_vif_df.empty

    def show_reason(self):
        """Return the high-VIF rows, or a message string when there are none."""
        if not hasattr(self, 'high_vif_df') or self.high_vif_df.empty:
            return "No features with high VIF."
        return self.high_vif_df

    def __call__(self, threshold=10):
        """Evaluate the check; shorthand for ``is_decomposition_required``."""
        return self.is_decomposition_required(threshold)


# Run the VIF check directly on the standardised features.
idr_vif = IDRVif(scaled_X)
# A threshold of 10 flags every feature whose VIF is above 10.
if idr_vif(10):
    print('Decomposition required by [VIF]')
    # `display` is the notebook's rich renderer; it shows the flagged features.
    display(idr_vif.show_reason())
else:
    print("Pass: No decomposition required.")

Decomposition required by [VIF]


Unnamed: 0,Feature,VIF
2,feature_2,inf
6,feature_6,inf
11,feature_11,inf
21,feature_21,inf
25,feature_25,inf
28,feature_28,inf
31,feature_31,inf
58,feature_58,inf
61,feature_61,inf
64,feature_64,inf


In [65]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np


class IDRPca:
    """Decide whether dimensionality reduction is warranted from a PCA fit.

    The check counts how many leading principal components are needed for
    the cumulative explained-variance ratio to reach a threshold and reports
    a problem when that count is smaller than the number of input columns.

    NOTE(review): unlike IDRCorr and IDRVif this class does not subclass IDR.

    Attributes:
        data:                      the frame handed to the constructor.
        pca:                       fitted PCA object (None until first use).
        explained_variance_ratio_: CUMULATIVE explained-variance ratio of the
                                   fitted components (None until first use).
        threshold:                 cut-off of the most recent evaluation
                                   (0.95 until one has run).
        required_components:       component count from the most recent
                                   evaluation (None until one has run).
    """

    def __init__(self, data):
        """Store the data; the PCA itself is fitted lazily."""
        self.data = data
        self.pca = None
        self.explained_variance_ratio_ = None
        self.threshold = 0.95
        self.required_components = None

    def calculate_pca(self):
        """Fit a full PCA on ``data`` and cache the cumulative variance ratio."""
        self.pca = PCA()
        self.pca.fit(self.data)
        self.explained_variance_ratio_ = np.cumsum(self.pca.explained_variance_ratio_)

    def is_decomposition_required(self, threshold=0.95):
        """Return True when fewer components than columns reach ``threshold``.

        Args:
            threshold: target cumulative explained-variance ratio.

        Returns:
            bool: True when ``required_components`` is smaller than the
            number of columns in ``data``.
        """
        if self.explained_variance_ratio_ is None:
            self.calculate_pca()
        cumulative = self.explained_variance_ratio_
        self.threshold = threshold

        # Components still below the target, plus the one that crosses it.
        required_components = int(np.sum(cumulative < threshold)) + 1
        # If the target is never reached (e.g. threshold 1.0 with a cumulative
        # sum of 0.9999999 from rounding) the count would overshoot by one;
        # cap it at the number of components that actually exist.
        required_components = min(required_components, len(cumulative))

        self.required_components = required_components
        return bool(required_components < self.data.shape[1])

    def show_reason(self):
        """Return a sentence explaining the verdict.

        Can be called before ``is_decomposition_required``; in that case the
        PCA is fitted and the component count is derived from the stored
        threshold first.
        """
        if self.explained_variance_ratio_ is None:
            self.calculate_pca()
        if getattr(self, 'required_components', None) is None:
            self.is_decomposition_required(getattr(self, 'threshold', 0.95))

        if self.required_components < self.data.shape[1]:
            return (f"Only {self.required_components} components "
                    f"are needed to explain {self.explained_variance_ratio_[self.required_components-1]*100:.2f}% "
                    f"of the variance, compared to the original {self.data.shape[1]} dimensions.")
        else:
            return "No decomposition required: All components are necessary to retain sufficient variance."

    def __call__(self, threshold=0.95):
        """Evaluate the check; shorthand for ``is_decomposition_required``."""
        return self.is_decomposition_required(threshold)


# `data` is the copy of the scaled features created in the correlation cell.
idr_pca = IDRPca(data)
# 0.95 is the cumulative explained-variance ratio the components must reach.
if idr_pca(0.95):
    print('Decomposition required by [PCA]')
    # show_reason() returns a plain string here, so `display` renders its repr.
    display(idr_pca.show_reason())
else:
    print("Pass: No decomposition required.")

Decomposition required by [PCA]


'Only 83 components are needed to explain 95.47% of the variance, compared to the original 100 dimensions.'