In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('./sample_data.csv')
df.head()

In [3]:
y = df['target'].copy()
X = df.drop(columns=['target']).copy()

from sklearn.preprocessing import StandardScaler
ssc = StandardScaler()

scaled_X = pd.DataFrame(ssc.fit_transform(X), index=X.index, columns=X.columns)

In [None]:
# 01 based on PCA
from sklearn.decomposition import PCA

# 1) 1차 적합
pca = PCA()
pca.fit(scaled_X)

# 2) criterion을 위한 pca 갯수 확인
criterion = 0.95

explained_variance_ratio = np.cumsum(pca.explained_variance_ratio_)
n_components = np.argmax(explained_variance_ratio >= criterion) + 1

# 3) 2차 적합
pca = PCA(n_components=n_components)
pca_scaled_decomposed_X = pca.fit_transform(scaled_X)

# 4) 결과물 DataFrame로 변환
pca_scaled_decomposed_X = pd.DataFrame(pca_scaled_decomposed_X, columns=pca.get_feature_names_out())
pca_scaled_decomposed_X

In [None]:
# 01 based on SVD
# pca와 결과 동일 / 희소 데이터에 강점
from sklearn.decomposition import TruncatedSVD

# 1) 1차 적합
svd = TruncatedSVD(n_components=len(scaled_X.columns))
svd.fit(scaled_X)

# 2) criterion을 위한 svd 갯수 확인
criterion = 0.95

explained_variance_ratio = np.cumsum(svd.explained_variance_ratio_)
n_components = np.argmax(explained_variance_ratio >= criterion) + 1
n_components

# 3) 2차 적합
svd = TruncatedSVD(n_components=n_components)
svd_scaled_decomposed_X = svd.fit_transform(scaled_X)

# 4) 결과물 DataFrame로 변환
svd_scaled_decomposed_X = pd.DataFrame(svd_scaled_decomposed_X, columns=svd.get_feature_names_out())
svd_scaled_decomposed_X

In [None]:
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=n_components, random_state=42, method='exact')
# # default method barnes_hut은 빠름 / 하지만 최대 n_components가 3
# # exact를 하면 계산비용이 증가 / 하지만 n_components가 4이상도 가능
# tsne_scaled_decomposed_X = tsne.fit_transform(scaled_X.sample(500))
# tsne_scaled_decomposed_X = pd.DataFrame(tsne_scaled_decomposed_X, columns=tsne.get_feature_names_out())
# tsne_scaled_decomposed_X

In [7]:
from sklearn.manifold import Isomap
isomap = Isomap(n_components=n_components)
isomap_scaled_decomposed_X = isomap.fit_transform(scaled_X.sample(500))
isomap_scaled_decomposed_X = pd.DataFrame(isomap_scaled_decomposed_X, columns=isomap.get_feature_names_out())
isomap_scaled_decomposed_X

Unnamed: 0,isomap0,isomap1,isomap2,isomap3,isomap4,isomap5
0,-2.840024,-1.526056,-0.557812,-0.698347,0.845622,0.123895
1,-2.387329,-2.191925,-0.115647,0.162653,0.366112,0.379905
2,-4.185533,2.260146,-1.020447,0.235414,0.324135,1.316304
3,-2.331825,-0.898431,0.165755,0.619668,-0.578478,0.279465
4,-2.207018,-0.407122,0.284156,1.149626,-0.957953,0.242877
...,...,...,...,...,...,...
495,-5.203768,4.158118,-0.656371,3.447946,-0.527773,1.859715
496,4.325369,2.042511,-1.251492,0.909208,0.216360,0.766335
497,-2.752070,-2.314540,0.633844,0.752933,-0.405230,-0.194897
498,-2.672607,0.086414,-0.124605,1.823219,-1.098712,0.369930


In [9]:
from sklearn.feature_selection import VarianceThreshold

# 분산 임계값 설정
threshold = 0.3
selector = VarianceThreshold(threshold=threshold)
X_reduced = selector.fit_transform(scaled_X)

print("축소된 데이터 크기:", X_reduced.shape)

축소된 데이터 크기: (20640, 8)
