In [1]:
# Data scaling
# Commonly used scalers include StandardScaler, MinMaxScaler, Normalizer and RobustScaler.
# Scaling makes features that live on very different ranges easier to compare and analyze,
# and helps prevent numerical overflow and underflow.
# It also lowers the condition number of the covariance matrix of the independent variables,
# which improves stability and convergence speed during optimization.
# Most scalers are sensitive to outliers, so outliers should be handled before scaling.

In [2]:
# Generate a small synthetic dataset.
import numpy as np
import pandas as pd

# Fixed seed: without it every kernel restart draws different values, so neither
# this frame nor anything computed from it can be reproduced.
SEED = 42
rng = np.random.default_rng(SEED)

# Three integer features on deliberately different scales
# (upper bounds are exclusive, 100 samples each).
X = rng.integers(1, 10, size=100)
Y = rng.integers(10000, 20000, size=100)
Z = rng.integers(100, 1000, size=100)

df = pd.DataFrame({"X": X, "Y": Y, "Z": Z})
df.head(5)

Unnamed: 0,X,Y,Z
0,4,19199,975
1,7,18852,463
2,7,17696,541
3,8,16219,746
4,1,16999,530


In [3]:
# StandardScaler
# Rescales every feature so that it has mean 0 and variance 1.
#
# fit() stores the distribution parameters on the scaler object;
# transform() applies the transformation to the data it is given.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(df)

# NOTE(review): wrapping the returned array in a bare DataFrame drops the
# original column labels, so the columns show up as 0, 1, 2.
scaled = pd.DataFrame(scaler.transform(df))
scaled

Unnamed: 0,0,1,2
0,-0.490750,1.277540,1.721448
1,0.668502,1.168442,-0.331905
2,0.668502,0.804993,-0.019090
3,1.054919,0.340620,0.803054
4,-1.650002,0.585854,-0.063205
...,...,...,...
95,-0.877167,0.557873,-1.190143
96,-0.877167,-0.324655,1.027639
97,1.054919,1.179132,0.646646
98,1.441337,-1.308420,-0.933474


In [4]:
# RobustScaler
# Transforms each feature so that its median becomes 0 and its IQR becomes 1.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
scaler.fit(df)

# NOTE(review): wrapping the returned array in a bare DataFrame drops the
# original column labels, so the columns show up as 0, 1, 2.
scaled = pd.DataFrame(scaler.transform(df))
scaled

Unnamed: 0,0,1,2
0,-0.50,0.602403,0.961358
1,0.25,0.542620,-0.237705
2,0.25,0.343455,-0.055035
3,0.50,0.088987,0.425059
4,-1.25,0.223371,-0.080796
...,...,...,...
95,-0.75,0.208037,-0.738876
96,-0.75,-0.275574,0.556206
97,0.50,0.548477,0.333724
98,0.75,-0.814662,-0.588993


In [5]:
# MinMaxScaler
# Transforms each feature so that its minimum becomes 0 and its maximum becomes 1.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(df)

# NOTE(review): wrapping the returned array in a bare DataFrame drops the
# original column labels, so the columns show up as 0, 1, 2.
minmax_values = scaler.transform(df)
scaled = pd.DataFrame(minmax_values)
scaled

Unnamed: 0,0,1,2
0,0.375,0.922190,0.974273
1,0.750,0.887396,0.401566
2,0.750,0.771483,0.488814
3,0.875,0.623383,0.718121
4,0.000,0.701594,0.476510
...,...,...,...
95,0.250,0.692670,0.162192
96,0.250,0.411210,0.780761
97,0.875,0.890805,0.674497
98,1.000,0.097463,0.233781


In [6]:
# Normalizer
# Unlike the scalers above, this works per row: each row is normalized so that
# its Euclidean length becomes 1.
from sklearn.preprocessing import Normalizer

scaler = Normalizer()
scaler.fit(df)

# NOTE(review): wrapping the returned array in a bare DataFrame drops the
# original column labels, so the columns show up as 0, 1, 2.
unit_rows = scaler.transform(df)
scaled = pd.DataFrame(unit_rows)
scaled

Unnamed: 0,0,1,2
0,0.000208,0.998713,0.050719
1,0.000371,0.999698,0.024552
2,0.000395,0.999533,0.030558
3,0.000493,0.998944,0.045947
4,0.000059,0.999514,0.031163
...,...,...,...
95,0.000177,0.999892,0.014723
96,0.000212,0.998387,0.056776
97,0.000423,0.999300,0.037409
98,0.000820,0.999593,0.028510
