In [3]:
# 데이터 분할
# 분석 데이터를 학습용 데이터와, 테스트 데이터로 나누기 위함
# scikit learn의 tran_test_split 함수 사용
import pandas as pd
from sklearn.datasets import load_iris

IRIS = load_iris()
iris = pd.DataFrame(IRIS.data, columns=IRIS.feature_names)
iris["class"] = IRIS.target
iris["class"] = iris["class"].map({0:'Setosa', 1:'Versicolor', 2:'Virginica'})
iris.columns = ['sep_len', 'sep_wid', 'pet_len', 'pet_wid', 'class']
del IRIS
iris


Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,class
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(iris.drop(columns='class'), iris['class'], test_size=0.2, random_state=1004)
print("X 학습: ", X_train.shape, "X test: ", X_test.shape)
print("y 학습: ", y_train.shape, "y test: ", y_test.shape)

X 학습:  (120, 4) X test:  (30, 4)
y 학습:  (120,) y test:  (30,)


In [5]:
X_train.head(3)

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid
87,6.3,2.3,4.4,1.3
67,5.8,2.7,4.1,1.0
131,7.9,3.8,6.4,2.0


In [6]:
y_train.head(3)

87     Versicolor
67     Versicolor
131     Virginica
Name: class, dtype: object

In [7]:
# 랜덤 추출에서 각 계층별 추출량이 다를 수 있음을 확인
iris['class'].value_counts()

class
Setosa        50
Versicolor    50
Virginica     50
Name: count, dtype: int64

In [8]:
y_train.value_counts()

class
Versicolor    41
Setosa        40
Virginica     39
Name: count, dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(iris.drop(columns='class'), iris['class'], test_size=0.2, random_state=1004, stratify=iris['class'])
print("X 학습: ", X_train.shape, "X test: ", X_test.shape)
print("y 학습: ", y_train.shape, "y test: ", y_test.shape)

X 학습:  (120, 4) X test:  (30, 4)
y 학습:  (120,) y test:  (30,)


In [10]:
y_train.value_counts()

class
Versicolor    40
Virginica     40
Setosa        40
Name: count, dtype: int64

In [11]:
# 데이터 스케일링이란 서로 값의 범위 차이가 큰 여러 데이터들을 비슷한 수준의 차이가 나도록 스케일링하는 것

from sklearn.preprocessing import StandardScaler
StdScaler = StandardScaler()

In [12]:

StdScaler.fit(X_train)
X_train

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid
52,6.9,3.1,4.9,1.5
121,5.6,2.8,4.9,2.0
95,5.7,3.0,4.2,1.2
136,6.3,3.4,5.6,2.4
26,5.0,3.4,1.6,0.4
...,...,...,...,...
59,5.2,2.7,3.9,1.4
71,6.1,2.8,4.0,1.3
120,6.9,3.2,5.7,2.3
148,6.2,3.4,5.4,2.3


In [13]:
X_train_sc = StdScaler.transform(X_train)
X_test_sc = StdScaler.transform(X_test)

In [22]:
print(type(X_train))
print(type(X_train_sc))
print(len(X_train_sc))

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
120


In [23]:
print("\t\t(min, max) (mean std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(),X_train_sc.std()))
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(),X_test_sc.std()))

		(min, max) (mean std)
Train_scaled (-2.37, 3.04) (0.00, 1.00)
Train_scaled (-1.76, 2.48) (-0.01, 0.97)


In [25]:
from sklearn.preprocessing import MinMaxScaler
MmScaler = MinMaxScaler()

In [26]:
MmScaler.fit(X_train)
X_train_sc = MmScaler.transform(X_train)
X_test_sc = MmScaler.transform(X_test)

In [27]:
print("\t\t(min, max) (mean std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(),X_train_sc.std()))
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(),X_test_sc.std()))

		(min, max) (mean std)
Train_scaled (0.00, 1.00) (0.46, 0.27)
Train_scaled (0.03, 1.06) (0.45, 0.26)


In [28]:
from sklearn.preprocessing import MaxAbsScaler
MaScaler = MinMaxScaler()

In [29]:
MaScaler.fit(X_train)
X_train_sc = MaScaler.transform(X_train)
X_test_sc = MaScaler.transform(X_test)

In [30]:
print("\t\t(min, max) (mean std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(),X_train_sc.std()))
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(),X_test_sc.std()))

		(min, max) (mean std)
Train_scaled (0.00, 1.00) (0.46, 0.27)
Train_scaled (0.03, 1.06) (0.45, 0.26)


In [31]:
from sklearn.preprocessing import RobustScaler
RbScaler = RobustScaler()

In [32]:
RbScaler.fit(X_train)
X_train_sc = RbScaler.transform(X_train)
X_test_sc = RbScaler.transform(X_test)

In [33]:
print("\t\t(min, max) (mean std)")
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_train_sc.min(), X_train_sc.max(), X_train_sc.mean(),X_train_sc.std()))
print("Train_scaled (%.2f, %.2f) (%.2f, %.2f)"%(X_test_sc.min(), X_test_sc.max(), X_test_sc.mean(),X_test_sc.std()))

		(min, max) (mean std)
Train_scaled (-1.90, 2.67) (-0.02, 0.65)
Train_scaled (-1.14, 1.90) (-0.02, 0.62)


In [34]:
pd.DataFrame(X_train_sc).head(3)

Unnamed: 0,0,1,2,3
0,0.846154,0.190476,0.157143,0.133333
1,-0.153846,-0.380952,0.157143,0.466667
2,-0.076923,0.0,-0.042857,-0.066667


In [35]:
X_original = RbScaler.inverse_transform(X_train_sc)
pd.DataFrame(X_original).head(3)

Unnamed: 0,0,1,2,3
0,6.9,3.1,4.9,1.5
1,5.6,2.8,4.9,2.0
2,5.7,3.0,4.2,1.2


In [36]:
X_train.head(3)

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid
52,6.9,3.1,4.9,1.5
121,5.6,2.8,4.9,2.0
95,5.7,3.0,4.2,1.2
