<img src="https://i.esdrop.com/d/7o0dj05m8rnz/JNGCMedl18.png" width="45%">

# Data Preprocessing

---

# 데이터 전처리

In [1]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# 1 Data Scaling
- Feature(X값): 수치형 데이터
- [scikit-learn preprocessing](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing)

1. Standard Scaling
2. MinMax Scaling
3. Robust Scaling
4. Normalization Scaling
5. Log Scaling
6. Exponential Scaling

##### <주의> 학습 데이터, 테스트 데이터 스케일 조정
- 스케일러는 학습 데이터로만 학습(fit)하고, 같은 스케일러로 학습 데이터와 테스트 데이터를 모두 변환(transform)한다.

#### 데이터 로딩

In [2]:
# Visual Python: Data Analysis > File
# Load the iris CSV from a path relative to the current working directory.
df_iris = pd.read_csv('./data/iris.csv')
# Bare expression: shows the DataFrame as the cell output.
df_iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica


#### 데이터 분할: 학습 데이터 + 테스트 데이터

In [3]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

# Features: the four measurement columns. Label: 'variety'.
# Neither test_size nor random_state is passed, so scikit-learn's defaults
# apply and the shuffle (and every number derived from it) varies per run.
X_train, X_test, y_train, y_test = train_test_split(
    df_iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']],
    df_iris['variety'],
)

### 1.1 Standard Scaling

#### 1.1.1 스케일러 생성

In [4]:
# Visual Python: Machine Learning > Data Prep
from sklearn.preprocessing import StandardScaler

# Unfitted scaler with default settings; it has learned nothing until fit() is called.
scaler = StandardScaler()

#### 1.1.2 스케일러 학습 - 규칙 설정

In [5]:
# Visual Python: Machine Learning > Fit/Predict
# Learn the scaling rule from the training split only, so that no statistics
# from the test split influence the scaler.
scaler.fit(X_train)

#### 1.1.3 학습 데이터 변환

In [6]:
# Visual Python: Machine Learning > Fit/Predict
# Apply the already-fitted scaler to the training features.
X_train_scaled = scaler.transform(X_train)
# Bare expression: shows the scaled array as the cell output.
X_train_scaled

array([[-0.96083593,  1.68252428, -1.3410563 , -1.25304594],
       [ 0.50829205, -1.62735955,  0.31890973,  0.11613596],
       [-0.10384461, -0.74472386,  0.71959118,  0.9376451 ],
       [-1.2056906 ,  1.24120643, -1.3982965 , -1.52688232],
       [-1.2056906 , -1.1860417 ,  0.37614994,  0.66380872],
       [ 1.8549927 , -0.52406494,  1.29199326,  0.9376451 ],
       [-1.45054526,  0.35857075, -1.28381609, -1.38996413],
       [-1.08326327,  1.02054751, -1.28381609, -0.84229137],
       [ 0.38586472, -1.84801847,  0.37614994,  0.38997234],
       [-0.59355394,  0.79988859, -1.22657588, -1.38996413],
       [ 0.38586472, -0.52406494,  0.54787056,  0.80072691],
       [ 1.12042871, -0.52406494,  0.54787056,  0.25305415],
       [-0.96083593,  1.02054751, -1.3982965 , -1.38996413],
       [-0.10384461,  2.12384212, -1.51277692, -1.38996413],
       [-0.96083593,  1.02054751, -1.3982965 , -1.25304594],
       [-1.57297259,  1.24120643, -1.62725733, -1.38996413],
       [ 2.09984736, -0.

#### 1.1.4 테스트 데이터 변환

In [7]:
# Visual Python: Machine Learning > Fit/Predict
# Reuse the scaler fitted on X_train; it is deliberately NOT refitted on the
# test split, so both splits are scaled by the same rule.
X_test_scaled = scaler.transform(X_test)
# Bare expression: shows the scaled array as the cell output.
X_test_scaled

array([[-1.57297259,  0.79988859, -1.3982965 , -1.25304594],
       [ 1.61013803,  1.24120643,  1.29199326,  1.75915424],
       [ 0.63071938, -0.52406494,  1.00579222,  1.34839967],
       [ 0.50829205, -1.1860417 ,  0.60511077,  0.38997234],
       [-1.45054526,  0.35857075, -1.45553671, -1.38996413],
       [ 0.99800137,  0.13791183,  0.49063035,  0.38997234],
       [ 0.01858272,  0.35857075,  0.54787056,  0.80072691],
       [-0.8384086 ,  0.79988859, -1.3982965 , -1.38996413],
       [-0.47112661, -0.96538278,  0.31890973, -0.02078223],
       [ 1.24285604,  0.13791183,  0.71959118,  1.48531786],
       [-1.81782726, -0.30340602, -1.3982965 , -1.38996413],
       [-0.96083593,  1.68252428, -1.11209546, -1.11612775],
       [-0.96083593,  0.57922967, -1.22657588, -0.97920956],
       [ 0.01858272, -0.0827471 ,  0.71959118,  0.80072691],
       [ 0.14101005, -0.0827471 ,  0.54787056,  0.80072691],
       [ 1.24285604,  0.13791183,  0.89131181,  1.21148148],
       [ 0.38586472,  0.

In [8]:
# Column-wise (axis=0) minimum and maximum of the scaled test split,
# i.e. one value per feature. The printed labels read
# "per-feature minimum / maximum after scaling".
print('스케일 조정 후 특성별 최소값:', X_test_scaled.min(axis=0))
print('스케일 조정 후 특성별 최대값:', X_test_scaled.max(axis=0))

스케일 조정 후 특성별 최소값: [-1.81782726 -1.84801847 -1.45553671 -1.52688232]
스케일 조정 후 특성별 최대값: [2.22227469 1.68252428 1.29199326 1.75915424]


### 1.2 MinMax Scaling

#### 1.2.1 스케일러 생성

In [9]:
# Visual Python: Machine Learning > Data Prep
from sklearn.preprocessing import MinMaxScaler

# Unfitted min-max scaler with default settings, kept in its own variable
# (scaler_m) so it does not replace the StandardScaler held in `scaler`.
scaler_m = MinMaxScaler()

#### 1.2.2 스케일러 학습 - 규칙 설정

In [10]:
# Visual Python: Machine Learning > Fit/Predict
# Learn each feature's min and max from the training split only.
scaler_m.fit(X_train)

#### 1.2.3 학습 데이터 변환

In [11]:
# Visual Python: Machine Learning > Fit/Predict
# Apply the already-fitted min-max scaler to the training features.
X_train_scaled_m = scaler_m.transform(X_train)
# Bare expression: shows the scaled array as the cell output.
X_train_scaled_m

array([[0.22222222, 0.75      , 0.08474576, 0.08333333],
       [0.55555556, 0.125     , 0.57627119, 0.5       ],
       [0.41666667, 0.29166667, 0.69491525, 0.75      ],
       [0.16666667, 0.66666667, 0.06779661, 0.        ],
       [0.16666667, 0.20833333, 0.59322034, 0.66666667],
       [0.86111111, 0.33333333, 0.86440678, 0.75      ],
       [0.11111111, 0.5       , 0.10169492, 0.04166667],
       [0.19444444, 0.625     , 0.10169492, 0.20833333],
       [0.52777778, 0.08333333, 0.59322034, 0.58333333],
       [0.30555556, 0.58333333, 0.11864407, 0.04166667],
       [0.52777778, 0.33333333, 0.6440678 , 0.70833333],
       [0.69444444, 0.33333333, 0.6440678 , 0.54166667],
       [0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.22222222, 0.625     , 0.06779661, 0.08333333],
       [0.08333333, 0.66666667, 0.        , 0.04166667],
       [0.91666667, 0.41666667, 0.94915254, 0.83333333],
       [0.13888889, 0.45833333,

In [12]:
# Column-wise (axis=0) minimum and maximum of the scaled training split.
# Because the scaler was fitted on this same split, each feature spans
# exactly 0 to 1. The printed labels read
# "per-feature minimum / maximum after scaling".
print('스케일 조정 후 특성별 최소값:', X_train_scaled_m.min(axis=0))
print('스케일 조정 후 특성별 최대값:', X_train_scaled_m.max(axis=0))

스케일 조정 후 특성별 최소값: [0. 0. 0. 0.]
스케일 조정 후 특성별 최대값: [1. 1. 1. 1.]


#### 1.2.4 테스트 데이터 변환

In [13]:
# Visual Python: Machine Learning > Fit/Predict
# Reuse the scaler fitted on X_train; it is deliberately NOT refitted on the
# test split.
X_test_scaled_m = scaler_m.transform(X_test)
# Bare expression: shows the scaled array as the cell output.
X_test_scaled_m

array([[0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.80555556, 0.66666667, 0.86440678, 1.        ],
       [0.58333333, 0.33333333, 0.77966102, 0.875     ],
       [0.55555556, 0.20833333, 0.66101695, 0.58333333],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.66666667, 0.45833333, 0.62711864, 0.58333333],
       [0.44444444, 0.5       , 0.6440678 , 0.70833333],
       [0.25      , 0.58333333, 0.06779661, 0.04166667],
       [0.33333333, 0.25      , 0.57627119, 0.45833333],
       [0.72222222, 0.45833333, 0.69491525, 0.91666667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.22222222, 0.75      , 0.15254237, 0.125     ],
       [0.22222222, 0.54166667, 0.11864407, 0.16666667],
       [0.44444444, 0.41666667, 0.69491525, 0.70833333],
       [0.47222222, 0.41666667, 0.6440678 , 0.70833333],
       [0.72222222, 0.45833333, 0.74576271, 0.83333333],
       [0.52777778, 0.58333333, 0.74576271, 0.91666667],
       [0.19444444, 0.58333333,

In [14]:
# Column-wise (axis=0) minimum and maximum of the scaled test split.
# The min/max used for scaling came from the training split, so the test
# values are not guaranteed to span exactly 0 to 1.
# The printed labels read "per-feature minimum / maximum after scaling".
print('스케일 조정 후 특성별 최소값:', X_test_scaled_m.min(axis=0))
print('스케일 조정 후 특성별 최대값:', X_test_scaled_m.max(axis=0))

스케일 조정 후 특성별 최소값: [0.02777778 0.08333333 0.05084746 0.        ]
스케일 조정 후 특성별 최대값: [0.94444444 0.75       0.86440678 1.        ]


### 1.7 스케일링 적용 - 지도학습

In [15]:
# Visual Python: Machine Learning > Data Sets
from sklearn.datasets import load_breast_cancer

# Raw dataset object: feature matrix, feature names and target vector.
ldata = load_breast_cancer()

# Create DataFrame: one column per feature, with the target appended as the
# last column ('target').
df_ldata = pd.DataFrame(
    data=ldata.data,
    columns=ldata.feature_names,
).assign(target=ldata.target)
df_ldata

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,0
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,0
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,0.2597,0.09744,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,0
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,0
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,0
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,0
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,0


#### 1.7.1 데이터 분할

#### 1.7.2 Support Vector Machine

In [16]:
# Visual Python: Machine Learning > Pipeline
# Baseline run: an SVC trained on the raw (unscaled) breast-cancer features.
from IPython.display import Markdown, display
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# [1] Data Split
# df_ldata holds the 30 feature columns plus 'target', so dropping 'target'
# selects exactly the feature columns, in their original order.
# This rebinds X_train / X_test / y_train / y_test from the earlier iris split.
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df_ldata.drop(columns='target'),
    df_ldata['target'],
)

# [2] Classifier (default hyper-parameters)
svc = SVC()

# [3] Fit
svc.fit(X_train, y_train)

# [4] Predict
pred = svc.predict(X_test)

# [5] Evaluation
# Confusion matrix: rows are the true labels, columns the predicted labels,
# and margins=True adds the 'All' totals.
display(Markdown('### Confusion Matrix'))
display(pd.crosstab(y_test, pred, margins=True))
# Per-class precision / recall / F1 and overall accuracy.
print(metrics.classification_report(y_test, pred))

### Confusion Matrix

col_0,0,1,All
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,53,7,60
1,2,81,83
All,55,88,143


              precision    recall  f1-score   support

           0       0.96      0.88      0.92        60
           1       0.92      0.98      0.95        83

    accuracy                           0.94       143
   macro avg       0.94      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143



#### 1.7.3 Support Vector Machine - StandardScaler 적용

In [17]:
# Visual Python: Machine Learning > Pipeline
from sklearn.preprocessing import StandardScaler

# [1] Data Prep
# Rebinds `scaler` to a fresh, unfitted StandardScaler.
scaler = StandardScaler()

# [2] Fit + [3] Transform
# Learn the scaling rule from the training split and scale that same split
# in a single call; the fitted scaler stays available for the test split.
X_train_scaled = scaler.fit_transform(X_train)

In [18]:
# Scale the test split with the scaler fitted on the training split (no refit).
X_test_scaled = scaler.transform(X_test)

In [19]:
# Visual Python: Machine Learning > Pipeline
# SVC trained on the StandardScaler-transformed training split and evaluated
# on the identically transformed test split.
from IPython.display import Markdown, display
from sklearn import metrics
from sklearn.svm import SVC

# [1] Classifier + [2] Fit
# fit() returns the estimator itself, so creation and fitting can be chained.
svc_s = SVC().fit(X_train_scaled, y_train)

# [3] Predict
pred = svc_s.predict(X_test_scaled)

# [4] Evaluation
# Confusion matrix: rows are the true labels, columns the predicted labels,
# and margins=True adds the 'All' totals.
display(Markdown('### Confusion Matrix'))
display(pd.crosstab(y_test, pred, margins=True))
# Per-class precision / recall / F1 and overall accuracy.
print(metrics.classification_report(y_test, pred))

### Confusion Matrix

col_0,0,1,All
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,59,1,60
1,1,82,83
All,60,83,143


              precision    recall  f1-score   support

           0       0.98      0.98      0.98        60
           1       0.99      0.99      0.99        83

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143



#### 1.7.4 Support Vector Machine - MinMaxScaler 적용

In [20]:
# Visual Python: Machine Learning > Pipeline
from sklearn.preprocessing import MinMaxScaler

# [1] Data Prep
# Rebinds `scaler_m` to a fresh, unfitted MinMaxScaler.
scaler_m = MinMaxScaler()

# [2] Fit + [3] Transform
# Learn each feature's min/max from the training split and scale that same
# split in a single call; the fitted scaler stays available for the test split.
X_train_scaled_m = scaler_m.fit_transform(X_train)

In [21]:
# Scale the test split with the min-max scaler fitted on the training split (no refit).
X_test_scaled_m = scaler_m.transform(X_test)

In [22]:
# Visual Python: Machine Learning > Pipeline
# SVC trained on the MinMaxScaler-transformed training split and evaluated
# on the identically transformed test split.
from IPython.display import Markdown, display
from sklearn import metrics
from sklearn.svm import SVC

# [1] Classifier + [2] Fit
# fit() returns the estimator itself, so creation and fitting can be chained.
svc_m = SVC().fit(X_train_scaled_m, y_train)

# [3] Predict
pred = svc_m.predict(X_test_scaled_m)

# [4] Evaluation
# Confusion matrix: rows are the true labels, columns the predicted labels,
# and margins=True adds the 'All' totals.
display(Markdown('### Confusion Matrix'))
display(pd.crosstab(y_test, pred, margins=True))
# Per-class precision / recall / F1 and overall accuracy.
print(metrics.classification_report(y_test, pred))

### Confusion Matrix

col_0,0,1,All
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,59,1,60
1,1,82,83
All,60,83,143


              precision    recall  f1-score   support

           0       0.98      0.98      0.98        60
           1       0.99      0.99      0.99        83

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143



# 2 Data Encoding
- Feature(X값): 범주형 데이터
- [scikit-learn preprocessing](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing)

1. Label Encoding
2. Ordinal Encoding
3. One-hot Encoding
4. [Mean Encoding](https://dailyheumsi.tistory.com/120)

#### 데이터 로딩

In [23]:
# Toy frame for the encoding examples: col1 is numeric, col2 is a categorical
# product name that contains repeated values.
df = pd.DataFrame(
    data=[
        [70000, 'TV'],
        [64000, '냉장고'],
        [72000, '전자렌지'],
        [59000, '컴퓨터'],
        [57000, '선풍기'],
        [67000, '선풍기'],
        [62000, '믹서'],
        [69000, '믹서'],
    ],
    columns=['col1', 'col2'],
)
df

Unnamed: 0,col1,col2
0,70000,TV
1,64000,냉장고
2,72000,전자렌지
3,59000,컴퓨터
4,57000,선풍기
5,67000,선풍기
6,62000,믹서
7,69000,믹서


### 2.1 Label Encoding

#### 2.1.1 Pandas DataFrame 이용

In [24]:
# Visual Python: Data Analysis > Frame
# Label-encode col2: convert it to a categorical and keep the integer code of
# each value. Codes follow the sorted order of the distinct values.
df['col2_label'] = df['col2'].astype('category').cat.codes
df

Unnamed: 0,col1,col2,col2_label
0,70000,TV,0
1,64000,냉장고,1
2,72000,전자렌지,4
3,59000,컴퓨터,5
4,57000,선풍기,3
5,67000,선풍기,3
6,62000,믹서,2
7,69000,믹서,2


#### 2.1.2 scikit-learn 이용

In [25]:
# Visual Python: Machine Learning > Pipeline
from sklearn.preprocessing import LabelEncoder

# [1] Data Prep
encoder_l = LabelEncoder()

# [2] Fit + [3] Transform
# Learn the distinct 'variety' labels and map every row to its integer code
# in a single call; the fitted encoder is kept in encoder_l.
enc_l = encoder_l.fit_transform(df_iris['variety'])

### 2.2 One-hot Encoding

#### 2.2.1 Pandas DataFrame 이용

In [26]:
# Visual Python: Data Analysis > Frame
# One-hot encode col2: it is replaced by one 0/1 integer column per distinct
# value (named col2_<value>); the other columns are kept as they are.
# NOTE: this rebinds `df`, and the new frame no longer has 'col2', so the cell
# cannot be re-run without first re-creating df.
df = pd.get_dummies(data=df, columns=['col2'], dtype=int)
df

Unnamed: 0,col1,col2_label,col2_TV,col2_냉장고,col2_믹서,col2_선풍기,col2_전자렌지,col2_컴퓨터
0,70000,0,1,0,0,0,0,0
1,64000,1,0,1,0,0,0,0
2,72000,4,0,0,0,0,1,0
3,59000,5,0,0,0,0,0,1
4,57000,3,0,0,0,1,0,0
5,67000,3,0,0,0,1,0,0
6,62000,2,0,0,1,0,0,0
7,69000,2,0,0,1,0,0,0


#### 2.2.2 scikit-learn 이용

In [27]:
# Visual Python: Machine Learning > Pipeline
from sklearn.preprocessing import OneHotEncoder

# [1] Data Prep
# sparse_output=False makes the encoder return a dense array instead of a
# sparse matrix.
encoder_o = OneHotEncoder(sparse_output=False)

# [2] Fit + [3] Transform
# The double brackets keep 'variety' as a one-column DataFrame (2-D input).
# Categories are learned and the column is encoded in a single call.
enc_o = encoder_o.fit_transform(df_iris[['variety']])

---

In [28]:
# End of file