## 데이터 핸들링 및 전처리 

## 08. 데이터 스케일링하기

<img src = "https://images.unsplash.com/photo-1507344694028-6dab396bed2b?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1757&q=80" width=80% align="center"/>

<div align="right">사진: <a href="https://unsplash.com/ko/%EC%82%AC%EC%A7%84/4i2crgbc0Gs?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Unsplash</a>의 <a href="https://unsplash.com/@wanderfleur?utm_source=unsplash&utm_medium=referral&utm_content=creditCopyText">Wander Fleur</a>
  
</div>
  
  

## 0. 데이터 불러오기
- ### 데이터 설명
1. preprocessing_08.csv : 이전 실습에서 범주형 데이터를 수치형으로 변경 완료한 데이터
> - MovieId : (int) 영화 아이디 <br>
> - ImdbId : (int) IMDb 데이터베이스 관리 아이디<br>
> - TmdbId : (int) TMDB 데이터베이스 관리 아이디<br>
> - Title : (str) 영화 제목 <br> 
> - Year : (int) 제작년도 <br> 
> - UserId : (int) 유저 아이디 <br>
> - Rating : (float) 영화 평점 <br>
> - Gender : (int) 성별, F:0, M:1 <br>
> - Age : (int) 유저 나이 <br>
> ---
> (영화 장르 컬럼) <br>
> - Genre_Action <br>
> &emsp;&emsp;&emsp;⇣<br>
> - Genre_Western<br>
> ---
> (고객 직업 컬럼) <br>
> - Job_academic/educator <br>
> &emsp;&emsp;&emsp;⇣<br>
> - Job_writer <br>


In [1]:
# Import libraries
import pandas as pd

import warnings
# Ignore all warnings raised from here on (the filter applies process-wide)
warnings.filterwarnings(action='ignore')

In [2]:
# Load the dataset: per the notebook text, categorical columns were already
# converted to numeric values in the previous exercise
df = pd.read_csv("./data/preprocessing_08.csv")

In [3]:
# Preview the first rows of the loaded DataFrame
df.head()

Unnamed: 0,MovieId,ImdbId,TmdbId,Title,Year,UserId,Rating,Gender,Age,Genre_Action,...,Job_other,Job_programmer,Job_retired,Job_sales/marketing,Job_scientist,Job_self-employed,Job_technician/engineer,Job_tradesman/craftsman,Job_unemployed,Job_writer
0,1,114709,862,Toy Story,1995,1,4.0,0,2,0,...,0,0,0,0,0,0,0,0,0,0
1,1,114709,862,Toy Story,1995,5,4.0,1,30,0,...,0,0,0,0,0,0,0,0,0,1
2,1,114709,862,Toy Story,1995,7,4.5,1,39,0,...,0,0,0,0,0,0,0,0,0,0
3,1,114709,862,Toy Story,1995,15,2.5,1,29,0,...,0,0,0,0,0,0,0,0,0,0
4,1,114709,862,Toy Story,1995,17,4.5,1,52,0,...,0,0,0,0,0,0,0,0,0,0


---

### 1. 데이터 정규화(Normalization)
데이터를 일정 범위로 변환하는 스케일링 방법으로, 데이터를 0과 1 사이의 값으로 변환하는 것을 의미합니다. <br> 
정규화는 다양한 스케일을 가진 변수들을 동일한 범위로 맞춰줌으로써, 변수 간의 크기 차이를 제거하여 모델이 각 변수를 공평하게 처리할 수 있도록 돕습니다. <br>
대표적으로 Min-Max Scaler가 있습니다.

#### 1-1. Min-Max Scaler 사용하기
- #### 'Age' 컬럼과 'Rating' 컬럼 Min-Max Scaling 하기

In [4]:
# 라이브러리 불러오기
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Create the scaler and learn the minimum / maximum of 'Age' and 'Rating'
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df[['Age', 'Rating']])

# Rescale both columns into the [0, 1] range; the result is a NumPy array, not a DataFrame
df_normalized = min_max_scaler.transform(df[['Age', 'Rating']])

df_normalized

array([[0.00917431, 0.77777778],
       [0.26605505, 0.77777778],
       [0.34862385, 0.88888889],
       ...,
       [0.29357798, 0.66666667],
       [0.29357798, 0.66666667],
       [0.28440367, 0.77777778]])

### 2. 데이터 표준화(Standardization)
데이터를 평균이 0이고 표준편차가 1이 되도록 변환하는 스케일링 방법입니다. 각 값에서 평균을 빼고 표준편차로 나누어 계산하며($z = (x - \mu) / \sigma$), 분포의 모양 자체를 정규분포로 바꾸는 것은 아닙니다.  <br> 
표준화는 변수들의 중심과 퍼짐 정도를 동일하게 맞춰 주며, 최솟값과 최댓값에 직접 의존하는 Min-Max Scaler보다 이상치의 영향을 상대적으로 덜 받아 모델의 안정적인 학습에 도움을 줍니다. <br>

#### 2-1. Standard Scaler 사용하기
- #### 'Age' 컬럼과 'Rating' 컬럼 Standard Scaling 하기

In [7]:
# Inspect summary statistics (count, mean, std, min, quartiles, max) before standardizing
df.describe()

Unnamed: 0,MovieId,ImdbId,TmdbId,Year,UserId,Rating,Gender,Age,Genre_Action,Genre_Adventure,...,Job_other,Job_programmer,Job_retired,Job_sales/marketing,Job_scientist,Job_self-employed,Job_technician/engineer,Job_tradesman/craftsman,Job_unemployed,Job_writer
count,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,...,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0
mean,19412.974083,351140.9,20054.997144,1994.441298,326.184769,3.501597,0.697529,33.399264,0.3038,0.239608,...,0.089593,0.070252,0.018071,0.04492,0.021007,0.036748,0.097686,0.007588,0.0121,0.040021
std,35490.865272,620702.6,53099.343977,14.367674,182.641372,1.04233,0.45933,15.584995,0.4599,0.426846,...,0.285599,0.255572,0.13321,0.20713,0.143409,0.188142,0.296891,0.086776,0.109335,0.196009
min,1.0,417.0,2.0,1902.0,1.0,0.5,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1199.0,99685.0,712.0,1990.0,177.0,3.0,0.0,23.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2991.0,118755.0,6957.0,1997.0,325.0,3.5,1.0,32.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8044.5,314979.0,11635.0,2003.0,477.0,4.0,1.0,42.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,193609.0,8391976.0,525662.0,2018.0,611.0,5.0,1.0,110.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
# Import the scaler
from sklearn.preprocessing import StandardScaler

# Create the scaler and learn the mean / standard deviation of 'Age' and 'Rating'
standard_scaler = StandardScaler()
standard_scaler.fit(df[['Age', 'Rating']])

# Rescale both columns to zero mean and unit variance; the result is a NumPy array
df_standardized = standard_scaler.transform(df[['Age', 'Rating']])

df_standardized

array([[-2.01472118e+00,  4.78164901e-01],
       [-2.18112414e-01,  4.78164901e-01],
       [ 3.59368974e-01,  9.57861818e-01],
       ...,
       [-2.56186180e-02, -1.53201558e-03],
       [-2.56186180e-02, -1.53201558e-03],
       [-8.97832167e-02,  4.78164901e-01]])

---

## 실습문제

#### Q1. 데이터프레임 'df' 의 컬럼 중 숫자로 표현된 범주형 데이터('MovieId', 'UserId' 등)가 아닌 수치형 특성을 가진 컬럼을 선별해서 'df_num' 변수를 만들어 저장하세요.
📌 탐색적 분석을 하면 해당 컬럼을 쉽게 찾을 수 있습니다.


In [22]:
# Explore the value ranges to tell genuinely numeric columns apart from ID-like or 0/1 columns
df.describe()

Unnamed: 0,MovieId,ImdbId,TmdbId,Year,UserId,Rating,Gender,Age,Genre_Action,Genre_Adventure,...,Job_other,Job_programmer,Job_retired,Job_sales/marketing,Job_scientist,Job_self-employed,Job_technician/engineer,Job_tradesman/craftsman,Job_unemployed,Job_writer
count,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,...,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0,100823.0
mean,19412.974083,351140.9,20054.997144,1994.441298,326.184769,3.501597,0.697529,33.399264,0.3038,0.239608,...,0.089593,0.070252,0.018071,0.04492,0.021007,0.036748,0.097686,0.007588,0.0121,0.040021
std,35490.865272,620702.6,53099.343977,14.367674,182.641372,1.04233,0.45933,15.584995,0.4599,0.426846,...,0.285599,0.255572,0.13321,0.20713,0.143409,0.188142,0.296891,0.086776,0.109335,0.196009
min,1.0,417.0,2.0,1902.0,1.0,0.5,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1199.0,99685.0,712.0,1990.0,177.0,3.0,0.0,23.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2991.0,118755.0,6957.0,1997.0,325.0,3.5,1.0,32.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8044.5,314979.0,11635.0,2003.0,477.0,4.0,1.0,42.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,193609.0,8391976.0,525662.0,2018.0,611.0,5.0,1.0,110.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
# Write your answer here.
# Take a separate deep copy and use it to list the dtype of every column
df_copy = df.copy(deep=True)

print(df_copy.dtypes)

# Keep only the columns treated as numeric features here;
# ID-like columns and the 0/1 indicator columns are left out
df_num = df.loc[:, ['Year', 'Rating', 'Age']]

df_num

MovieId                       int64
ImdbId                        int64
TmdbId                        int64
Title                        object
Year                          int64
UserId                        int64
Rating                      float64
Gender                        int64
Age                           int64
Genre_Action                  int64
Genre_Adventure               int64
Genre_Animation               int64
Genre_Children                int64
Genre_Comedy                  int64
Genre_Crime                   int64
Genre_Documentary             int64
Genre_Drama                   int64
Genre_Fantasy                 int64
Genre_Film-Noir               int64
Genre_Horror                  int64
Genre_IMAX                    int64
Genre_Musical                 int64
Genre_Mystery                 int64
Genre_Romance                 int64
Genre_Sci-Fi                  int64
Genre_Thriller                int64
Genre_War                     int64
Genre_Western               

Unnamed: 0,Year,Rating,Age
0,1995,4.0,2
1,1995,4.0,30
2,1995,4.5,39
3,1995,2.5,29
4,1995,4.5,52
...,...,...,...
100818,2017,4.0,33
100819,2017,3.5,33
100820,2017,3.5,33
100821,2018,3.5,33


#### Q2. 데이터프레임 'df_num'을 Min-Max Scaler를 사용하여 정규화해 보세요.

In [26]:
# Write your answer here.
# Import the scaler.
from sklearn.preprocessing import MinMaxScaler

# Create the scaler
min_max_scaler = MinMaxScaler()

# Scale the numeric columns selected in Q1 (`df_num`), as the exercise asks,
# instead of reaching back into `df`.
# The column order is spelled out explicitly because fit_transform returns a plain
# NumPy array without column labels, and the result is written back to `df`
# using this same ['Age', 'Rating', 'Year'] order.
scalered = min_max_scaler.fit_transform(df_num[['Age', 'Rating', 'Year']])


#### Q3. 스케일링 된 데이터를 기존의 데이터프레임 'df'에 적용해 주세요.

In [27]:
# Write your answer here.
# Overwrite the original columns with their scaled values. `scalered` has no column
# labels, so the column order here must match the order used when it was created.
df[['Age', 'Rating', 'Year']] = scalered

df



Unnamed: 0,MovieId,ImdbId,TmdbId,Title,Year,UserId,Rating,Gender,Age,Genre_Action,...,Job_other,Job_programmer,Job_retired,Job_sales/marketing,Job_scientist,Job_self-employed,Job_technician/engineer,Job_tradesman/craftsman,Job_unemployed,Job_writer
0,1,114709,862,Toy Story,0.801724,1,0.777778,0,0.009174,0,...,0,0,0,0,0,0,0,0,0,0
1,1,114709,862,Toy Story,0.801724,5,0.777778,1,0.266055,0,...,0,0,0,0,0,0,0,0,0,1
2,1,114709,862,Toy Story,0.801724,7,0.888889,1,0.348624,0,...,0,0,0,0,0,0,0,0,0,0
3,1,114709,862,Toy Story,0.801724,15,0.444444,1,0.256881,0,...,0,0,0,0,0,0,0,0,0,0
4,1,114709,862,Toy Story,0.801724,17,0.888889,1,0.467890,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100818,193581,5476944,432131,Black Butler: Book of the Atlantic,0.991379,184,0.777778,0,0.293578,1,...,1,0,0,0,0,0,0,0,0,0
100819,193583,5914996,445030,No Game No Life: Zero,0.991379,184,0.666667,0,0.293578,0,...,1,0,0,0,0,0,0,0,0,0
100820,193585,6397426,479308,Flint,0.991379,184,0.666667,0,0.293578,0,...,1,0,0,0,0,0,0,0,0,0
100821,193587,8391976,483455,Bungo Stray Dogs: Dead Apple,1.000000,184,0.666667,0,0.293578,1,...,1,0,0,0,0,0,0,0,0,0


---