In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing

In [2]:
# Start by loading the data from the CSV file into a DataFrame,
# then preview the first five rows to check how the columns were parsed.
nba_df = pd.read_csv(filepath_or_buffer='../data/NBA_player_of_the_week.csv')
nba_df.head(n=5)

Unnamed: 0,Player,Team,Conference,Date,Position,Height,Weight,Age,Draft Year,Seasons in league,Season,Season short,Pre-draft Team,Real_value,Height CM,Weight KG,Last Season
0,Jayson Tatum,Boston Celtics,East,"Feb 10, 2020",SF,6'8,208,21,2017,2,2019-2020,2020,Duke,0.5,203,94,1
1,Nikola Jokic,Denver Nuggets,West,"Feb 10, 2020",C,7'0,250,25,2014,4,2019-2020,2020,KK Mega Bemax (Serbia),0.5,213,113,1
2,Jaylen Brown,Boston Celtics,East,"Feb 3, 2020",SF,6'7,220,23,2016,3,2019-2020,2020,California,0.5,201,99,1
3,Damian Lillard,Portland Trail Blazers,West,"Feb 3, 2020",G,6'3,195,29,2012,7,2019-2020,2020,Weber State,0.5,190,88,1
4,Pascal Siakam,Toronto Raptors,East,"Jan 27, 2020",F,6'9,230,25,2016,3,2019-2020,2020,New Mexico State,0.5,206,104,1


In [3]:
# Build a new DataFrame that holds only the columns to be normalized:
# 'Height CM', 'Weight KG' and 'Age' (all rows are kept).
new_df = nba_df.loc[:, ['Height CM', 'Weight KG', 'Age']]

In [4]:
# Use MinMaxScaler from the preprocessing module: first fit the scaler on
# new_df, then apply the fitted scaler to the same data.
scaler = preprocessing.MinMaxScaler().fit(new_df)
normalized_data = scaler.transform(new_df)

In [5]:
# Wrap the scaled values in a DataFrame with shortened column names
# and display the result.
normalized_df = pd.DataFrame(
    data=normalized_data,
    columns=['Height', 'Weight', 'Age'],
)
normalized_df

Unnamed: 0,Height,Weight,Age
0,0.518519,0.329114,0.095238
1,0.703704,0.569620,0.285714
2,0.481481,0.392405,0.190476
3,0.277778,0.253165,0.476190
4,0.574074,0.455696,0.285714
...,...,...,...
1335,0.240741,0.139241,0.238095
1336,0.574074,0.594937,0.047619
1337,0.481481,0.379747,0.238095
1338,0.388889,0.215190,0.238095


In [6]:
# The summary statistics show that every value now lies between
# a minimum of 0 and a maximum of 1.
normalized_df.describe()

Unnamed: 0,Height,Weight,Age
count,1340.0,1340.0,1340.0
mean,0.482808,0.422586,0.368479
std,0.173481,0.177357,0.161937
min,0.0,0.0,0.0
25%,0.333333,0.316456,0.238095
50%,0.481481,0.392405,0.333333
75%,0.611111,0.56962,0.47619
max,1.0,1.0,1.0


In [7]:
# Now try standardization. The workflow is the same as before; only the
# scaler class changes from MinMaxScaler to StandardScaler.
scaler = preprocessing.StandardScaler().fit(new_df)
standardized_data = scaler.transform(new_df)
standardized_df = pd.DataFrame(
    data=standardized_data,
    columns=['Height', 'Weight', 'Age'],
)
standardized_df

Unnamed: 0,Height,Weight,Age
0,0.205923,-0.527226,-1.687956
1,1.273788,0.829336,-0.511282
2,-0.007650,-0.170236,-1.099619
3,-1.182303,-0.955614,0.665391
4,0.526282,0.186754,-0.511282
...,...,...,...
1335,-1.395876,-1.598196,-0.805451
1336,0.526282,0.972132,-1.982124
1337,-0.007650,-0.241634,-0.805451
1338,-0.541583,-1.169808,-0.805451


In [8]:
# After standardization each column's mean is ~0 (only floating-point noise
# remains). describe() reports the sample standard deviation (ddof=1), while
# StandardScaler divides by the population standard deviation (ddof=0), so
# the std row shows sqrt(n / (n - 1)) -- about 1.000373 for n = 1340 --
# rather than exactly 1.
standardized_df.describe()

Unnamed: 0,Height,Weight,Age
count,1340.0,1340.0,1340.0
mean,-1.070288e-15,-7.049088e-16,-3.557685e-16
std,1.000373,1.000373,1.000373
min,-2.784101,-2.383574,-2.276292
25%,-0.8619429,-0.5986242,-0.8054505
50%,-0.007650381,-0.1702362,-0.2171138
75%,0.7398556,0.8293356,0.6653913
max,2.982373,3.256867,3.901243
