# 전처리

 - 누락된 데이터 처리 (isnull, fillna, dropna)
 - 보간법 (simpleimputer)


In [1]:
import numpy as np
import pandas as pd

In [3]:
df = pd.DataFrame([
    [2, 1, 3],
    [3, 2, 5],
    [3, np.nan, 7],
    [5, np.nan, 10],
    [7, 5, 12],
    [2, 5, 7],
    [8, 9, 13],
    [np.nan, 10, 13],
    [6, 12, 12],
    [9, 2, np.nan],
    [6, 10, 12],
    [2, 4, 6]
], columns=['hour', 'attendance', 'score'])


In [4]:
# 누락된 데이터 확인
df.isnull().sum()
# df.info()

hour          1
attendance    2
score         1
dtype: int64

In [5]:
# 특정 값으로 데이터 채우기
df.fillna(df.mean())

Unnamed: 0,hour,attendance,score
0,2.0,1.0,3.0
1,3.0,2.0,5.0
2,3.0,6.0,7.0
3,5.0,6.0,10.0
4,7.0,5.0,12.0
5,2.0,5.0,7.0
6,8.0,9.0,13.0
7,4.818182,10.0,13.0
8,6.0,12.0,12.0
9,9.0,2.0,9.090909


In [6]:
# 누락된 값이 있는 행에 대해서 데이터 제외
df.dropna(axis=0)

Unnamed: 0,hour,attendance,score
0,2.0,1.0,3.0
1,3.0,2.0,5.0
4,7.0,5.0,12.0
5,2.0,5.0,7.0
6,8.0,9.0,13.0
8,6.0,12.0,12.0
10,6.0,10.0,12.0
11,2.0,4.0,6.0


In [None]:
# 누락된 값이 있는 열 제외
df.dropna(axis=1)

In [8]:
from sklearn.impute import SimpleImputer
#Specify the strategy to be the median class (mean, most_frequent)
fea_transformer = SimpleImputer(strategy="median")
values = fea_transformer.fit_transform(df[["Distance"]])
pd.DataFrame(values)

KeyError: "None of [Index(['Distance'], dtype='object')] are in the [columns]"

- 클래스 데이터 처리
- label encoder
- ont-hot encoder

In [9]:
# 중고차 데이터로드
df = pd.read_excel('./hyundaiCar.xlsx')

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
label_encoder = LabelEncoder()

In [12]:
encoded_ts = label_encoder.fit_transform(df['변속기'].values) 
df['변속기_encode'] = encoded_ts

In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
onehot_encoder = OneHotEncoder()

In [15]:
encoded_oil = onehot_encoder.fit_transform(df['연료'].values) 

# 연료_가솔린, 연료_디젤로 칼럼을 만든다.
oil_columns = ['연료_' + oil for oil in onehot_encoder.categories_]

# 만든 칼럼에 각 데이터를 추가
df[oil_columns] = encoded_oil

ValueError: Expected 2D array, got 1D array instead:
array=['가솔린' '가솔린' '가솔린' '가솔린' '디젤' '디젤' '디젤' '가솔린' '가솔린' '가솔린' '디젤' '디젤' '디젤'
 'LPG' '가솔린' '디젤' '디젤' '가솔린' '디젤' '가솔린' 'LPG' '가솔린' '디젤' '디젤' '디젤' '디젤'
 '디젤' '가솔린' '가솔린' '가솔린' '가솔린' '가솔린' '디젤' 'LPG' '디젤' '디젤' '가솔린' 'LPG' 'LPG'
 '가솔린' '가솔린' '가솔린' '디젤' '가솔린' '디젤' '가솔린' '가솔린' '가솔린' '디젤' '가솔린' '가솔린'
 '가솔린' '가솔린' '디젤' '가솔린' '디젤' '디젤' '디젤' '디젤' '디젤' '디젤' '가솔린' '가솔린' '디젤'
 '가솔린' '가솔린' '가솔린' '가솔린' '디젤' '가솔린' '디젤'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## 스케일링

- MinMaxScaler
- StandardScaler
- RobustScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

In [None]:
mms = MinMaxScaler()

numeric_columns = ['가격', '연비', '마력', '토크', '배기량', '중량']

df_norm_mm = mms.fit_transform(df[numeric_columns])

In [None]:
sds = StandardScaler()

numeric_columns = ['가격', '연비', '마력', '토크', '배기량', '중량']

df_norm_sd = sds.fit_transform(df[numeric_columns])

In [None]:
rbs = RobustScaler()

numeric_columns = ['가격', '연비', '마력', '토크', '배기량', '중량']

df_norm_rb = rbs.fit_transform(df[numeric_columns])