# 전처리

 - 누락된 데이터 처리 (isnull, fillna, dropna)
 - 결측값 대치 (SimpleImputer)


In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy study-log dataset, written column by column. Every column contains
# at least one np.nan so the missing-data cells below have something to
# detect, fill, and drop.
df = pd.DataFrame({
    'hour': [2, 3, 3, 5, 7, 2, 8, np.nan, 6, 9, 6, 2],
    'attendance': [1, 2, np.nan, np.nan, 5, 5, 9, 10, 12, 2, 10, 4],
    'score': [3, 5, 7, 10, 12, 7, 13, 13, 12, np.nan, 12, 6],
})

In [None]:
# Display the whole frame, including its NaN cells.
df

In [None]:
# Check for missing data: count the NaN entries in each column.
df.isnull().sum()
# df.info()

In [None]:
# Column-wise means; pandas skips NaN entries by default (skipna=True).
df.mean()

In [None]:
# Fill missing entries with a chosen value -- here each column's own mean.
# The result is not assigned, so df itself keeps its NaN values.
df.fillna(df.mean())

In [None]:
# Drop every row that contains at least one missing value.
# The result is only displayed; df is not reassigned.
df.dropna(axis='index')

In [None]:
# Drop every column that contains at least one missing value.
# The result is only displayed; df is not reassigned.
df.dropna(axis='columns')

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing 'hour' values with the column median.
# Other available strategies include "mean" and "most_frequent".
simple_imp = SimpleImputer(strategy="median")
# The imputer works on 2-D input, hence the double-bracket column selection.
values = simple_imp.fit_transform(df[["hour"]])
# fit_transform returns a bare array; restore the column label and row index
# so the displayed frame lines up with df.
pd.DataFrame(values, columns=["hour"], index=df.index)

In [14]:
# Create a fresh, not-yet-fitted median imputer; the bare name on the last
# line makes the notebook display it.
simple_imp = SimpleImputer(strategy="median")
simple_imp

In [15]:
# Step 1: fit -- learn the fill value (the median) from the 'hour' column.
simple_imp.fit(df[['hour']])

In [16]:
# Step 2: transform -- return the column with its NaN entries filled in.
# The result is not assigned, so df is left as it was.
simple_imp.transform(df[['hour']])

array([[2.],
       [3.],
       [3.],
       [5.],
       [7.],
       [2.],
       [8.],
       [5.],
       [6.],
       [9.],
       [6.],
       [2.]])

In [22]:
# fit_transform performs the fit and transform steps in a single call.
simple_imp = SimpleImputer(strategy="median")
simple_imp.fit_transform(df[['hour']])

array([[2.],
       [3.],
       [3.],
       [5.],
       [7.],
       [2.],
       [8.],
       [5.],
       [6.],
       [9.],
       [6.],
       [2.]])

In [23]:
# Training data: fit the imputer and transform in one step.
# fit_transform returns an (n, 1) array; flatten it so a plain 1-D column is
# written back instead of relying on pandas to accept a 2-D value.
df['hour'] = simple_imp.fit_transform(df[['hour']]).ravel()

In [24]:
# Test data: call transform only -- do not fit again, so the fill value
# learned from the training data is reused.
# simple_imp.transform(df_test[['hour']])  # 2-D selection, same as in fit

Unnamed: 0,hour,attendance,score
0,2.0,1.0,3.0
1,3.0,2.0,5.0
2,3.0,,7.0
3,5.0,,10.0
4,7.0,5.0,12.0
5,2.0,5.0,7.0
6,8.0,9.0,13.0
7,5.0,10.0,13.0
8,6.0,12.0,12.0
9,9.0,2.0,


- 클래스 데이터 처리
- label encoder
- one-hot encoder

In [25]:
# Load the used-car dataset. This rebinds df, replacing the toy frame used
# in the missing-data cells.
df = pd.read_excel('./hyundaiCar.xlsx')

In [29]:
# Transpose the first row so each column name reads as its own line.
df.head(1).T

Unnamed: 0,0
가격,1885
년식,2015
종류,준중형
연비,11.8
마력,172
토크,21.0
연료,가솔린
하이브리드,0
배기량,1999
중량,1300


In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
# Encoder that maps each distinct label to an integer code.
label_encoder = LabelEncoder()

In [30]:
# Integer-encode the transmission column and store the codes in a new column.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# labels (y); OrdinalEncoder is the feature-side equivalent -- consider it.
encoded_ts = label_encoder.fit_transform(df['변속기'])
df['변속기_encode'] = encoded_ts

In [None]:
# Display the frame to inspect the newly added '변속기_encode' column.
df

In [32]:
from sklearn.preprocessing import OneHotEncoder

In [33]:
# One-hot encoder with default settings.
onehot_encoder = OneHotEncoder()

In [36]:
# The encoder expects 2-D input, so the fuel column is reshaped into a
# single-column (n, 1) array before fitting and transforming.
encoded_oil = onehot_encoder.fit_transform(
    df['연료'].to_numpy().reshape(-1, 1)
)

In [44]:
# Distinct fuel types, in order of first appearance in the data.
df['연료'].unique()

array(['가솔린', '디젤', 'LPG'], dtype=object)

In [46]:
# Categories learned by the encoder: a list holding one array per input
# column. Their order (not the unique() order) is the one-hot column order.
onehot_encoder.categories_

[array(['LPG', '가솔린', '디젤'], dtype=object)]

In [None]:
# The encoder output is a sparse matrix; toarray() converts it to a dense
# array so the 0/1 pattern can be inspected.
encoded_oil.toarray()

In [62]:
# Only one column was encoded, so its categories are the first list entry.
onehot_encoder.categories_[0]

array(['LPG', '가솔린', '디젤'], dtype=object)

In [58]:
# Build one column name per learned category, in the encoder's order:
# 연료_LPG, 연료_가솔린, 연료_디젤.
oil_columns = [f'연료_{oil}' for oil in onehot_encoder.categories_[0]]

# Write the dense one-hot matrix into those columns of df.
df[oil_columns] = encoded_oil.toarray()

In [None]:
# Check the first row again to confirm the new one-hot columns are present.
df.head(1).T

## 스케일링

- MinMaxScaler
- StandardScaler
- RobustScaler

In [66]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

In [None]:
# Look at the raw (unscaled) values of the first row before scaling.
df.head(1).T

In [64]:
# List the column names to choose which ones to scale.
df.columns

Index(['가격', '년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기',
       '변속기_encode', '연료_LPG', '연료_가솔린', '연료_디젤'],
      dtype='object')

In [67]:
# Min-max scaling: by default rescales each column into the [0, 1] range.
mms = MinMaxScaler()

# Continuous numeric columns to scale.
numeric_columns = ['가격', '연비', '마력', '토크', '배기량', '중량']

# The scaled values are kept in a separate variable; df is not modified here.
df_norm_mm = mms.fit_transform(df[numeric_columns])

In [None]:
# Inspect the min-max scaled values.
df_norm_mm

In [69]:
# Standardization: by default shifts each column to zero mean and scales it
# to unit variance.
sds = StandardScaler()

# Same numeric columns as in the min-max cell.
numeric_columns = ['가격', '연비', '마력', '토크', '배기량', '중량']

# The scaled values are kept in a separate variable; df is not modified here.
df_norm_sd = sds.fit_transform(df[numeric_columns])

In [None]:
# Inspect the standardized values.
df_norm_sd

In [71]:
# Robust scaling: by default centers each column on its median and scales by
# its interquartile range.
rbs = RobustScaler()

# Same numeric columns as in the previous scaling cells.
numeric_columns = ['가격', '연비', '마력', '토크', '배기량', '중량']

# The scaled values are kept in a separate variable; df is not modified here.
df_norm_rb = rbs.fit_transform(df[numeric_columns])

In [73]:
# Overwrite the raw numeric columns in df with their robust-scaled values.
# NOTE(review): numeric_columns includes '가격', which is later selected as
# target_column, so the target is scaled too -- confirm this is intended.
df[numeric_columns] = df_norm_rb

In [75]:
# Transposed view of the first three rows to check the scaled values.
df.head(3).T

Unnamed: 0,0,1,2
가격,-0.15795,0.05616,-0.684451
년식,2015,2015,2015
종류,준중형,준중형,소형
연비,-0.141176,-0.023529,0.611765
마력,0.09375,0.59375,-1.03125
토크,-0.262582,0.0,-0.586433
연료,가솔린,가솔린,가솔린
하이브리드,0,0,0
배기량,0.0,-0.487455,-0.753883
중량,-0.490654,-0.490654,-0.903427


In [76]:
# List the columns again before picking the feature and target columns.
df.columns

Index(['가격', '년식', '종류', '연비', '마력', '토크', '연료', '하이브리드', '배기량', '중량', '변속기',
       '변속기_encode', '연료_LPG', '연료_가솔린', '연료_디젤'],
      dtype='object')

In [78]:
# Feature columns: the numeric columns plus the encoded categorical ones.
# The raw text columns ('종류', '연료', '변속기') are left out.
train_columns = [
    '년식', '연비', '마력', '토크', '하이브리드',
    '배기량', '중량', '변속기_encode',
    '연료_LPG', '연료_가솔린', '연료_디젤',
]
# Column holding the value to predict.
target_column = '가격'

# Pull both out of the frame as plain NumPy arrays.
train_data = df[train_columns].to_numpy()
target = df[target_column].to_numpy()

In [None]:
# Inspect the feature matrix.
train_data

In [None]:
# Inspect the target vector.
target