In [1]:
# Fetch python_environment_check.py from the rickiepark/ml-with-pytorch GitHub
# repository and save it into the current working directory.
!wget https://raw.githubusercontent.com/rickiepark/ml-with-pytorch/main/python_environment_check.py

--2024-03-08 12:03:25--  https://raw.githubusercontent.com/rickiepark/ml-with-pytorch/main/python_environment_check.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1629 (1.6K) [text/plain]
Saving to: ‘python_environment_check.py’


2024-03-08 12:03:25 (11.6 MB/s) - ‘python_environment_check.py’ saved [1629/1629]



누락된 데이터 다루기
  - 테이블 형태 데이터에서 누락된 값 식별

In [6]:
import pandas as pd
import numpy as np

# Small demo table with two missing entries (NaN): row 1 / column C and
# row 2 / column D.
csv_data = [
    [1.0, 2.0, 3.0, 4.0],
    [5.0, 6.0, np.nan, 8.0],
    [10.0, 11.0, 12.0, np.nan],
]

df = pd.DataFrame(data=csv_data, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# Count the missing values in each column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

누락된 값이 있는 샘플이나 특성 제외

In [8]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [9]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [10]:
# Drop only the rows in which every column is NaN.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [11]:
# Drop the rows that have fewer than four non-NaN values.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [12]:
# Drop only the rows whose value in one specific column (here: C) is NaN.
df[df['C'].notna()]

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


누락된 값 대체

In [13]:
# Mean imputation: replace missing values with a mean.
# Show the frame again first. The cells that follow fill NaN with the column
# mean; a later cell transposes the data to impute with the row mean instead.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [16]:
# Fill the missing entry of column C with the mean of column C.
df['C'].fillna(value=df['C'].mean())

0     3.0
1     7.5
2    12.0
Name: C, dtype: float64

In [17]:
# Fill the missing entry of column D with the mean of column D.
df['D'].fillna(value=df['D'].mean())

0    4.0
1    8.0
2    6.0
Name: D, dtype: float64

In [18]:
from sklearn.impute import SimpleImputer
# Column-wise mean imputation: each NaN is replaced by the mean of its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
imr = imr.fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data


array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [19]:
from sklearn.preprocessing import FunctionTransformer
# Row-wise mean imputation. SimpleImputer imputes per column, so the data is
# transposed before imputing and transposed back afterwards.
# A dedicated imputer is used here so that the column-wise statistics already
# fitted on `imr` are not silently overwritten when this cell runs; refitting
# `imr` on the transposed data would leave it expecting 3 features instead of 4.
row_imr = SimpleImputer(missing_values=np.nan, strategy='mean')
ftr_imr = FunctionTransformer(lambda x: row_imr.fit_transform(x.T).T)
imputed_data = ftr_imr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  6.33333333,  8.        ],
       [10.        , 11.        , 12.        , 11.        ]])

In [21]:
# Backward fill: replace each NaN with the next valid value further down the
# same column. A trailing NaN with no later value stays NaN.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` keyword is
# deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,12.0,8.0
2,10.0,11.0,12.0,


In [23]:
# Forward fill: replace each NaN with the last valid value above it in the
# same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` keyword is
# deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,3.0,8.0
2,10.0,11.0,12.0,8.0


In [24]:
# Forward fill along each row: replace each NaN with the last valid value to
# its left in the same row.
# DataFrame.ffill(axis=1) replaces fillna(method='ffill', axis=1), whose
# `method` keyword is deprecated in recent pandas releases.
df.ffill(axis=1)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,6.0,8.0
2,10.0,11.0,12.0,12.0


범주형 데이터 인코딩

In [26]:
import pandas as pd

# Toy data set with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
# Note: this rebinds `df`, replacing the numeric frame used in the cells above.
df = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


순서가 있는 특성 매핑

In [29]:
# Ordinal encoding for the size feature: a larger size gets a larger integer.
size_mapping = dict(XL=3, L=2, M=1)

# Store the encoded sizes in a new column and keep the original labels.
df['size_mapping'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel,size_mapping
0,green,M,10.1,class2,1
1,red,L,13.5,class1,2
2,blue,XL,15.3,class2,3


In [31]:
# Reverse lookup (integer code -> size label) to undo the ordinal encoding.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
inv_size_mapping

{3: 'XL', 2: 'L', 1: 'M'}

클래스 레이블 인코딩

In [35]:
# Assign an integer code to each distinct class label. np.unique returns the
# labels in sorted order, so the codes follow that order.
class_mapping = dict(
    (label, code) for code, label in enumerate(np.unique(df['classlabel']))
)
# Store the encoded labels in a new column and keep the original labels.
df['classlabel_mapping'] = df['classlabel'].map(class_mapping)

In [36]:
# Reverse lookup (integer code -> class label) to undo the label encoding.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
inv_class_mapping

{0: 'class1', 1: 'class2'}

In [37]:
# Display the frame, which now also holds the size_mapping and
# classlabel_mapping columns added by the cells above.
df

Unnamed: 0,color,size,price,classlabel,size_mapping,classlabel_mapping
0,green,M,10.1,class2,1,1
1,red,L,13.5,class1,2,0
2,blue,XL,15.3,class2,3,1


In [38]:
from sklearn.preprocessing import LabelEncoder
# Same label encoding done with scikit-learn: fit the encoder on the class
# labels, then transform them to integer codes.
class_le = LabelEncoder().fit(df['classlabel'].values)
class_le.transform(df['classlabel'].values)

array([1, 0, 1])

순서가 없는 특성에 원-핫 인코딩 적용

In [64]:
# Feature matrix as a NumPy array: encoded class label, ordinal size, price.
# NOTE(review): the saved output of this cell shows the first column as 0/1/0,
# while the `df` displayed earlier holds classlabel_mapping 1/0/1. The output
# looks stale (execution counts jump from 38 to 64) -- re-run from a fresh
# kernel to confirm.
# NOTE(review): the section is about nominal features, and the nominal feature
# in `df` is `color`; confirm that encoding the class label here is intended.
X = df.loc[:, ['classlabel_mapping', 'size_mapping', 'price']].to_numpy()
X

array([[ 0. ,  1. , 10.1],
       [ 1. ,  2. , 13.5],
       [ 0. ,  3. , 15.3]])

In [66]:
# First column of X (the encoded class label) as a 1-D array.
X[:, 0]

array([0., 1., 0.])

In [70]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the first column of X. Indexing with X[:, [0]] keeps the
# column two-dimensional (shape (n, 1)), which is the shape the original
# reshape(-1, 1) produced. The result is densified with toarray() for display.
ohe_data = OneHotEncoder().fit_transform(X[:, [0]])
ohe_data.toarray()

array([[1., 0.],
       [0., 1.],
       [1., 0.]])

In [71]:
from sklearn.compose import ColumnTransformer
# One-hot encode column 0 and pass columns 1 and 2 through unchanged, so the
# whole feature matrix is transformed in a single step.
ctf = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(), [0]),
        ('nothing', 'passthrough', [1, 2]),
    ]
)

ctf.fit_transform(X)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  3. , 15.3]])