In [61]:
import pandas as pd
import numpy as np
from io import StringIO

In [62]:
csv_data=(
    '''A,B,C,D
    1.0,2.0,3.0,4.0
    5.0,6.0,8.0
    10.0,11.0,12.0,'''
)

In [63]:
df=pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,
2,10.0,11.0,12.0,


In [64]:
df.isnull().sum()

A    0
B    0
C    0
D    2
dtype: int64

In [65]:
# 4.1.2 누락된 값이 있는 샘플이나 특정 제외(삭제) : dropna
# axis=0, how=any 
df.dropna(axis=0, how='any')
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [66]:
df.dropna(axis=1, how='any')

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,5.0,6.0,8.0
2,10.0,11.0,12.0


In [67]:
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,
2,10.0,11.0,12.0,


In [68]:
# 4.1 -> 누락된 값 대체 : simpleimputer
from sklearn.impute import SimpleImputer

imr=SimpleImputer()
# 계산 : 평균 계산 
imr=imr.fit(df.values) # ndarray 가 들어가야 한다 

#컬럼별로 위 평균값으로 대체
imputed_data=imr.transform(df.values)
imputed_data 

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  8.,  4.],
       [10., 11., 12.,  4.]])

In [69]:
df.T

Unnamed: 0,0,1,2
A,1.0,5.0,10.0
B,2.0,6.0,11.0
C,3.0,8.0,12.0
D,4.0,,


In [70]:
# simpeldimputer : 컬럼별 처리만 가능 
# functiontransformer : 행별로 처리한다 
from sklearn.preprocessing import FunctionTransformer

ftr_imr=FunctionTransformer(lambda X:imr.fit_transform(X.T).T, validate=False)
imputed_data=ftr_imr.fit_transform(df.values)
imputed_data

array([[ 1.        ,  2.        ,  3.        ,  4.        ],
       [ 5.        ,  6.        ,  8.        ,  6.33333333],
       [10.        , 11.        , 12.        , 11.        ]])

In [71]:
# fillna(함수) : 대체 
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,
2,10.0,11.0,12.0,


In [72]:
# df.fillna(df.mean())
df.mean() # 컬럼별 평균 

A    5.333333
B    6.333333
C    7.666667
D    4.000000
dtype: float64

In [73]:
df.fillna(method='bfill')

  df.fillna(method='bfill')


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,
2,10.0,11.0,12.0,


In [74]:
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,
2,10.0,11.0,12.0,


In [75]:
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,4.0
2,10.0,11.0,12.0,4.0


In [76]:
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,8.0,4.0
2,10.0,11.0,12.0,4.0


In [77]:
# 4.2 범주형 데이터 다루기 
# 4.2.1 판다스를 사용한 범주형 데이터 인코딩
df=pd.DataFrame([
    ['green','M',10.1,'class2']
    , ['red','L',13.5,'class1']
    , ['vlue','XL',15.3,'class2']
])

In [78]:
df.columns=['color','size','price','classlabel']
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,vlue,XL,15.3,class2


In [79]:
# M=1, L=2, XL=3
# 규칙 정의 
size_mapping={
    'XL':3,
    'l':2,
    'M':1
}
df['size']=df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1.0,10.1,class2
1,red,,13.5,class1
2,vlue,3.0,15.3,class2


In [80]:
inv_size_mapping={
    v:k for k,v in size_mapping.items()
}
inv_size_mapping

{3: 'XL', 2: 'l', 1: 'M'}

In [81]:
size_mapping.items()

dict_items([('XL', 3), ('l', 2), ('M', 1)])

In [82]:
df['size'].map(inv_size_mapping)

0      M
1    NaN
2     XL
Name: size, dtype: object

In [83]:
a=[10,20,30]
for idx, value in enumerate(a):
    print(idx, ' : ',value)

0  :  10
1  :  20
2  :  30


In [84]:
np.unique(df['classlabel'])

array(['class1', 'class2'], dtype=object)

In [85]:
class_mapping={
    label:idx for idx, label in enumerate(np.unique(df['classlabel']))
}
class_mapping

{'class1': 0, 'class2': 1}

In [86]:
df['classlabel']=df['classlabel'].map(class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1.0,10.1,1
1,red,,13.5,0
2,vlue,3.0,15.3,1


In [87]:
inv_class_mapping={
    v:k for k,v in class_mapping.items()
}
inv_class_mapping

{0: 'class1', 1: 'class2'}

In [88]:
df['classlabel']=df['classlabel'].map(inv_class_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1.0,10.1,class2
1,red,,13.5,class1
2,vlue,3.0,15.3,class2


In [89]:
# LabelEncoder : 원래답을 정수로 변환
from sklearn.preprocessing import LabelEncoder # class

class_le=LabelEncoder() # object 생성
# y=class_le.fit_transform(ndarray)
y=class_le.fit_transform(df['classlabel'].values)
y

array([1, 0, 1])

In [90]:
df['classlable'], df['classlabel'].values
y

KeyError: 'classlable'

In [None]:
print(y)
print(class_le.inverse_transform(y))

[1 0 1]
['class2' 'class1' 'class2']


In [None]:
# 순서가 없는 특성에 원-핫 인코딩 적용
X=df[['color','size','price']].values # ndarray
color_le=LabelEncoder()
X[:,0]=color_le.fit_transform(X[:,0])
X

array([[0, 1.0, 10.1],
       [1, nan, 13.5],
       [2, 3.0, 15.3]], dtype=object)

In [None]:
X[:,0]

array(['green', 'red', 'vlue'], dtype=object)

In [None]:
X[:,0]=color_le.inverse_transform(X[:,0].astype("int32"))
X

array([['green', 1.0, 10.1],
       ['red', nan, 13.5],
       ['vlue', 3.0, 15.3]], dtype=object)

In [None]:
from sklearn.preprocessing import OneHotEncoder
X=df[['color','size','price']].values
color_ohe=OneHotEncoder()

color_ohe.fit_transform(X[:,0].reshape(-1,1)).toarray()
# ['green' 'red' 'blue'] => 정렬 ['blue','green','red']

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [None]:
print(X[:,0].shape)
print(X[:,0].ndim)
print(X[:,0].reshape(-1,1))

(3,)
1


In [None]:
a=np.array([1,2,3,4,5,6])
print(a)
# print(a.reshape(2,3))
# print(a.reshape(3,2))
print(a.reshape(1,-1))
print(a.reshape(-1,1))


[1 2 3 4 5 6]
[[1 2 3 4 5 6]]
[[1]
 [2]
 [3]
 [4]
 [5]
 [6]]


In [None]:
# columntransformer
# 데이터셋의 특성마다 서로 다른 전처리 좌정을 적용해야 하는 경우
# 데이터 생성 
# 예제 데이터 생성
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
df_= pd.DataFrame({
    '나이': [30, 25, 40, 35], #나이, 소득 : 연속형 데이터 (standerd)
    '성별': ['남성', '여성', '여성', '남성'], # 범주형 데이터(onehot) 
    '소득': [50000, 48000, 62000, 58000],
    '도시': ['서울', '부산', '서울', '대전'] # 순서없는 범주형 데이터 
})
df_

Unnamed: 0,나이,성별,소득,도시
0,30,남성,50000,서울
1,25,여성,48000,부산
2,40,여성,62000,서울
3,35,남성,58000,대전


In [None]:
# 적용할 columntransformer 정의 : 오브젝트 생성 
prepro=ColumnTransformer([
    ('num_scaler', StandardScaler(), ['나이','소득']), #standerdscaler : 나이, 소득 -> (범주형, 오브젝트생성,적용할피쳐)
    ('cat_encoder', OneHotEncoder(), ['성별','도시']) # onehot: 성별, 도시 
],
# remainder='drop' # 나머지 열은 모두 삭제 
)

In [None]:
# prepro_data=prepro.fit_transform(데이터프레임)
prepro_data=prepro.fit_transform(df_) #array return
print(prepro_data)

[[-0.4472136  -0.78633365  1.          0.          0.          0.
   1.        ]
 [-1.34164079 -1.13581527  0.          1.          0.          1.
   0.        ]
 [ 1.34164079  1.31055608  0.          1.          0.          0.
   1.        ]
 [ 0.4472136   0.61159284  1.          0.          1.          0.
   0.        ]]


In [91]:
df

Unnamed: 0,color,size,price,classlabel
0,green,1.0,10.1,class2
1,red,,13.5,class1
2,vlue,3.0,15.3,class2


In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   color       3 non-null      object 
 1   size        2 non-null      float64
 2   price       3 non-null      float64
 3   classlabel  3 non-null      object 
dtypes: float64(2), object(2)
memory usage: 224.0+ bytes


In [93]:
print(df)
pd.get_dummies(df[['price','color','size']])

   color  size  price classlabel
0  green   1.0   10.1     class2
1    red   NaN   13.5     class1
2   vlue   3.0   15.3     class2


Unnamed: 0,price,size,color_green,color_red,color_vlue
0,10.1,1.0,True,False,False
1,13.5,,False,True,False
2,15.3,3.0,False,False,True


In [95]:
# 4.2 범주형 데이터 다루기 
# 4.2.1 판다스를 사용한 범주형 데이터 인코딩
df=pd.DataFrame([
    ['green','M',10.1,'class2']
    , ['red','L',13.5,'class1']
    , ['vlue','XL',15.3,'class2']
])
df.columns['color','size','price','classlable']
df

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [98]:
# x<M , x>L
df['x>M']=df['size'].apply(lambda s: 1 if x in {'L', 'XL'} else 0)
df['x>M']=df['size'].apply(lambda s: 1 if x == 'XL' else 0)
df

KeyError: 'size'