## LabelEncoder
* sklearn.preprocessing.LabelEncoder
* Encode target labels with value between 0 and n_classes-1.
* This transformer should be used to encode target values, i.e. y, and not the input X.


In [1]:
from sklearn.preprocessing import LabelEncoder

items=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']

# LabelENcoder를 객체로 생성한 후, fit()과 transform()으로 label 인코딩 수행.
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)
print('인코딩 변환값:', labels)

인코딩 변환값: [0 1 4 5 3 3 2 2]


In [2]:
print('인코딩 클래스:', encoder.classes_)

인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자렌지' '컴퓨터']


In [3]:
print('디코딩 원본 값:', encoder.inverse_transform([4,5,2,0,1,1,3,3]))

디코딩 원본 값: ['전자렌지' '컴퓨터' '믹서' 'TV' '냉장고' '냉장고' '선풍기' '선풍기']


## Titanic 데이터 전처리
* Survived, Sex, Age, Embarked 컬럼만 이용
* 피처는 Sex, Age, Embarked 사용
* 레이블은 Survived 사용하여 지도 학습
* 데이터의 전처리가 필요한 상황이라
데이터 전처리를 해야하는데 결측치가 관찰이 되어서 결측치 처리를 
해야하고 문자열이 보여서 인코딩을 해줘야한다.
결측치는 평균값으로 인코딩은 labelencoding을 이요하여 처리한 결과를
데이터 프레임으로 출력
* 판다스를 이용해서 csv 읽어 오기
필요한 컬럼만 남기고 drop
fillna를 활용하여 결측치 처리(mean)
LabelEncoder 활용하여 인코딩

In [4]:
import pandas as pd

titanic = pd.read_csv('../datasets/titanic_train.csv')

titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
titanic.shape

(891, 12)

In [6]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
data = titanic[['Sex', 'Age', 'Embarked']]
target = titanic['Survived']

In [8]:
data.head()

Unnamed: 0,Sex,Age,Embarked
0,male,22.0,S
1,female,38.0,C
2,female,26.0,S
3,female,35.0,S
4,male,35.0,S


In [9]:
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [10]:
data.isna().sum()

Sex           0
Age         177
Embarked      2
dtype: int64

In [11]:
data['Age'] = data['Age'].fillna(data['Age'].mean())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Age'] = data['Age'].fillna(data['Age'].mean())


In [12]:
data['Embarked'] = data['Embarked'].fillna('S')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Embarked'] = data['Embarked'].fillna('S')


In [13]:
data.isna().sum()

Sex         0
Age         0
Embarked    0
dtype: int64

In [45]:
from sklearn.preprocessing import LabelEncoder

# LabelENcoder를 객체로 생성한 후, fit()과 transform()으로 label 인코딩 수행.
encoder = LabelEncoder()

titanic["Sex"] = encoder.fit_transform(titanic["Sex"])
titanic["Embarked"] = encoder.fit_transform(titanic["Embarked"])

titanic.head()

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,1,22.0,2
1,1,0,38.0,0
2,1,0,26.0,2
3,1,0,35.0,2
4,0,1,35.0,2


In [29]:
titanic.head()

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,male,22.0,S
1,1,female,38.0,C
2,1,female,26.0,S
3,1,female,35.0,S
4,0,male,35.0,S


In [35]:
import pandas as pd

df = pd.read_csv('../datasets/titanic_train.csv')

df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [36]:
df.drop(columns = ['PassengerId', 'Pclass', 'Name', 'SibSp', 
                        'Parch', 'Ticket', 'Fare', 'Cabin'], 
                         inplace = True)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   Embarked  889 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 28.0+ KB


In [38]:
df["Age"].isna().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of 0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888     True
889    False
890    False
Name: Age, Length: 891, dtype: bool>

In [39]:
df["Age"] = df["Age"].fillna(df["Age"].mean())

In [40]:
df["Age"].isna().sum()

0

In [41]:
df["Embarked"] = df["Embarked"].fillna('S')

In [42]:
df["Embarked"].isna().sum()

0

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       891 non-null    float64
 3   Embarked  891 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 28.0+ KB


In [44]:
from sklearn.preprocessing import LabelEncoder

# LabelENcoder를 객체로 생성한 후, fit()과 transform()으로 label 인코딩 수행.
encoder = LabelEncoder()

df["Sex"] = encoder.fit_transform(df["Sex"])
df["Embarked"] = encoder.fit_transform(df["Embarked"])

df.head()

Unnamed: 0,Survived,Sex,Age,Embarked
0,0,1,22.0,2
1,1,0,38.0,0
2,1,0,26.0,2
3,1,0,35.0,2
4,0,1,35.0,2


## 원-핫 인코딩
* sklearn.preprocessing.OneHotEncoder
* class sklearn.preprocessing.OneHotEncoder(*, categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error', min_frequency=None, max_categories=None)

In [46]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['TV', '냉장고', '전자렌지', '컴퓨터', '선풍기', 
         '선풍기', '믹서', '믹서']

In [59]:
# 먼저 숫자값으로 변환을 위해 LabelEncoder로 변환
encoder = LabelEncoder()
encoder.fit(items)
labels = encoder.transform(items)

# 2차원 데이터로 변환
labels = labels.reshape(-1, 1)


In [60]:
# 원-핫 인코딩 적용
oh_encoder = OneHotEncoder()
oh_encoder.fit(labels)
oh_labels = oh_encoder.transform(labels)
print('원-핫 인코딩 데이터')
print(oh_labels.toarray())
print('원-핫 인코딩 데이터 차원')
print(oh_labels.shape)

원-핫 인코딩 데이터
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
원-핫 인코딩 데이터 차원
(8, 6)


In [61]:
import pandas as pd

df = pd.read_csv('../datasets/train.csv')
oh_encoder = OneHotEncoder()
trans_data = oh_encoder.fit_transform(df['Sex'])
trans_data.toarray()


ValueError: Expected 2D array, got 1D array instead:
array=['male' 'female' 'female' 'female' 'male' 'male' 'male' 'male' 'female'
 'female' 'female' 'female' 'male' 'male' 'female' 'female' 'male' 'male'
 'female' 'female' 'male' 'male' 'female' 'male' 'female' 'female' 'male'
 'male' 'female' 'male' 'male' 'female' 'female' 'male' 'male' 'male'
 'male' 'male' 'female' 'female' 'female' 'female' 'male' 'female'
 'female' 'male' 'male' 'female' 'male' 'female' 'male' 'male' 'female'
 'female' 'male' 'male' 'female' 'male' 'female' 'male' 'male' 'female'
 'male' 'male' 'male' 'male' 'female' 'male' 'female' 'male' 'male'
 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'female' 'male'
 'male' 'female' 'male' 'female' 'female' 'male' 'male' 'female' 'male'
 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'female' 'male'
 'female' 'male' 'male' 'male' 'male' 'male' 'female' 'male' 'male'
 'female' 'male' 'female' 'male' 'female' 'female' 'male' 'male' 'male'
 'male' 'female' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'female' 'male' 'male' 'male' 'female' 'female' 'male' 'male' 'female'
 'male' 'male' 'male' 'female' 'female' 'female' 'male' 'male' 'male'
 'male' 'female' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'female' 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'female' 'female' 'male' 'male' 'male' 'male' 'female' 'male' 'male'
 'male' 'male' 'female' 'male' 'male' 'female' 'male' 'male' 'male'
 'female' 'male' 'female' 'male' 'male' 'male' 'female' 'male' 'female'
 'male' 'female' 'female' 'male' 'male' 'female' 'female' 'male' 'male'
 'male' 'male' 'male' 'female' 'male' 'male' 'female' 'male' 'male'
 'female' 'male' 'male' 'male' 'female' 'female' 'male' 'female' 'male'
 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'female'
 'female' 'male' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'male' 'female' 'female' 'male' 'male' 'male' 'male' 'female' 'female'
 'male' 'male' 'male' 'female' 'male' 'male' 'female' 'female' 'female'
 'female' 'female' 'female' 'male' 'male' 'male' 'male' 'female' 'male'
 'male' 'male' 'female' 'female' 'male' 'male' 'female' 'male' 'female'
 'female' 'female' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'male' 'male' 'male' 'male' 'male' 'female' 'female' 'female' 'male'
 'female' 'male' 'male' 'male' 'female' 'male' 'female' 'female' 'male'
 'male' 'female' 'male' 'male' 'female' 'female' 'male' 'female' 'female'
 'female' 'female' 'male' 'male' 'female' 'female' 'male' 'female'
 'female' 'male' 'male' 'female' 'female' 'male' 'female' 'male' 'female'
 'female' 'female' 'female' 'male' 'male' 'male' 'female' 'male' 'male'
 'female' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'female'
 'female' 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'female' 'female' 'female' 'female' 'male' 'male' 'female' 'male' 'male'
 'male' 'female' 'female' 'female' 'female' 'male' 'male' 'male' 'male'
 'female' 'female' 'female' 'male' 'male' 'male' 'female' 'female' 'male'
 'female' 'male' 'male' 'male' 'female' 'male' 'female' 'male' 'male'
 'male' 'female' 'female' 'male' 'female' 'male' 'male' 'female' 'male'
 'male' 'female' 'male' 'female' 'male' 'male' 'male' 'male' 'female'
 'male' 'male' 'female' 'male' 'male' 'female' 'female' 'female' 'male'
 'female' 'male' 'male' 'male' 'female' 'male' 'male' 'female' 'female'
 'male' 'male' 'male' 'female' 'female' 'male' 'male' 'female' 'female'
 'female' 'male' 'male' 'female' 'male' 'male' 'female' 'male' 'male'
 'female' 'male' 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'male' 'female' 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'male' 'male' 'male' 'female' 'male' 'male' 'female' 'female' 'female'
 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'female' 'male'
 'female' 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'male' 'female' 'male' 'female' 'male' 'male' 'female' 'female' 'female'
 'female' 'male' 'female' 'male' 'male' 'male' 'male' 'male' 'male'
 'female' 'male' 'male' 'female' 'male' 'female' 'male' 'female' 'male'
 'male' 'female' 'male' 'male' 'female' 'male' 'male' 'male' 'female'
 'male' 'male' 'female' 'female' 'female' 'male' 'female' 'male' 'female'
 'female' 'female' 'female' 'male' 'male' 'male' 'female' 'male' 'male'
 'male' 'male' 'male' 'male' 'male' 'female' 'male' 'female' 'male'
 'female' 'female' 'male' 'male' 'male' 'male' 'female' 'male' 'male'
 'female' 'male' 'male' 'male' 'female' 'male' 'female' 'male' 'male'
 'female' 'female' 'female' 'male' 'female' 'female' 'male' 'male' 'male'
 'female' 'male' 'male' 'male' 'male' 'male' 'female' 'male' 'female'
 'male' 'male' 'female' 'male' 'male' 'male' 'female' 'male' 'male' 'male'
 'male' 'male' 'male' 'male' 'female' 'female' 'female' 'male' 'female'
 'male' 'male' 'female' 'male' 'female' 'female' 'male' 'male' 'male'
 'male' 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'male' 'male' 'female' 'female' 'male' 'male' 'female' 'male' 'male'
 'female' 'female' 'male' 'female' 'male' 'male' 'male' 'male' 'female'
 'male' 'female' 'male' 'female' 'female' 'male' 'male' 'female' 'male'
 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'female' 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'female'
 'female' 'male' 'female' 'male' 'male' 'male' 'male' 'male' 'male' 'male'
 'male' 'female' 'male' 'female' 'male' 'male' 'male' 'male' 'male'
 'female' 'male' 'male' 'female' 'male' 'female' 'male' 'male' 'male'
 'female' 'male' 'female' 'male' 'female' 'male' 'male' 'male' 'male'
 'male' 'female' 'female' 'male' 'male' 'female' 'male' 'male' 'male'
 'male' 'male' 'female' 'female' 'male' 'female' 'female' 'male' 'male'
 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male' 'male' 'female'
 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'female' 'male' 'male'
 'male' 'female' 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'male'
 'female' 'male' 'female' 'male' 'female' 'male' 'male' 'male' 'male'
 'female' 'male' 'female' 'male' 'male' 'female' 'male' 'female' 'female'
 'female' 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'male' 'female' 'male' 'male' 'male' 'female' 'female' 'male' 'female'
 'male' 'female' 'male' 'male' 'male' 'male' 'male' 'female' 'male'
 'female' 'male' 'male' 'male' 'female' 'male' 'male' 'female' 'male'
 'male' 'male' 'female' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'male' 'female' 'female' 'male' 'male' 'male' 'male' 'female' 'male'
 'male' 'male' 'male' 'male' 'male' 'female' 'male' 'male' 'male' 'male'
 'male' 'male' 'female' 'male' 'male' 'female' 'female' 'female' 'female'
 'female' 'male' 'female' 'male' 'male' 'male' 'female' 'female' 'male'
 'female' 'female' 'male' 'male' 'male' 'male' 'female' 'male' 'male'
 'female' 'female' 'male' 'male' 'male' 'female' 'female' 'male' 'female'
 'male' 'male' 'female' 'male' 'female' 'female' 'male' 'male'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

## pandas.get_dummies
pandas.get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None)

In [62]:
import pandas as pd 
df = pd.read_csv('../datasets/train.csv')
df.drop(columns = ['PassengerId','Pclass','Name','SibSp', 'Parch','Ticket','Fare','Cabin'],inplace=True)

In [63]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
df["Embarked"] =df['Embarked'].fillna('S')


In [65]:
data = df.loc[:, 'Embarked']
data

0      S
1      C
2      S
3      S
4      S
      ..
886    S
887    S
888    S
889    C
890    Q
Name: Embarked, Length: 891, dtype: object

In [66]:
dummy_embark = pd.get_dummies(data)
dummy_embark

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [69]:
dummy_data = pd.get_dummies(df[['Sex', "Embarked"]])
dummy_data

Unnamed: 0,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,1,0,0,1
1,1,0,1,0,0
2,1,0,0,0,1
3,1,0,0,0,1
4,0,1,0,0,1
...,...,...,...,...,...
886,0,1,0,0,1
887,1,0,0,0,1
888,1,0,0,0,1
889,0,1,1,0,0


In [76]:
df_concat = pd.concat([df, dummy_data], axis=1)
df_concat.drop(columns = ['Sex', 'Embarked'], inplace=True)
df_concat

Unnamed: 0,Survived,Age,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.000000,0,1,0,0,1
1,1,38.000000,1,0,1,0,0
2,1,26.000000,1,0,0,0,1
3,1,35.000000,1,0,0,0,1
4,0,35.000000,0,1,0,0,1
...,...,...,...,...,...,...,...
886,0,27.000000,0,1,0,0,1
887,1,19.000000,1,0,0,0,1
888,0,29.699118,1,0,0,0,1
889,1,26.000000,0,1,1,0,0
