In [3]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [4]:
# Toy dataset, one row per item: color (nominal), size (ordinal),
# price (numeric) and class label. Column names are assigned in a later cell.
df = pd.DataFrame(
    [
        ('green', 'M', 1000, 'class1'),
        ('red', 'L', 1500, 'class2'),
        ('blue', 'XL', 2000, 'class1'),
        ('orange', 'M', 1000, 'class3'),
        ('orange', 'L', 1000, 'class2'),
    ]
)

In [5]:
# Give the four positional columns descriptive names.
df.columns = pd.Index(['color', 'size', 'price', 'classlabel'])

In [6]:
# Display the frame to confirm the column names were applied.
df

Unnamed: 0,color,size,price,classlabel
0,green,M,1000,class1
1,red,L,1500,class2
2,blue,XL,2000,class1
3,orange,M,1000,class3
4,orange,L,1000,class2


## ベタ書きでエンコーディング

In [7]:
# Hand-written ordinal codes for the size strings (M < L < XL).
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1,
}

In [8]:
# Replace the size strings with their ordinal codes from size_mapping.
# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time (when the column already holds the integer
# codes) would wipe the column. Only map while the column is still non-numeric,
# which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['size']):
    df['size'] = df['size'].map(size_mapping)

In [9]:
# Display the frame after the size column was mapped through size_mapping.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,1000,class1
1,red,2,1500,class2
2,blue,3,2000,class1
3,orange,1,1000,class3
4,orange,2,1000,class2


## クラス値の種類を自動で読み込んでエンコーディング

In [10]:
# Build the label -> integer code table from the data itself: np.unique
# returns the distinct class labels in sorted order and enumerate numbers
# them from 0.
class_mapping = {
    name: code
    for code, name in enumerate(np.unique(df['classlabel']))
}

In [11]:
# Inspect the generated label -> integer code mapping.
class_mapping

{'class1': 0, 'class2': 1, 'class3': 2}

In [12]:
# Encode the class labels as integers via class_mapping.
# Series.map yields NaN for values that are not keys of the mapping, so
# re-running this cell on an already-encoded column (with the string-keyed
# mapping still in memory) would destroy it. Skip the mapping once the
# column is numeric so the cell is safe to re-run.
if not pd.api.types.is_numeric_dtype(df['classlabel']):
    df['classlabel'] = df['classlabel'].map(class_mapping)

In [13]:
# Display the frame after classlabel was mapped through class_mapping.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,1000,0
1,red,2,1500,1
2,blue,3,2000,0
3,orange,1,1000,2
4,orange,2,1000,1


## 元の文字列に戻す

In [14]:
# Invert class_mapping (integer code -> original label) by pairing its
# values with its keys; both views iterate in the same order.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))

In [15]:
# Inspect the inverted integer code -> label mapping.
inv_class_mapping

{0: 'class1', 1: 'class2', 2: 'class3'}

In [16]:
# Restore the original string labels via inv_class_mapping.
# The inverse table is keyed by integer codes, so mapping a column that
# already holds strings would turn every entry into NaN. Only decode while
# the column is numeric, which keeps the cell safe to re-run.
if pd.api.types.is_numeric_dtype(df['classlabel']):
    df['classlabel'] = df['classlabel'].map(inv_class_mapping)

In [17]:
# Display the frame after classlabel was mapped back through inv_class_mapping.
df

Unnamed: 0,color,size,price,classlabel
0,green,1,1000,class1
1,red,2,1500,class2
2,blue,3,2000,class1
3,orange,1,1000,class3
4,orange,2,1000,class2


## sklearn の LabelEncoder

In [21]:
# Encoder that will turn the color strings into integer codes.
color_le = LabelEncoder()
# Feature matrix as a NumPy array: color, size, price (classlabel is left out).
X = df.loc[:, ['color', 'size', 'price']].to_numpy()

In [28]:
# Display the feature matrix.
# NOTE(review): this cell carries execution count 28 while the cells after it
# are 22-25, and the recorded output already shows integer codes in column 0.
# It looks like the cell was run after the LabelEncoder cell further down, so
# the output is stale for this position -- confirm with Restart & Run All.
X

array([[1, 1, 1000],
       [3, 2, 1500],
       [0, 3, 2000],
       [2, 1, 1000],
       [2, 2, 1000]], dtype=object)

In [22]:
# Learn the distinct color values, then overwrite column 0 of X with their
# integer codes.
X[:, 0] = color_le.fit(X[:, 0]).transform(X[:, 0])

In [23]:
# Display X after column 0 was overwritten with the LabelEncoder output.
X

array([[1, 1, 1000],
       [3, 2, 1500],
       [0, 3, 2000],
       [2, 1, 1000],
       [2, 2, 1000]], dtype=object)

## one-hot エンコーディング

In [24]:
# One-hot encode column 0 (the label-encoded color) and keep size and price.
# OneHotEncoder's `categorical_features` argument was removed in
# scikit-learn 0.22; selecting which columns to encode is now the job of
# ColumnTransformer.
ohe = ColumnTransformer(
    transformers=[('color', OneHotEncoder(), [0])],
    remainder='passthrough',  # size and price are appended after the dummy columns
    sparse_threshold=0,       # always return a dense ndarray instead of a sparse matrix
)
# X has dtype=object, so the stacked result does too; cast to float to get a
# plain numeric matrix.
ohe.fit_transform(X).astype(float)

array([[  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   1.00000000e+03],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00000000e+00,   2.00000000e+00,   1.50000000e+03],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.00000000e+00,   2.00000000e+03],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   1.00000000e+00,   1.00000000e+03],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          0.00000000e+00,   2.00000000e+00,   1.00000000e+03]])

In [25]:
# One-hot encode the string-valued color column with pandas; price and size
# are carried over unchanged.
# dtype=float pins numeric 0.0/1.0 indicators: pandas >= 2.0 returns bool
# dummies by default, which would not match the float output recorded below.
# columns=['color'] states explicitly which column is expanded.
pd.get_dummies(df[['price', 'color', 'size']], columns=['color'], dtype=float)

Unnamed: 0,price,size,color_blue,color_green,color_orange,color_red
0,1000,1,0.0,1.0,0.0,0.0
1,1500,2,0.0,0.0,0.0,1.0
2,2000,3,1.0,0.0,0.0,0.0
3,1000,1,0.0,0.0,1.0,0.0
4,1000,2,0.0,0.0,1.0,0.0
