## 5.1 순서가 없는 범주형 특성 인코딩하기

In [None]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

In [3]:
# One categorical feature (a state name) per observation, shaped as a column.
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)

In [4]:
# Binarizer used in the following cells to one-hot encode the single feature.
one_hot = LabelBinarizer()

In [5]:
# Learn the classes and return one indicator column per class.
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [7]:
# Classes learned during fitting; their order matches the encoded columns.
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [8]:
# Round trip: encode the feature, then decode it back to the original labels.
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [9]:
import pandas as pd

In [12]:
# One-hot encode the state column with pandas. `dtype=int` pins the 0/1
# indicator values shown in the recorded output: since pandas 2.0 the default
# indicator dtype is bool, which would display True/False instead.
pd.get_dummies(feature[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


In [17]:
# Each observation carries several labels at once (one tuple per observation).
# NOTE(review): "Delware" looks like a typo for "Delaware". It is kept as-is
# because the recorded `classes_` output further down shows the same spelling.
multiclass_feature = [
    ("Texas", "Florida"),
    ("California", "Alabama"),
    ("Texas", "Florida"),
    ("Delware", "Florida"),
    ("Texas", "Alabama"),
]

In [18]:
# Binarizer for observations that each hold more than one label.
one_hot_multiclass = MultiLabelBinarizer()

In [19]:
# One indicator column per distinct label; a row has a 1 for every label in its tuple.
one_hot_multiclass.fit_transform(multiclass_feature)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [20]:
# Labels learned during fitting, in the order of the encoded columns.
one_hot_multiclass.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'],
      dtype=object)

In [22]:
from sklearn.preprocessing import OneHotEncoder

In [23]:
# Two categorical columns per observation: a state name and a numeric code.
# Mixing str and int in one array stores every value as a string.
feature = np.array(list(zip(
    ["Texas", "California", "Texas", "Delaware", "Texas"],
    [1, 1, 3, 1, 1],
)))

In [24]:
# One-hot encode both columns and display the result as a dense array.
# The encoder keeps its default sparse output and the result is densified
# with `.toarray()`: the `sparse=False` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, while its replacement `sparse_output`
# does not exist before 1.2, so this form runs on both sides of that change.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit_transform(feature).toarray()

array([[0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 0., 1., 1., 0.]])

In [26]:
# Categories learned for each input column, listed in column order.
one_hot_encoder.categories_

[array(['California', 'Delaware', 'Texas'], dtype='<U10'),
 array(['1', '3'], dtype='<U10')]

## 5.2 순서가 있는 범주형 특성 인코딩하기

In [27]:
# Single ordinal column "Score" with three levels.
dataframe = pd.Series(
    ["Low", "Low", "Medium", "Medium", "High"],
    name="Score",
).to_frame()

In [28]:
# Numeric rank for every ordinal level, lowest to highest.
scale_mapper = dict(Low=1, Medium=2, High=3)

In [29]:
# Encode the ordinal levels as integers. `map` is a plain dictionary lookup,
# so the int64 result does not depend on the object-dtype downcasting in
# `replace` that pandas 2.2 deprecated. Every level in the column has an entry
# in the mapper; a level without one would come back as NaN.
dataframe["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [30]:
# Same ordinal column, now with an extra level that sits just above "Medium".
dataframe = pd.Series(
    ["Low", "Low", "Medium", "Medium", "High", "Barely More Than Medium"],
    name="Score",
).to_frame()

# Rank the levels 1..4 in ascending order, giving the new level its own step.
scale_mapper = {
    level: rank
    for rank, level in enumerate(
        ("Low", "Medium", "Barely More Than Medium", "High"), start=1
    )
}

In [31]:
# Encode with the four-level mapping. `map` is a plain dictionary lookup, so
# the int64 result does not depend on the object-dtype downcasting in
# `replace` that pandas 2.2 deprecated. A level missing from the mapper would
# come back as NaN.
dataframe["Score"].map(scale_mapper)

0    1
1    1
2    2
3    2
4    4
5    3
Name: Score, dtype: int64

In [32]:
# Same levels, but the in-between level gets a fractional rank (2.1) so it
# sits close to "Medium" instead of a full step away from it.
scale_mapper = dict(zip(
    ("Low", "Medium", "Barely More Than Medium", "High"),
    (1, 2, 2.1, 3),
))

In [33]:
# Encode with the fractional mapping; the result is float64 because of the
# 2.1 rank. `map` is a plain dictionary lookup, so the numeric result does not
# depend on the object-dtype downcasting in `replace` that pandas 2.2
# deprecated. A level missing from the mapper would come back as NaN.
dataframe["Score"].map(scale_mapper)

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [34]:
from sklearn.preprocessing import OrdinalEncoder

In [35]:
# Two columns per observation: an ordinal level and a number. Mixing str and
# int in one array stores every value as a string.
features = np.array(list(zip(
    ["Low", "High", "Medium"],
    [10, 50, 3],
)))

In [36]:
# Replace each column's categories with integer codes. As the recorded output
# shows, the codes follow the sorted order of the string categories, so
# "High" becomes 0 and the numeric column is ordered as text ('10', '3', '50').
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit_transform(features)

array([[1., 0.],
       [0., 2.],
       [2., 1.]])

In [37]:
# Categories per column; a category's position in its array is its integer code.
ordinal_encoder.categories_

[array(['High', 'Low', 'Medium'], dtype='<U6'),
 array(['10', '3', '50'], dtype='<U6')]

## 5.3 특성 딕셔너리를 인코딩하기

In [38]:
from sklearn.feature_extraction import DictVectorizer

In [39]:
# One dictionary per observation, mapping a feature name to its value.
# Not every observation has every feature.
data_dict = [
    dict(Red=2, Blue=4),
    dict(Red=4, Blue=3),
    dict(Red=1, Yellow=2),
    dict(Red=2, Yellow=2),
]

In [40]:
# Vectorizer for lists of feature dictionaries; sparse=False requests a dense
# array, which is what the recorded outputs below display.
dictvectorizer = DictVectorizer(sparse=False)

In [41]:
# Turn the dictionaries into a feature matrix with one column per key.
features = dictvectorizer.fit_transform(data_dict)

In [42]:
# Show the matrix; keys absent from an observation appear as 0.
features

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

In [43]:
# Column names, in the same order as the columns of the encoded matrix.
# They are read from the fitted `feature_names_` attribute because
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# The attribute is a plain list, which matches the recorded output.
feature_names = dictvectorizer.feature_names_
feature_names

['Blue', 'Red', 'Yellow']

In [44]:
# Wrap the encoded matrix in a DataFrame labelled with the feature names.
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


In [45]:
# Word counts for four documents, one dictionary per document.
doc_1_word_count = dict(Red=2, Blue=4)
doc_2_word_count = dict(Red=4, Blue=3)
doc_3_word_count = dict(Red=1, Yellow=2)
doc_4_word_count = dict(Red=2, Yellow=2)

In [46]:
# Collect the per-document dictionaries in document order.
doc_word_counts = [
    doc_1_word_count, doc_2_word_count, doc_3_word_count, doc_4_word_count,
]

In [47]:
# Refit the same vectorizer on the word counts: one column per word, one row per document.
dictvectorizer.fit_transform(doc_word_counts)

array([[4., 2., 0.],
       [3., 4., 0.],
       [0., 1., 2.],
       [0., 2., 2.]])

## 5.4 누락된 클래스 값 대체하기

In [48]:
from sklearn.neighbors import KNeighborsClassifier

In [49]:
# Fully observed data: column 0 is the class (0 or 1), columns 1-2 are features.
X = np.column_stack((
    [0, 1, 0, 1],
    [2.10, 1.18, 1.22, -0.21],
    [1.45, 1.33, 1.27, -1.19],
))


In [50]:
# Observations whose class (column 0) is missing; both features are present.
X_with_nan = np.column_stack((
    np.full(2, np.nan),
    [[0.87, 1.31],
     [-0.67, -0.22]],
))

In [51]:
# Train a 3-nearest-neighbour classifier (distance-weighted) that predicts the
# class in column 0 from the remaining feature columns.
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

In [52]:
# Predict the missing class from the feature columns of the incomplete rows.
imputed_values = trained_model.predict(X_with_nan[:, 1:])

In [53]:
# Put the predicted classes back in front of their feature columns.
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

In [54]:
# Imputed rows first, then the originally complete rows.
np.concatenate((X_with_imputed, X), axis=0)

array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

In [59]:
from sklearn.impute import SimpleImputer

In [62]:
# Full data set with the missing classes still NaN: incomplete rows first.
X_complete = np.concatenate((X_with_nan, X), axis=0)

In [63]:
# Simpler alternative to the classifier: fill missing entries using the
# 'most_frequent' strategy. In the recorded output both missing classes become 0.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(X_complete)

array([[ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

## 5.5 불균형한 클래스 다루기

In [64]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

In [65]:
# Build an imbalanced binary problem from iris: drop the first 40 observations,
# then keep class 0 as 0 and relabel every other class as 1. The recorded
# output shows class 0 ends up as the small minority.
iris = load_iris()
features = iris.data[40:, :]
target = np.where(iris.target[40:] == 0, 0, 1)
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [66]:
# Explicit class weights: class 0 is weighted 0.9 and class 1 is weighted 0.1.
weights = {0: .9, 1: 0.1}
RandomForestClassifier(class_weight=weights)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 0.9, 1: 0.1}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=None, verbose=0, warm_start=False)

In [67]:
# Use the built-in "balanced" setting instead of an explicit weight dictionary.
RandomForestClassifier(class_weight="balanced")

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [68]:
# Positions of the observations belonging to each class.
i_class0 = np.flatnonzero(target == 0)
i_class1 = np.flatnonzero(target == 1)

In [69]:
# Number of observations in each class.
n_class0, n_class1 = len(i_class0), len(i_class1)

In [70]:
# Downsample: draw as many class-1 positions as there are class-0 observations,
# without replacement. No seed is set in the cells shown here, so the drawn
# sample can differ between runs.
i_class1_downsampled = np.random.choice(
    i_class1,
    size=n_class0,
    replace=False,
)

In [71]:
# Balanced target vector: all of class 0 followed by the sampled class-1 labels.
np.concatenate((target[i_class0], target[i_class1_downsampled]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [72]:
# Matching feature rows in the same order; show only the first five.
np.concatenate((features[i_class0], features[i_class1_downsampled]), axis=0)[:5]

array([[5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4]])

In [73]:
# Upsample: draw class-0 positions with replacement until there are as many
# as class-1 observations.
i_class0_upsampled = np.random.choice(i_class0, size = n_class1, replace = True)

In [74]:
# Balanced target vector: the resampled class-0 labels followed by all of class 1.
np.hstack((target[i_class0_upsampled], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [75]:
# Matching feature rows in the same order; show only the first five.
np.concatenate((features[i_class0_upsampled], features[i_class1]), axis=0)[:5]

array([[4.5, 2.3, 1.3, 0.3],
       [5.1, 3.8, 1.9, 0.4],
       [4.4, 3.2, 1.3, 0.2],
       [5.1, 3.8, 1.9, 0.4],
       [5. , 3.5, 1.6, 0.6]])