In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd  

# Apply seaborn's default plot theme to subsequent matplotlib figures.
sns.set()
%matplotlib inline

## One-hot エンコーディング

In [4]:
# One-hot encode a single categorical column with scikit-learn.
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer

# Five samples with one string value each, arranged as a (5, 1) column.
feature = np.array(
    ["Texas", "California", "Texas", "Delaware", "Texas"]
).reshape(-1, 1)

one_hot = LabelBinarizer()

# Learn the set of classes and return the indicator matrix in one step.
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [6]:
# Class labels learned during fitting; column i of the encoded matrix
# corresponds to classes_[i].
one_hot.classes_

array(['California', 'Delaware', 'Texas'], dtype='<U10')

In [8]:
# Inverse transform: recover the original labels from the one-hot encoding.
encoded = one_hot.transform(feature)
one_hot.inverse_transform(encoded)

array(['Texas', 'California', 'Texas', 'Delaware', 'Texas'], dtype='<U10')

In [11]:
# One-hot encode with pandas. drop_first=True drops the first category
# ("California") so the remaining indicator columns are not redundant.
# dtype is pinned explicitly: pandas >= 2.0 returns bool columns by default,
# while uint8 reproduces the 0/1 integer output of earlier versions.
pd.get_dummies(feature[:, 0], drop_first=True, dtype=np.uint8)

Unnamed: 0,Delaware,Texas
0,0,1
1,0,0
2,0,1
3,1,0
4,0,1


In [17]:
# Multi-label case: every sample carries a tuple of labels (of varying length).

# "Delaware" was previously misspelled "Delware", which would have been
# learned as a class name distinct from the "Delaware" used elsewhere.
multiclass_feature = [("Texas", "Florida"),
                      ("California", "Alabama"),
                      ("Texas", "Florida", 'Asia'),
                      ("Delaware", "Florida"),
                      ("Texas", "Alabama")]

one_hot_multi = MultiLabelBinarizer()

# One indicator column per distinct label; a row has a 1 in every column
# whose label appears in that sample's tuple.
one_hot_multi.fit_transform(multiclass_feature)

array([[0, 0, 0, 0, 1, 1],
       [1, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 1, 1],
       [0, 0, 0, 1, 1, 0],
       [1, 0, 0, 0, 0, 1]])

## さまざまな型のデータを特徴量に変換

In [19]:
# Convert an ordinal (ordered categorical) feature to numbers.
df = pd.DataFrame({"Score": ["Low", "Low", "Medium", "Medium", "High"]})

# Explicit category -> rank mapping that preserves Low < Medium < High.
scale_mapper = {'Low': 0, 'Medium': 1, 'High': 2}

# Map only the "Score" column instead of calling replace() on the whole frame:
# a frame-wide replace would also rewrite matching values in any other column
# and triggers a downcasting FutureWarning in recent pandas releases.
# Note: values missing from scale_mapper become NaN rather than passing through.
df_encoded = df.assign(Score=df["Score"].map(scale_mapper))

df_encoded

Unnamed: 0,Score
0,0
1,0
2,1
3,1
4,2


In [26]:
# Convert a list of dicts into a feature matrix (one column per key).
from sklearn.feature_extraction import DictVectorizer

data_dict = [{"Red": 2, "Blue": 4},
             {"Red": 4, "Blue": 3},
             {"Red": 1, "Yellow": 2},
             {"Red": 2, "Yellow": 2}]

# sparse=False returns a dense ndarray; with sparse=True the result is a
# scipy sparse matrix, which is more memory-efficient.
dictvect = DictVectorizer(sparse=False)

f = dictvect.fit_transform(data_dict)

# Column labels are read from the fitted `feature_names_` attribute.
# get_feature_names() was removed in scikit-learn 1.2, whereas this attribute
# is available on both old and new releases.
pd.DataFrame(f, columns=dictvect.feature_names_)

Unnamed: 0,Blue,Red,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


## 欠損データの補完

In [39]:
# Impute missing values by predicting them with k-nearest neighbours.
from sklearn.neighbors import KNeighborsClassifier

# Complete rows: column 0 holds the value to be predicted, columns 1-2 the predictors.
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

# Rows in which column 0 is missing.
X_with_nan = np.array([[np.nan, 0.87, 1.31],
                       [np.nan, -0.67, -0.22]])

clf = KNeighborsClassifier(n_neighbors=3, weights='distance')

# Column 0 is the one with gaps, so fit a model that predicts it from the other columns.
known_features, known_labels = X[:, 1:], X[:, 0]
trained_model = clf.fit(known_features, known_labels)

query_features = X_with_nan[:, 1:]
imputed_values = trained_model.predict(query_features)

# Put the predicted column back in front of the observed columns.
X_with_imputed = np.column_stack((imputed_values, query_features))

# Stack the complete rows and the imputed rows into one matrix.
np.concatenate((X, X_with_imputed), axis=0)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22]])

In [41]:
# Cheaper alternative: fill each missing entry with its column's most frequent value.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; SimpleImputer
# is its replacement and always imputes column-wise (the former axis=0).
from sklearn.impute import SimpleImputer

X_complete = np.vstack((X, X_with_nan))

imputer = SimpleImputer(strategy='most_frequent')

imputer.fit_transform(X_complete)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

## 偏りがあるサンプリングを補正

In [51]:
# Build a deliberately imbalanced data set from iris.

from sklearn.datasets import load_iris

iris = load_iris()

# Drop the first 40 samples so that only a handful of class-0 rows remain.
feature = iris.data[40:]

# Collapse to a binary target: class 0 stays 0, every other class becomes 1.
target = np.where(iris.target[40:] == 0, 0, 1)

target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [60]:
# Down-sample the majority class to the size of the minority class.

# Indices of the samples belonging to each class.
i_class0 = np.where(target == 0)[0]
i_class1 = np.where(target == 1)[0]

n_class0 = len(i_class0)
n_class1 = len(i_class1)

# Seeded generator so the same rows are drawn on every run (the draw was
# previously unseeded and therefore not reproducible).
rng_down = np.random.RandomState(0)

# Draw, without replacement, as many class-1 indices as there are class-0 samples.
i_class1_down = rng_down.choice(i_class1, size=n_class0, replace=False)

np.hstack((target[i_class0], target[i_class1_down]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [64]:
# Conversely, up-sample the minority class by drawing with replacement.

# Seeded generator so the same rows are drawn on every run (the draw was
# previously unseeded and therefore not reproducible).
rng_up = np.random.RandomState(0)

# Draw class-0 indices with replacement until they match the class-1 count.
i_class0_up = rng_up.choice(i_class0, size=n_class1, replace=True)

np.hstack((target[i_class0_up], target[i_class1]))

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])