## 4.1 특성 스케일 바꾸기

In [2]:
import numpy as np
from sklearn import preprocessing

In [3]:
feature = np.array([[-500.5],
                   [-100.1],
                   [0],
                   [100.1],
                   [900.9]])

In [4]:
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0,1))

In [5]:
scaled_feature = minmax_scale.fit_transform(feature)

In [6]:
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [7]:
preprocessing.MinMaxScaler().fit_transform(feature[:3])

array([[0. ],
       [0.8],
       [1. ]])

In [8]:
preprocessing.MinMaxScaler().fit_transform(feature[3:])

array([[0.],
       [1.]])

In [9]:
preprocessing.MinMaxScaler().fit_transform(feature[:3])

array([[0. ],
       [0.8],
       [1. ]])

In [10]:
preprocessing.MinMaxScaler().fit_transform(feature[3:])

array([[0.],
       [1.]])

In [12]:
scaler = preprocessing.MinMaxScaler().fit(feature[:3])
scaler.transform(feature[:3])

array([[0. ],
       [0.8],
       [1. ]])

In [13]:
scaler.transform(feature[3:])

array([[1.2],
       [2.8]])

## 4.2 특성을 표준화하기

In [42]:
x = np.array([[-1000.1],
                   [-200.2],
                   [500.5],
                   [600.6],
                   [9000.9]])

In [32]:
scaler=preprocessing.StandardScaler()

In [33]:
standardized = scaler.fit_transform(x)

In [34]:
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [35]:
print("평균:", round(standardized.mean()))

평균: 0.0


In [36]:
print("표준편차:", standardized.std())

표준편차: 1.0


In [37]:
robust_scaler = preprocessing.RobustScaler()

In [44]:
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

In [39]:
interquatile_range = x[3] - x[1]

In [40]:
(x - np.median(x)) / interquatile_range

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

In [43]:
preprocessing.QuantileTransformer().fit_transform(x)

  % (self.n_quantiles, n_samples))


array([[0.  ],
       [0.25],
       [0.5 ],
       [0.75],
       [1.  ]])

## 4.3 정규화하기

In [53]:
from sklearn.preprocessing import Normalizer

In [55]:
features = np.array([[0.5, 0.5],
                    [1.1, 3.4],
                    [1.5, 20.2],
                    [1.63, 34.4],
                    [10.9, 3.3]])

normalizer = Normalizer(norm="l2")

In [56]:
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [57]:
features_l2_norm = Normalizer(norm="l2").transform(features)

In [58]:
features_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [59]:
features_l1_norm = Normalizer(norm="l1").transform(features)

In [60]:
features_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [61]:
print("첫 번째 샘플값의 합:",
     features_l1_norm[0,0] + features_l1_norm[0,1])

첫 번째 샘플값의 합: 1.0


In [62]:
features / np.sum(np.abs(features), axis=1, keepdims=True)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [64]:
features / np.sqrt(np.sum(np.square(features), axis=1, keepdims=True))

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [65]:
Normalizer(norm="max").transform(features)

array([[1.        , 1.        ],
       [0.32352941, 1.        ],
       [0.07425743, 1.        ],
       [0.04738372, 1.        ],
       [1.        , 0.30275229]])

## 4.4 다항 특성과 교차항 특성 생성하기

In [66]:
from sklearn.preprocessing import PolynomialFeatures

In [68]:
features = np.array([[2,3],
                   [2,3],
                   [2,3]])

In [69]:
polynomial_interaction = PolynomialFeatures(degree=2, include_bias=False)

In [71]:
polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [72]:
interaction = PolynomialFeatures(degree=2,
                                interaction_only=True, include_bias=False)

In [74]:
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [75]:
polynomial_bias = PolynomialFeatures(degree=2,
                                    include_bias=True).fit(features)

In [76]:
polynomial_bias.transform(features)

array([[1., 2., 3., 4., 6., 9.],
       [1., 2., 3., 4., 6., 9.],
       [1., 2., 3., 4., 6., 9.]])

In [77]:
polynomial_bias.get_feature_names()

['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2']

## 4.5 특성 변환하기

In [78]:
from sklearn.preprocessing import FunctionTransformer

In [79]:
features = np.array([[2,3],
                    [2,3],
                    [2,3]])

In [80]:
def add_ten(x):
    return x +10

In [81]:
ten_transformer = FunctionTransformer(add_ten)

In [82]:
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [85]:
df=pd.DataFrame(features, columns=["feature_1", "feature_2"])

In [86]:
df.apply(add_ten)

Unnamed: 0,feature_1,feature_2
0,12,13
1,12,13
2,12,13


In [87]:
FunctionTransformer(add_ten, validate=False).transform(np.array([1,2,3]))

array([11, 12, 13])

In [89]:
from sklearn.compose import ColumnTransformer

In [90]:
def add_hundred(x):
    return x+100

In [91]:
ct=ColumnTransformer(
[("add_ten", FunctionTransformer(add_ten, validate=True),['feature_1']),
("add_hundred", FunctionTransformer(add_hundred, validate=True),
['feature_2'])])

In [93]:
ct.fit_transform(df)

array([[ 12, 103],
       [ 12, 103],
       [ 12, 103]])

In [95]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [96]:
features, _ = make_blobs(n_samples = 10,
                        n_features=2,
                        centers=1,
                        random_state=1)

In [97]:
features[0,0] = 10000
features[0,1] = 10000

In [98]:
outlier_detector = EllipticEnvelope(contamination=.1)

In [99]:
outlier_detector.fit(features)
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [105]:
feature=features[:0]

In [103]:
def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr*1.5)
    upper_bound=q3 + (iqr*1.5)
    return np.where((x>upper_bound) | (x <lower_bound))

In [104]:
indicies_of_outliers(feature)

(array([0, 4], dtype=int64), array([0, 0], dtype=int64))

## 4.7 이상치 다루기

In [106]:
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2,3.5,2,116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

In [107]:
houses[houses['Bathrooms']<20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [108]:
houses["Outlier"] = np.where(houses["Bathrooms"]<20,0,1)

In [109]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [111]:
houses["Log_of_Square_Feet"]=[np.log(x) for x in houses["Square_Feet"]]

In [112]:
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


## 4.8 특성 이산화하기

In [114]:
age = np.array([[6],
              [12],
                [20],
                [36],
                [65]])

In [115]:
np.digitize(age,bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [116]:
np.digitize(age, bins=[20,30,64], right=True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [117]:
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

In [118]:
from sklearn.preprocessing import KBinsDiscretizer

In [121]:
kb=KBinsDiscretizer(4, encode='ordinal', strategy = 'quantile')

kb.fit_transform(age)

array([[0.],
       [1.],
       [2.],
       [3.],
       [3.]])

In [124]:
kb=KBinsDiscretizer(4, encode='onehot-dense', strategy = 'uniform')


In [125]:
kb.fit_transform(age)

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [126]:
kb.bin_edges_

array([array([ 6.  , 20.75, 35.5 , 50.25, 65.  ])], dtype=object)

## 4.9 군집으로 샘플을 그룹으로 묶기

In [128]:
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [129]:
features,_ = make_blobs(n_samples = 50,
                       n_features = 2,
                       centers=3,
                       random_state=1)

In [130]:
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

In [131]:
clusterer = KMeans(3, random_state=0)

In [132]:
clusterer.fit(features)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [133]:
dataframe["group"] = clusterer.predict(features)

In [134]:
dataframe.head(5)

Unnamed: 0,feature_1,feature_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


## 4.10 누락된 값을 가진 샘플을 삭제하기

In [142]:
features = np.array([[1.1, 11.1],
                    [2.2, 22.2],
                    [3.3, 33.3],
                    [4.4, 44.4],
                    [np.nan, 55]])

In [143]:
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

In [143]:
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])