In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import sys 
# Put the parent directory on the import path so that `preamble` can be
# resolved — presumably preamble.py lives one level up; verify.
sys.path.append('..')
# NOTE(review): later cells use `pd`, `mglearn` and `display` without importing
# them, so this wildcard import presumably supplies them — confirm in preamble.
from preamble import *

## 4. Representing Data and Engineering Features

- 특성(Feature)의 종류
  - Continuous Feature (연속형 특성)
    - 정량적
    - 예
      - 픽셀 밝기
      - 붓꽃 측정값
  - Categorical Feature (범주형 특성) or Discrete Feature (이산형 특성)
    - 정성적
    - 예
      - 제품의 브랜드
      - 색상
      - 판매분류(책, 옷, 하드웨어)
- Feature Engineering (특성 공학)
  - 특정 애플리케이션에 가장 적합한 데이터의 표현을 찾는 것
  - 일반적으로 데이터가 어떤 형태로 구성되어 있는가보다 데이터를 어떻게 표현하는가가 머신러닝 모델의 성능에 더 많은 영향을 줌
  - 올바른 데이터 표현은 지도학습 모델에서 적절한 매개변수를 선택하는 것보다 성능에 더 많은 영향을 줌

### 4.1 Categorical Variables

- Adult Data Set
  - https://archive.ics.uci.edu/ml/datasets/adult
  - 1994년 인구 조사 데이터베이스에서 추출한 미국 성인의 소득 데이터셋
  - 특성(Feature)
    - 연속형 특성
      - 근로자 나이(age)
      - 주당 근로시간(hours-per-week)
    - 범주형 특성
      - 고용형태(workclass)
        - 자영업(self-emp-not-inc)
        - 사업체 근로자(private)
        - 공공 근로자(state-gov)
      - 교육수준(education)
        - 학사(Bachelors)
        - 석사(Masters)
        - ...
      - 성별(gender)
      - 직업(occupation)
- 풀려는 문제
  - 분류 문제: 어떤 근로자의 수입이 50,000달러를 초과하는지 그 이하인지를 예측 
  - 타깃 특성: income
    - <=50K
    - \>50K
- 사용하려는 머신러닝 모델
  - 로지스틱 회귀
  - 문제
    - 범주형 특성값을 로지스틱 회귀식에 곧바로 넣을 수가 없음

#### 4.1.1 One-Hot-Encoding (Dummy variables)
- One-out-of-N encoding
- Dummy variable

- pandas를 이용하여 데이터를 로드하고 범주형 변수를 원-핫 인코딩으로 변경

In [7]:
import os

# adult.data ships without a header row, so the column labels are supplied
# by hand and pandas is told not to look for one.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
data = pd.read_csv(adult_path, header=None, index_col=False, names=column_names)

# Keep only a handful of columns so the example stays small.
selected_columns = ['age', 'workclass', 'education', 'gender',
                    'hours-per-week', 'occupation', 'income']
data = data[selected_columns]

# display() gives the nicely formatted notebook rendering of the first rows.
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


##### Checking string-encoded categorical data

- 열(Column)의 내용 확인
  - value_counts()
    - 각 유일한 값들이 몇 번 출현하는지 출력 

In [5]:
# Count how often each distinct value occurs in the gender column.
print(data["gender"].value_counts())

 Male      21790
 Female    10771
Name: gender, dtype: int64


In [8]:
# Column labels before any encoding, and how many of them there are.
original_features = list(data.columns)
print("Original features:", original_features)
print("length of features:", len(original_features))

Original features: ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']
length of features: 7


- pandas.get_dummies(data)
  - 주어진 data에서 문자열(object) 또는 category 타입의 열을 자동으로 원-핫 인코딩된 수치형 열로 변환 (수치형 열은 그대로 유지)

In [9]:
# One-hot encode the frame, then report the resulting column labels.
data_dummies = pd.get_dummies(data)
dummy_features = list(data_dummies.columns)
print("Features after get_dummies:\n", dummy_features)
print("length of dummy features:", len(dummy_features))

Features after get_dummies:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupatio

In [10]:
# Show the first ten rows of the encoded frame.
display(data_dummies.head(10))

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,...,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,...,0,0,1,0
1,50,13,0,0,...,0,0,1,0
2,38,40,0,0,...,0,0,1,0
3,53,40,0,0,...,0,0,1,0
4,28,40,0,0,...,0,0,1,0
5,37,40,0,0,...,0,0,1,0
6,49,16,0,0,...,0,0,1,0
7,52,45,0,0,...,0,0,0,1
8,31,50,0,0,...,0,0,0,1
9,42,40,0,0,...,0,0,0,1


In [11]:
# Label-based column slice (both end labels included) over the workclass
# dummy columns.
first_col, last_col = 'workclass_ ?', 'workclass_ Without-pay'
one_hot_encoded = data_dummies.loc[:, first_col:last_col]

# Same ten rows twice: once as a rendered table, once as the raw array.
first_rows = one_hot_encoded.head(n=10)
display(first_rows)
print(first_rows.values)

Unnamed: 0,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,...,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay
0,0,0,0,0,...,0,0,1,0
1,0,0,0,0,...,0,1,0,0
2,0,0,0,0,...,0,0,0,0
3,0,0,0,0,...,0,0,0,0
4,0,0,0,0,...,0,0,0,0
5,0,0,0,0,...,0,0,0,0
6,0,0,0,0,...,0,0,0,0
7,0,0,0,0,...,0,1,0,0
8,0,0,0,0,...,0,0,0,0
9,0,0,0,0,...,0,0,0,0


[[0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]]


- 훈련 데이터에서 타깃 속성 제외하기

In [12]:
# Get only the columns containing features
# that is all columns from 'age' to 'occupation_ Transport-moving'
# This range contains all the features but not the target

# The feature columns form one contiguous label range; the income_* columns
# sit after its right edge and are therefore left out of X.
feature_slice = slice('age', 'occupation_ Transport-moving')
features = data_dummies.loc[:, feature_slice]

# Convert to NumPy arrays; the '>50K' indicator column is the target.
X = features.values
y = data_dummies['income_ >50K'].values
print("X.shape: {}  y.shape: {}".format(X.shape, y.shape))

X.shape: (32561, 44)  y.shape: (32561,)


- 로지스틱 회귀 모델 적용 및 평가

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out a test set; random_state pins the shuffle so the split is repeatable.
split = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split

# fit() returns the estimator itself, so construction and training can chain.
logreg = LogisticRegression().fit(X_train, y_train)

# score() on the held-out split, printed to two decimals.
test_score = logreg.score(X_test, y_test)
print("Test score: {:.2f}".format(test_score))

Test score: 0.81
