# encoding_columns 레이블 인코딩 처리


- 범주형: 'workclass','education', 'education-num', 'marital-status', 'occupation','relationship', 'race', 'gender','native-country', 'income'
- 연속형: 'age', fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'

**아래 encoding_columns 컬럼들은 Label Encoding 처리,**  
**not_encoding_columns 컬럼들의 값들은 그대로 유지.**

##### apply 함수 이용
- DataFrame 이나 Series의 원소들을 일괄처리하는 함수
- DataFrame객체.apply(함수)
    - 컬럼, 행 단위 일괄처리
- Series객체.apply(함수)
    - 원소별 일괄처리

In [11]:
d = pd.DataFrame({
    "a":[1,2,3,],
    "b":[10,20,30]
})
d

Unnamed: 0,a,b
0,1,10
1,2,20
2,3,30


In [12]:
def func(x):
#     print(type(x),x)
    return x*2

In [13]:
d.apply(func)

Unnamed: 0,a,b
0,2,20
1,4,40
2,6,60


In [14]:
import pandas as pd

cols = ['age', 'workclass','fnlwgt','education', 'education-num', 'marital-status', 'occupation','relationship', 'race', 'gender','capital-gain','capital-loss', 'hours-per-week','native-country', 'income']
data  = pd.read_csv('data/adult.data', 
                    header=None, 
                    names=cols, na_values='?',   # '?'는 NA(결측치)
                    skipinitialspace=True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


##### 결측치 처리 

In [15]:
# data.isnull()   # cell별로 결측치 여부 확인
data.isnull().sum()   # True : 1 / False : 0 => 컬럼단위로 합계를 구함

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
gender               0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [16]:
data.dropna(inplace=True)

In [17]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [18]:
data.shape

(30162, 15)

In [19]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [20]:
encoding_columns = ['workclass','education','marital-status', 'occupation','relationship','race','gender','native-country', 'income']
not_encoding_columns = ['age','fnlwgt', 'education-num','capital-gain','capital-loss','hours-per-week']

In [21]:
# data(DateFrame_)를 복사
adult_df = data.copy()

In [22]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [23]:
from sklearn.preprocessing import LabelEncoder
dic={}
# key : column 이름, value: [classes_, labelEncoder객체]

for col in encoding_columns:
    le = LabelEncoder()
    # Labelencoding된 값을 원래 칼럼의 값으로 대체
    adult_df[col] = le.fit_transform(adult_df[col])
    # dic에 column명:[classes_, le] 추가
    dic[col] = [le.classes_, le]

In [24]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38,0
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38,0
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38,0
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38,0
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,2,257302,7,12,2,12,5,4,0,0,0,38,38,0
32557,40,2,154374,11,9,2,6,0,4,1,0,0,40,38,1
32558,58,2,151910,11,9,6,0,4,4,0,0,0,40,38,0
32559,22,2,201490,11,9,4,0,3,4,1,0,0,20,38,0


In [25]:
dic

{'workclass': [array(['Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc',
         'Self-emp-not-inc', 'State-gov', 'Without-pay'], dtype=object),
  LabelEncoder()],
 'education': [array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
         'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
         'Masters', 'Preschool', 'Prof-school', 'Some-college'],
        dtype=object),
  LabelEncoder()],
 'marital-status': [array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
         'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
        dtype=object),
  LabelEncoder()],
 'occupation': [array(['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
         'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
         'Other-service', 'Priv-house-serv', 'Prof-specialty',
         'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'],
        dtype=object),
  LabelEncoder()],
 'relationship': [array(['

In [26]:
dic['education'][0]

array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
       'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
       'Masters', 'Preschool', 'Prof-school', 'Some-college'],
      dtype=object)

In [27]:
# LabelEncoding 처리 함수
dic2={}
def le_func(X):
    # 범주형  Feature를 받아서 LabelEncoding 후 반환
    le = LabelEncoder()
    label = le.fit_transform(X)
    # Series.name : Series의 이름 - DataFrame의 컬럼에서 조회된 시리즈일 경우 컬럼명이 반환
    dic2[X.name] = [le.classes_, le]
    return label

In [28]:
encoding_df = data[encoding_columns].apply(le_func)
encoding_df

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,5,9,4,0,1,4,1,38,0
1,4,9,2,3,0,4,1,38,0
2,2,11,0,5,1,4,1,38,0
3,2,1,2,5,0,2,1,38,0
4,2,9,2,9,5,2,0,4,0
...,...,...,...,...,...,...,...,...,...
32556,2,7,2,12,5,4,0,38,0
32557,2,11,2,6,0,4,1,38,1
32558,2,11,6,0,4,4,0,38,0
32559,2,11,4,0,3,4,1,38,0


In [29]:
dic2

{'workclass': [array(['Federal-gov', 'Local-gov', 'Private', 'Self-emp-inc',
         'Self-emp-not-inc', 'State-gov', 'Without-pay'], dtype=object),
  LabelEncoder()],
 'education': [array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
         'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
         'Masters', 'Preschool', 'Prof-school', 'Some-college'],
        dtype=object),
  LabelEncoder()],
 'marital-status': [array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
         'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
        dtype=object),
  LabelEncoder()],
 'occupation': [array(['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
         'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
         'Other-service', 'Priv-house-serv', 'Prof-specialty',
         'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'],
        dtype=object),
  LabelEncoder()],
 'relationship': [array(['

In [30]:
# 인코딩된 DF과 원본의 연속형 Feature 들을 합친다.
result_df = encoding_df.join(data[not_encoding_columns])

In [31]:
result_df.shape

(30162, 15)

In [32]:
result_df.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
0,5,9,4,0,1,4,1,38,0,39,77516,13,2174,0,40
1,4,9,2,3,0,4,1,38,0,50,83311,13,0,0,13
2,2,11,0,5,1,4,1,38,0,38,215646,9,0,0,40
3,2,1,2,5,0,2,1,38,0,53,234721,7,0,0,40
4,2,9,2,9,5,2,0,4,0,28,338409,13,0,0,40


## Adult datast의 income 추론 모델링

### 데이터 분할
- X, y 나누기 : Feature, Label 분리
- train/validation/test set 나누기

In [49]:
# X, y 분리 (y=income, X=나머지)
y = adult_df['income']
X = adult_df.drop(columns='income')

In [50]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: income, dtype: int32

In [51]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,39,5,77516,9,13,4,0,1,4,1,2174,0,40,38
1,50,4,83311,9,13,2,3,0,4,1,0,0,13,38
2,38,2,215646,11,9,0,5,1,4,1,0,0,40,38
3,53,2,234721,1,7,2,5,0,2,1,0,0,40,38
4,28,2,338409,9,13,2,9,5,2,0,0,0,40,4


In [52]:
from sklearn.model_selection import train_test_split
# Train/Validation/Test set 분리

# test set 분리
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, stratify=y)
y_train.shape, y_test.shape

((24129,), (6033,))

In [53]:
# train/validation 분리
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25,
                                                 stratify=y_train)
y_train.shape, y_val.shape

((18096,), (6033,))

### 모델생성, 학습
- DecisionTreeClassifier

In [117]:
from sklearn.tree import DecisionTreeClassifier
# 모델생성
dt_clf = DecisionTreeClassifier(max_depth=7)  # max_depth:None, 1 ~

In [118]:
# 학습 - train set
dt_clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7)

### 검증
- 정확도(accuracy)
- train/validation 데이터셋을 기준으로 검증

In [119]:
# 추론(예측) - predict(Feature:X), predict_proba(X)
pred_train = dt_clf.predict(X_train)
pred_val = dt_clf.predict(X_val)

print(pred_train.shape, pred_val.shape)

(18096,) (6033,)


In [120]:
print(pred_train[:10])
print(y_train[:10].values)

[1 1 0 0 1 1 0 0 0 0]
[0 1 0 1 1 1 1 1 0 0]


In [121]:
# 정확도 평가
from sklearn.metrics import accuracy_score
acc_train = accuracy_score(y_train, pred_train) # (정답, 예측)
acc_val = accuracy_score(y_val, pred_val)

print('train set: {}, validation set: {}'.format(acc_train, acc_val))

train set: 0.8544429708222812, validation set: 0.8511519973479198


- 학습결과
    1. max_depth: None 
       - train: 1.0, validation: 0.8027515332338804
    2. max_depth: 1
       - train: 0.751105216622458, validation: 0.7510359688380573
    3. max_depth: 2
       - train: 0.8220048629531388, validation: 0.8234709099950274 
    4. max_depth: 3
       - train: 0.8357648099027409, validation: 0.8377258412066965
    5. max_depth: 4 
       - train set: 0.8400198938992043, validation set: 0.8407094314603016
    6. max_depth: 5 
       - train set: 0.8432802829354553, validation set: 0.8443560417702636
    7. max_depth: 6
       - train set: 0.8521220159151194, validation set: 0.8508204873197415
    8. max_depth: 7 
       - train set: 0.8544429708222812, validation set: 0.8508204873197415
    9. max_depth: 8 
       - train set: 0.8584770114942529, validation set: 0.849494447207028
    10. max_depth: 9 
       - train set: 0.8636715296198055, validation set: 0.8461793469252444
    11. max_depth: 10
       - train set: 0.8703580901856764, validation set: 0.8465108569534229

max_depth : 7 선택

### 최종평가
- test set으로 최종평가

In [122]:
pred_test = dt_clf.predict(X_test)
accuracy_score(y_test, pred_test)

0.849494447207028