# TODO: adult dataset에 one-hot encoding 적용
- 범주형: 'workclass','education', 'education-num', 'marital-status', 'occupation','relationship', 'race', 'gender', 'native-country', 'income'
- 연속형: 'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week'
- **위 Feature 중 'age', 'workclass','education', 'occupation', 'gender', 'hours-per-week' 만 사용한다.**
- 범주형 Feature 중 **income은 출력 데이터이므로 Label Encoding 처리**를 한다.
- 나머지 범주형 Feature들은 One-hot encoding 처리한다.

1. DataSet 읽기

2. 결측치 처리 (제거)

3. One hot encoding

4. 모델링
    - Train/Test set 분리
    - 모델 생성
    - 학습
    - 검증

##### 데이터 로딩

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column names to assign to the data file, listed in file order.
cols = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'gender',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

In [3]:
# Column names to assign to the data file, listed in file order.
cols = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# header=None: no row of the file is used as a header, so `cols` supplies
# the names. '?' is read as NA (missing value). skipinitialspace=True makes
# the parser skip spaces that follow each delimiter.
df = pd.read_csv(
    'data/adult.data',
    names=cols,
    header=None,
    skipinitialspace=True,
    na_values='?',
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


##### 결측치 제거 

In [4]:
# Per-column count of missing values in the freshly loaded data.
df.isna().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
gender               0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [5]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [6]:
# Re-count missing values per column to confirm none remain.
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
gender            0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [7]:
# (rows, columns) of the data after rows with missing values were dropped.
df.shape

(30162, 15)

##### 필요한 Feature들만 추출

In [8]:
# Feature columns used in this exercise: two numeric ones ('age',
# 'hours-per-week') and four categorical ones that get one-hot encoded.
c = ['age', 'workclass','education', 'occupation', 'gender', 'hours-per-week']
# Select through `c` so the column list is written in exactly one place
# (previously `c` was defined but unused and the list was duplicated).
X = df[c]
X.head()

Unnamed: 0,age,workclass,education,occupation,gender,hours-per-week
0,39,State-gov,Bachelors,Adm-clerical,Male,40
1,50,Self-emp-not-inc,Bachelors,Exec-managerial,Male,13
2,38,Private,HS-grad,Handlers-cleaners,Male,40
3,53,Private,11th,Handlers-cleaners,Male,40
4,28,Private,Bachelors,Prof-specialty,Female,40


##### income(출력데이터): LabelEncoding

In [9]:
# Label-encode only the income column and keep it separately as the target.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df['income'])            # learn the distinct income classes
y = le.transform(df['income'])  # map each row to its integer class label
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])

##### 1. pandas get_dummies() 사용해 변환

In [10]:
# One-hot encode X with pandas. With no `columns` argument, get_dummies
# encodes the string-valued columns and passes numeric columns through.
X1 = pd.get_dummies(X)
X1.shape

(30162, 41)

##### 2. scikit-learn OneHotEncoder 사용

In [11]:
from sklearn.preprocessing import OneHotEncoder

In [12]:
# Ask for a dense ndarray so the result can be joined with the numeric
# columns via NumPy. scikit-learn 1.2 renamed `sparse` to `sparse_output`
# and 1.4 removed the old name, so try the new keyword first and fall back
# to the old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
# Only the categorical columns are passed to the encoder.
r = ohe.fit_transform(X[['workclass','education', 'occupation', 'gender']])
r.shape

(30162, 39)

In [13]:
# Names of the generated one-hot columns, formatted as <column>_<category>.
ohe.get_feature_names_out()

array(['workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov',
       'workclass_Without-pay', 'education_10th', 'education_11th',
       'education_12th', 'education_1st-4th', 'education_5th-6th',
       'education_7th-8th', 'education_9th', 'education_Assoc-acdm',
       'education_Assoc-voc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'occupation_Adm-clerical',
       'occupation_Armed-Forces', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Priv-house-serv',
       'occupation_Prof-specialty', 'occupation_Protective-serv',
       'occupation_Sales', 'occupation_Tech-support',
   

In [14]:
# Pull the two numeric columns out as a NumPy array.
numeric_columns = ['age', 'hours-per-week']
c_v = df[numeric_columns].to_numpy()

In [15]:
# Show the container types of the encoded block and the numeric block.
type(r), type(c_v)

(numpy.ndarray, numpy.ndarray)

###### onehot encoding 된 feature들과 나머지 feature 합치기

In [16]:
# Join the numeric columns and the one-hot columns side by side.
# For 2-D arrays hstack is the same as np.concatenate(..., axis=1).
X2 = np.hstack((c_v, r))
X2.shape

(30162, 41)

### 모델 학습

In [17]:
# Two candidate inputs for modelling:
#   X1, y  -> X1 was built with pandas get_dummies
#   X2, y  -> X2 was built with scikit-learn OneHotEncoder

##### train, test set 나누기

In [19]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of the rows as the test set, stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y,
    stratify=y,
    test_size=0.2,
    random_state=0,
)
# Step 2: split the remaining rows again; 25% of them become the
# validation set, stratified on the remaining labels.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    stratify=y_train,
    test_size=0.25,
    random_state=0,
)

##### 모델생성
- DecisionTreeClassifier
- LogisticRegression

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [23]:
# Decision tree: create and train in one step (fit returns the estimator).
tree = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Logistic regression: raised iteration cap, fixed seed.
lr = LogisticRegression(random_state=0, max_iter=2000)
# Kept as the cell's final expression so the fitted estimator is displayed.
lr.fit(X_train, y_train)

LogisticRegression(max_iter=2000, random_state=0)

##### 추론

In [25]:
# Predictions of the decision tree on the training and validation splits.
pred_train_tree, pred_val_tree = tree.predict(X_train), tree.predict(X_val)
# Predictions of the logistic regression on the same two splits.
pred_train_lr, pred_val_lr = lr.predict(X_train), lr.predict(X_val)

##### 평가

In [28]:
from sklearn.metrics import accuracy_score

# Accuracy of each model on the training split.
acc_train_tree, acc_train_lr = (
    accuracy_score(y_train, pred) for pred in (pred_train_tree, pred_train_lr)
)
# Accuracy of each model on the validation split.
acc_val_tree, acc_val_lr = (
    accuracy_score(y_val, pred) for pred in (pred_val_tree, pred_val_lr)
)

In [30]:
# Print a heading and the Tree/LR accuracies for each split, with a blank
# line between the two sections.
sections = (
    ('train 정확도', acc_train_tree, acc_train_lr),
    ('validation 정확도', acc_val_tree, acc_val_lr),
)
for heading, tree_acc, lr_acc in sections:
    print(heading)
    print(f"Tree: {tree_acc}, LR: {lr_acc}")

train 정확도
Tree: 0.9472259062776304, LR: 0.8081896551724138
validation 정확도
Tree: 0.7593237195425162, LR: 0.8032487982761478
