# 빅데이터 분석기사 실기 준비 놀이터
- https://www.kaggle.com/datasets/agileteam/bigdatacertificationkr

## T2-3. Adult Census Income (Classification) / 성인 인구소득 예측

### 3. 성인 인구조사 소득 예측
- 데이터셋 : adult.csv
- https://www.kaggle.com/code/agileteam/t2-3-adult-census-income-tutorial/notebook

#### 0. 시험 환경 세팅

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("./data/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

#### 1. 라이브러리 및 데이터 호출

In [2]:
import numpy as np
import pandas as pd

X_train.shape, X_test.shape, y_train.shape

((26048, 15), (6513, 15), (26048, 2))

In [3]:
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

#### 2. EDA

display(X_train.info(), X_train.head(), X_train.isnull().sum())

display(X_test.info(), X_test.head(), X_test.isnull().sum())

display(y_train.head(), y_train.income.value_counts())

#### 3. Preprocessing
- 타겟 컬럼 : 수치형으로 변경
- 불필요컬럼 제거 : id
- 결측치 컬럼 3개 : 최빈값으로 대체
- 수치형 컬럼 : 정규화(MinMaxScaler)
- 범주형 컬럼 : LabelEncoder

> 타겟 컬럼 : 수치형으로 변경

In [4]:
y = (y_train['income'] == '>50K').astype(int)
y.value_counts()

0    19756
1     6292
Name: income, dtype: int64

> 불필요컬럼 id 제거

In [5]:
print(X_train.shape, X_test.shape)

X_train.drop('id', axis=1, inplace=True)
X_test.drop('id', axis=1, inplace=True)

print(X_train.shape, X_test.shape)

(26048, 15) (6513, 15)
(26048, 14) (6513, 14)


> 결측치 컬럼 3개 : 최빈값으로 대체

In [6]:
train_isnull = X_train.isnull().sum()
null_col = list(train_isnull[train_isnull>0].index)

In [7]:
for c in null_col:
    print(X_train[c].nunique())
    print(X_train[c].unique())
    print(X_train[c].mode()[0])
    print('='*100)

8
['Private' 'State-gov' 'Self-emp-not-inc' 'Self-emp-inc' 'Local-gov'
 'Federal-gov' nan 'Never-worked' 'Without-pay']
Private
14
['Craft-repair' 'Prof-specialty' 'Tech-support' 'Handlers-cleaners'
 'Sales' 'Machine-op-inspct' 'Exec-managerial' 'Other-service'
 'Adm-clerical' 'Transport-moving' 'Farming-fishing' nan 'Protective-serv'
 'Armed-Forces' 'Priv-house-serv']
Exec-managerial
41
['United-States' nan 'England' 'Mexico' 'Jamaica' 'Philippines'
 'El-Salvador' 'Vietnam' 'Poland' 'Canada' 'India' 'Italy' 'Cuba' 'China'
 'Ecuador' 'Haiti' 'Portugal' 'Dominican-Republic' 'Japan' 'Germany'
 'Columbia' 'Yugoslavia' 'Iran' 'Taiwan' 'Puerto-Rico' 'Peru' 'Nicaragua'
 'Ireland' 'France' 'South' 'Guatemala' 'Laos' 'Trinadad&Tobago' 'Greece'
 'Thailand' 'Honduras' 'Scotland' 'Hong' 'Outlying-US(Guam-USVI-etc)'
 'Cambodia' 'Hungary' 'Holand-Netherlands']
United-States


In [8]:
for c in null_col:
    print(c)
    print(X_train[c].value_counts())
    print('='*100)

workclass
Private             18160
Self-emp-not-inc     2049
Local-gov            1648
State-gov            1037
Self-emp-inc          909
Federal-gov           770
Without-pay            12
Never-worked            7
Name: workclass, dtype: int64
occupation
Exec-managerial      3323
Prof-specialty       3306
Craft-repair         3296
Adm-clerical         3037
Sales                2898
Other-service        2624
Machine-op-inspct    1584
Transport-moving     1257
Handlers-cleaners    1080
Farming-fishing       786
Tech-support          746
Protective-serv       521
Priv-house-serv       119
Armed-Forces            8
Name: occupation, dtype: int64
native.country
United-States                 23381
Mexico                          516
Philippines                     158
Germany                         108
Canada                           88
Puerto-Rico                      87
El-Salvador                      76
India                            73
Cuba                             73
England

In [9]:
for c in null_col:
    X_train[c] = X_train[c].fillna(X_train[c].mode()[0])
    X_test[c] = X_test[c].fillna(X_test[c].mode()[0])
    
X_train.isnull().sum().sum(), X_train.isnull().sum().sum()    

(0, 0)

> 수치형 컬럼 : 정규화

In [10]:
int_col = X_train.select_dtypes(exclude='object').columns

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train[int_col] = scaler.fit_transform(X_train[int_col])
X_test[int_col] = scaler.transform(X_test[int_col])

print(X_train.shape, X_test.shape)

(26048, 14) (6513, 14)


> 범주형 컬럼 : LabelEncoder

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
obj_col = X_train.select_dtypes(include='object').columns

In [14]:
le = LabelEncoder()
X_train[obj_col] = X_train[obj_col].apply(le.fit_transform)
X_test[obj_col] = X_test[obj_col].apply(le.fit_transform)

print(X_train.shape, X_test.shape)

(26048, 14) (6513, 14)


- 가변수는 X_train과 X_test의 유일값 차이로 컬럼 수가 달라지므로 제외

In [15]:
'''X_train = pd.get_dummies(X_train, columns=obj_col)
X_test = pd.get_dummies(X_test, columns=obj_col)

print(X_train.shape, X_test.shape)'''

'X_train = pd.get_dummies(X_train, columns=obj_col)\nX_test = pd.get_dummies(X_test, columns=obj_col)\n\nprint(X_train.shape, X_test.shape)'

#### 4. Modeling

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier, XGBRFClassifier

In [17]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y, test_size=0.2, shuffle=True, random_state=156)
X_tr.shape, X_val.shape, y_tr.shape, y_val.shape

((20838, 14), (5210, 14), (20838,), (5210,))

In [18]:
rf = RandomForestClassifier(random_state=156)
gb = GradientBoostingClassifier(random_state=156)
xgb = XGBClassifier(random_state=156)
xgbrf = XGBRFClassifier(random_state=156)

models = [rf, gb, xgb, xgbrf]

In [19]:
for model in models:
    model.fit(X_tr, y_tr)
    pred = model.predict(X_val)
    pred_proba = model.predict_proba(X_val)[:,1]
    
    name = str(model).split('(')[0]
    print(name)
    print('정확도 :', accuracy_score(y_val, pred))
    print('auc :', roc_auc_score(y_val, pred_proba))
    print('='*100)

RandomForestClassifier
정확도 : 0.8510556621880998
auc : 0.9003071391237654
GradientBoostingClassifier
정확도 : 0.8658349328214971
auc : 0.9155316427120175
XGBClassifier
정확도 : 0.8690978886756238
auc : 0.9212403599692556
XGBRFClassifier
정확도 : 0.8585412667946257
auc : 0.8999598461632897


In [20]:
xgb = XGBClassifier(random_state=156)

xgb.fit(X_train, y)
pred = xgb.predict(X_test)

In [21]:
out_data = pd.DataFrame({'id' : X_test_copy.id,
                         'income' : pred})

In [22]:
# out_data.to_csv('123456.csv', index=False)

In [23]:
# 가채점
y_test = (y_test['income'] == '>50K').astype(int)
pred_proba = xgb.predict_proba(X_test)[:,1]

print('정확도 :', accuracy_score(y_test, pred))
print('auc : ', roc_auc_score(y_test, pred_proba))

정확도 : 0.8691847075080608
auc :  0.9216985926820298


#### 5. 다른 모델 평가

In [24]:
for model in models:
    model.fit(X_train, y)
    pred = model.predict(X_test)
    pred_proba = model.predict_proba(X_test)[:,1]
    
    name = str(model).split('(')[0]
    
    print(name)
    print('정확도 :', accuracy_score(y_test, pred))
    print('auc : ', roc_auc_score(y_test, pred_proba))
    print('='*100)

RandomForestClassifier
정확도 : 0.8564409642253954
auc :  0.9055850282134663
GradientBoostingClassifier
정확도 : 0.8651926915399969
auc :  0.9185953584985557
XGBClassifier
정확도 : 0.8691847075080608
auc :  0.9216985926820298
XGBRFClassifier
정확도 : 0.8541378780899739
auc :  0.9044136504589013
