# 성인 인구조사 소득 예측

- age: 나이
- workclass: 고용 형태
- fnlwgt: 사람의 대표성을 나타내는 가중치(final weight)
- education: 교육 수준
- education.num: 교육 수준 수치
- marital.status: 결혼 상태
- occupation: 업종
- relationship: 가족 관계
- race: 인종
- sex: 성별
- capital.gain: 양도 소득
- capital.loss: 양도 손실
- hours.per.week: 주당 근무 시간
- native.country: 국적
- income: 수익 (예측해야 하는 값)

In [92]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("data/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

## 사용자 코딩
### 랜덤포레스트, 앙상블로 코드짜보기! 

## 라이브러리 불러오기

In [93]:
import pandas as pd
import numpy as np

## 데이터 불러오기(생략)

In [94]:
# 시험환경에서는 아래와 같이 제공된다고 함
# import pandas as pd
# X_test = pd.read_csv("data/X_test.csv")
# X_train = pd.read_csv("data/X_train.csv")
# y_train = pd.read_csv("data/y_train.csv")

## EDA

In [95]:
# 데이터 크기 확인
X_train.shape, X_test.shape, y_train.shape # X와 y의 행 개수는 맞구나~

((26048, 15), (6513, 15), (26048, 2))

In [102]:
# 데이터 확인
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [103]:
# 타겟 수 확인
y_train['income'].value_counts()

<=50K    19756
>50K      6292
Name: income, dtype: int64

In [104]:
# 데이터 타입 확인
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21851 to 25716
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26048 non-null  int64 
 1   age             26048 non-null  int64 
 2   workclass       24592 non-null  object
 3   fnlwgt          26048 non-null  int64 
 4   education       26048 non-null  object
 5   education.num   26048 non-null  int64 
 6   marital.status  26048 non-null  object
 7   occupation      24585 non-null  object
 8   relationship    26048 non-null  object
 9   race            26048 non-null  object
 10  sex             26048 non-null  object
 11  capital.gain    26048 non-null  int64 
 12  capital.loss    26048 non-null  int64 
 13  hours.per.week  26048 non-null  int64 
 14  native.country  25587 non-null  object
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


## 데이터 칼럼 선택하기. 

# 소득예측

삭제할 목록 : id, education 일단 이것 두개만 빼면 될듯 
포함할 목록 : age, workclass,fnlwgt, education.num, marital.status, occupation, race, relationship 등등

In [105]:
# 불필요한 데이터 삭제. 

x_tr = X_train.drop(['id'], axis = 1)
x_ts = X_test.drop(['id'], axis = 1)
y_tr = y_train['income']

## 결측치 

In [106]:
# 결측치 검출

print(x_tr.isna().sum()) # workclass, occupation, native.country
print(x_ts.isna().sum()) # workclass, occupation, native.country
print(y_tr.isna().sum()) # x

age                  0
workclass         1456
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1463
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     461
dtype: int64
age                 0
workclass         380
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        380
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    122
dtype: int64
0


In [107]:
# 결측치 채우기  - 현재 모든 결측치가 범주형이다. - 범주형 데이터의 결측치는 ,, 빈도수로 넣나?

x_tr[['workclass', 'occupation', 'native.country']].describe()

Unnamed: 0,workclass,occupation,native.country
count,24592,24585,25587
unique,8,14,41
top,Private,Exec-managerial,United-States
freq,18160,3323,23381


여기서 확인할 수 있는것. 범주형 결측값의 경우 전체 행 수와 빈도수 1등이 많이 차이 나는게 아닌것 같을 때 넣어야함.   
그런게 아니라면 다른 방법으로 전처리해야함. 

In [108]:
def data_fillna(df) :
    df['workclass'].fillna(df['workclass'].describe().top, inplace = True)
    df['occupation'].fillna('null', inplace = True)
    df['native.country'].fillna(df['native.country'].describe().top, inplace = True)
    return df

data_fillna(x_tr)
data_fillna(x_ts)

print(x_tr.isna().sum()) # workclass, occupation, native.country
print(x_ts.isna().sum()) # workclass, occupation, native.country

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64


## 전처리 

In [109]:
# 피처 구분
# Numeric features
numeric_features = [
                    'age',
                    'fnlwgt', 
                    'education.num',
                    'capital.gain', 
                    'capital.loss', 
                    'hours.per.week',                     
                   ]

# Categorical features
cat_features = [
                 'workclass',                         
                 'marital.status', 
                 'occupation', 
                 'relationship', 
                 'race',
                 'education',
                 'sex',
                 'native.country'
]

In [110]:
# 범주형 

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

x_tr[cat_features] = x_tr[cat_features].apply(le.fit_transform)
x_ts[cat_features] = x_ts[cat_features].apply(le.fit_transform)

In [111]:
x_tr[numeric_features].describe() #이게 정규성을 확보하지 못한다고 생각하는건가?

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,26048.0,26048.0,26048.0,26048.0,26048.0,26048.0
mean,38.610335,189574.1,10.082118,1081.193796,88.477695,40.420224
std,13.628346,104384.8,2.574608,7404.962675,404.689981,12.354707
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,118247.2,9.0,0.0,0.0,40.0
50%,37.0,178575.5,10.0,0.0,0.0,40.0
75%,48.0,236596.8,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [112]:
# 스케일링 (수치형) _ 왜 굳이 ? 

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

x_tr[numeric_features] = scaler.fit_transform(x_tr[numeric_features]) #이 방법대로 해도 될 것 같은디
x_ts[numeric_features] = scaler.transform(X_ts[numeric_features])

In [113]:
# 최종 x값

x_tr.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,0.260274,3,0.156011,9,0.8,2,2,0,4,1,0.0,0.0,0.5,38
7632,0.493151,3,0.062255,12,0.866667,0,9,1,4,0,0.0,0.0,0.397959,38
27878,0.027397,3,0.129566,15,0.6,4,12,1,4,0,0.0,0.0,0.244898,38
14121,0.041096,3,0.061343,11,0.533333,4,5,3,4,1,0.0,0.0,0.295918,38
32345,0.506849,6,0.085958,11,0.533333,2,9,0,4,1,0.0,0.0,0.397959,38


In [114]:
# y도 수치형으로 바꿔야함. 
y_tr.unique()

array(['>50K', '<=50K'], dtype=object)

In [115]:
# 정량적 자료로 바꾸기! 

y_tr = (y_tr != '<=50K').astype(int) # 가로속에 코드로 bool형을 만들고 int로 바꾸기. 

# >50K가 1, <=50K가 0

y_tr.value_counts()

0    19756
1     6292
Name: income, dtype: int64

## 검증용 데이터 분리

In [116]:
from sklearn.model_selection import train_test_split

a_tr, a_val, b_tr, b_val = train_test_split(x_tr, y_tr, test_size = 0.1) # random_state=2021
a_tr.shape, a_val.shape, b_tr.shape, b_val.shape

((23443, 14), (2605, 14), (23443,), (2605,))

In [117]:
b_tr.head()

6250     0
31938    0
14263    1
28074    0
9050     1
Name: income, dtype: int64

## 모델 & 평가

In [118]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


model = RandomForestClassifier(random_state = 2022)
model.fit(a_tr, b_tr)
pred = model.predict(a_val)
print('예측 정확도 = ', accuracy_score(pred,b_val))

예측 정확도 =  0.8479846449136277


In [119]:
# test데이터 예측 (pop을 활용하면 값을 넘겨주고 삭제 됨)
X_test_id = X_test.pop('id')
pred = model.predict(X_test)

ValueError: could not convert string to float: 'Private'

In [None]:
# csv생성
output = pd.DataFrame({'id': X_test_id, 'income':pred})
output.to_csv("000000.csv", index=False)
output.head()

## 채점 (수험자는 확인 불가)

In [None]:
y_test = (y_test['income'] != '<=50K').astype(int)
from sklearn.metrics import accuracy_score
print('accuracy score:', (accuracy_score(y_test, pred)))

## 알것들

In [None]:
# 라벨인코딩
from sklearn.preprocessing import LabelEncoder

all_df = pd.concat([X_train.assign(ind="train"), X_test.assign(ind="test")]) #이거 그냥 각 데이터베이스에 ind열을 만들고 거기에 각각 df이름 넣고 합치는 것 
le = LabelEncoder()
all_df[cat_features] = all_df[cat_features].apply(le.fit_transform)

X_train = all_df[all_df['ind'] == 'train']
X_train = X_train.drop('ind',axis=1)
X_train

X_test = all_df[all_df['ind'] == 'test'] # 그리고 때는거지. 
X_test = X_test.drop('ind',axis=1)
X_test

In [None]:
# 의사결정나무
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(random_state = 2022)
model.fit(X_tr, y_tr)
pred = model.predict(X_val)
print('accuracy score:', (accuracy_score(y_val, pred)))