# 성인 인구조사 소득 예측

- age: 나이
- workclass: 고용 형태
- fnlwgt: 사람의 대표성을 나타내는 가중치(final weight)
- education: 교육 수준
- education.num: 교육 수준 수치
- marital.status: 결혼 상태
- occupation: 업종
- relationship: 가족 관계
- race: 인종
- sex: 성별
- capital.gain: 양도 소득
- capital.loss: 양도 손실
- hours.per.week: 주당 근무 시간
- native.country: 국적
- income: 수익 (예측해야 하는 값)

In [31]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name

    if null_name != "":
        df[df == null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])


    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test

df = pd.read_csv("adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

In [None]:
# age: 나이
# workclass: 고용 형태
# fnlwgt: 사람의 대표성을 나타내는 가중치(final weight)
# education: 교육 수준
# education.num: 교육 수준 수치
# marital.status: 결혼 상태
# occupation: 업종
# relationship: 가족 관계
# race: 인종
# sex: 성별
# capital.gain: 양도 소득
# capital.loss: 양도 손실
# hours.per.week: 주당 근무 시간
# native.country: 국적
# income: 수익 (예측해야 하는 값)

In [5]:
# 성인 인구조사 소득 예측
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [14]:
X_train.shape, X_test.shape, y_train.shape

((26048, 15), (6513, 15), (26048, 2))

In [3]:
# age,	workclass,	fnlwgt,	education,	education.num,	marital.status,
# occupation,	relationship,	race,	sex,	capital.gain,	capital.loss,
# hours.per.week,	native.country
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [4]:
# id, income
y_train.head()

Unnamed: 0,id,income
21851,21851,>50K
7632,7632,<=50K
27878,27878,<=50K
14121,14121,<=50K
32345,32345,<=50K


In [6]:
X_train.isnull().sum()

id                   0
age                  0
workclass         1456
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1463
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     461
dtype: int64

In [18]:
X_test.isnull().sum()

id                  0
age                 0
workclass         380
fnlwgt              0
education           0
education.num       0
marital.status      0
occupation        380
relationship        0
race                0
sex                 0
capital.gain        0
capital.loss        0
hours.per.week      0
native.country    122
dtype: int64

In [7]:
y_train.isnull().sum()

id        0
income    0
dtype: int64

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26048 entries, 21851 to 25716
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26048 non-null  int64 
 1   age             26048 non-null  int64 
 2   workclass       24592 non-null  object
 3   fnlwgt          26048 non-null  int64 
 4   education       26048 non-null  object
 5   education.num   26048 non-null  int64 
 6   marital.status  26048 non-null  object
 7   occupation      24585 non-null  object
 8   relationship    26048 non-null  object
 9   race            26048 non-null  object
 10  sex             26048 non-null  object
 11  capital.gain    26048 non-null  int64 
 12  capital.loss    26048 non-null  int64 
 13  hours.per.week  26048 non-null  int64 
 14  native.country  25587 non-null  object
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


In [13]:
y_train['income'].value_counts()

income
<=50K    19756
>50K      6292
Name: count, dtype: int64

In [15]:
num = ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
cat = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']


In [16]:
X_train[num].describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,26048.0,26048.0,26048.0,26048.0,26048.0,26048.0
mean,38.610335,189574.1,10.082118,1081.193796,88.477695,40.420224
std,13.628346,104384.8,2.574608,7404.962675,404.689981,12.354707
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,118247.2,9.0,0.0,0.0,40.0
50%,37.0,178575.5,10.0,0.0,0.0,40.0
75%,48.0,236596.8,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [17]:
X_train[cat].describe()

Unnamed: 0,workclass,education,marital.status,occupation,relationship,race,sex,native.country
count,24592,26048,26048,24585,26048,26048,26048,25587
unique,8,16,7,14,6,5,2,41
top,Private,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
freq,18160,8408,11987,3323,10558,22270,17400,23381


In [32]:
# mode() 함수는 데이터의 최빈값(가장 자주 나타나는 값)을 반환
# workclass         1456
# occupation        1463
# native.country     461
def fill_na(df):
  df['workclass'] = df['workclass'].fillna(df['workclass'].mode()[0])
  df['occupation'] = df['occupation'].fillna(("null"))
  df['native.country'] = df['native.country'].fillna(df['native.country'].mode()[0])
  return df

X_train = fill_na(X_train)
X_test = fill_na(X_test)

In [24]:
X_train.isnull().sum()

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [25]:
X_test.isnull().sum()

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [33]:
# 라벨인코딩, category에 적용
from sklearn.preprocessing import LabelEncoder

all_df = pd.concat([X_train.assign(ind="train"), X_test.assign(ind="test")])
le = LabelEncoder()
all_df[cat] = all_df[cat].apply(le.fit_transform)
all_df.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,ind
21851,21851,36,3,241998,9,13,2,2,0,4,1,0,0,50,38,train
7632,7632,53,3,103950,12,14,0,9,1,4,0,0,0,40,38,train
27878,27878,19,3,203061,15,10,4,12,1,4,0,0,0,25,38,train
14121,14121,20,3,102607,11,9,4,5,3,4,1,0,0,30,38,train
32345,32345,54,6,138852,11,9,2,9,0,4,1,0,0,40,38,train


In [34]:
X_train = all_df[all_df['ind'] == "train"]
X_train = X_train.drop('ind', axis=1)
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,3,241998,9,13,2,2,0,4,1,0,0,50,38
7632,7632,53,3,103950,12,14,0,9,1,4,0,0,0,40,38
27878,27878,19,3,203061,15,10,4,12,1,4,0,0,0,25,38
14121,14121,20,3,102607,11,9,4,5,3,4,1,0,0,30,38
32345,32345,54,6,138852,11,9,2,9,0,4,1,0,0,40,38


In [35]:
X_test = all_df[all_df["ind"] == "test"]
X_test = X_test.drop("ind", axis=1)
X_test.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
20901,20901,58,3,114495,11,9,2,3,0,4,1,0,0,40,38
14170,14170,46,3,247043,11,9,2,13,0,4,1,0,0,40,38
1776,1776,67,1,103315,12,14,4,3,2,4,0,15831,0,72,38
30428,30428,18,3,165532,15,10,4,11,3,4,1,0,0,15,38
8602,8602,26,6,58039,15,10,2,7,0,4,1,0,0,40,38


In [37]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train[num] = scaler.fit_transform(X_train[num])
X_test[num] = scaler.fit_transform(X_test[num])

X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,0.260274,3,0.156011,9,0.8,2,2,0,4,1,0.0,0.0,0.5,38
7632,7632,0.493151,3,0.062255,12,0.866667,0,9,1,4,0,0.0,0.0,0.397959,38
27878,27878,0.027397,3,0.129566,15,0.6,4,12,1,4,0,0.0,0.0,0.244898,38
14121,14121,0.041096,3,0.061343,11,0.533333,4,5,3,4,1,0.0,0.0,0.295918,38
32345,32345,0.506849,6,0.085958,11,0.533333,2,9,0,4,1,0.0,0.0,0.397959,38


In [38]:
# target값 변경
y = (y_train['income'] != '<=50K').astype(int)
y[:5]

21851    1
7632     0
27878    0
14121    0
32345    0
Name: income, dtype: int64

In [39]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.15, random_state=2024)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((22140, 15), (3908, 15), (22140,), (3908,))

In [40]:
X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
31148,31148,0.191781,3,0.030899,11,0.533333,0,0,4,4,0,0.0,0.0,0.5,38
4101,4101,0.232877,3,0.062634,8,0.666667,4,9,1,4,0,0.01639,0.0,0.193878,38
7529,7529,0.465753,4,0.198768,12,0.866667,2,3,0,4,1,0.0,0.0,0.5,38
29860,29860,0.520548,3,0.12065,15,0.6,2,13,0,4,1,0.0,0.0,0.846939,38
16243,16243,0.082192,3,0.090398,15,0.6,2,11,5,4,0,0.0,0.0,0.397959,38


In [61]:
# X_train = X_train.drop('id', axis=1)
# X_val = X_val.drop('id', axis=1)

In [58]:
y_val

29058    0
29354    0
6841     1
24982    0
28045    1
        ..
3666     1
1467     0
25133    1
21312    0
4051     0
Name: income, Length: 3908, dtype: int64

In [55]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
31148,0.191781,3,0.030899,11,0.533333,0,0,4,4,0,0.0,0.0,0.5,38
4101,0.232877,3,0.062634,8,0.666667,4,9,1,4,0,0.01639,0.0,0.193878,38
7529,0.465753,4,0.198768,12,0.866667,2,3,0,4,1,0.0,0.0,0.5,38
29860,0.520548,3,0.12065,15,0.6,2,13,0,4,1,0.0,0.0,0.846939,38
16243,0.082192,3,0.090398,15,0.6,2,11,5,4,0,0.0,0.0,0.397959,38


In [62]:
y_val.head()

29058    0
29354    0
6841     1
24982    0
28045    1
Name: income, dtype: int64

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

model = RandomForestClassifier(random_state = 2024)
model.fit(X_train, y_train)
pred = model.predict(X_val)
print('accuracy score:', (accuracy_score(y_val, pred)))

accuracy score: 0.8462128966223132


In [70]:
X_val.shape, pred.shape

((3908, 14), (3908,))

In [71]:
# csv생성
output = pd.DataFrame({'id': X_val.index, 'income':pred})
output.to_csv("000000.csv", index=False)
output.head()

Unnamed: 0,id,income
0,29058,0
1,29354,0
2,6841,1
3,24982,0
4,28045,1


## 채점 (수험자는 확인 불가)

In [None]:
y_test = (y_test['income'] != '<=50K').astype(int)
from sklearn.metrics import accuracy_score
print('accuracy score:', (accuracy_score(y_test, pred)))

accuracy score: 0.8569015814524796
