In [35]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26048 entries, 21851 to 25716
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              26048 non-null  int64 
 1   age             26048 non-null  int64 
 2   workclass       24592 non-null  object
 3   fnlwgt          26048 non-null  int64 
 4   education       26048 non-null  object
 5   education.num   26048 non-null  int64 
 6   marital.status  26048 non-null  object
 7   occupation      24585 non-null  object
 8   relationship    26048 non-null  object
 9   race            26048 non-null  object
 10  sex             26048 non-null  object
 11  capital.gain    26048 non-null  int64 
 12  capital.loss    26048 non-null  int64 
 13  hours.per.week  26048 non-null  int64 
 14  native.country  25587 non-null  object
dtypes: int64(7), object(8)
memory usage: 3.2+ MB


In [3]:
# EDA
X_train.head()


Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [4]:
y_train.head()

Unnamed: 0,id,income
21851,21851,>50K
7632,7632,<=50K
27878,27878,<=50K
14121,14121,<=50K
32345,32345,<=50K


In [17]:
# 결측치 제거

# print(X_train.isnull().sum())
# print(len(X_train))

X_train['workclass'] = X_train['workclass'].fillna(X_train['workclass'].mode()[0])
X_train['occupation'] = X_train['occupation'].fillna(X_train['occupation'].mode()[0])
X_train['native.country'] = X_train['native.country'].fillna(X_train['native.country'].mode()[0])

X_train.isnull().sum()


X_test['workclass'] = X_test['workclass'].fillna(X_test['workclass'].mode()[0])
X_test['occupation'] = X_test['occupation'].fillna(X_test['occupation'].mode()[0])
X_test['native.country'] = X_test['native.country'].fillna(X_test['native.country'].mode()[0])

X_test.isnull().sum()

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [23]:
# 전처리 수행
from sklearn.preprocessing import LabelEncoder

drop_id_y_train = y_train.drop('id', axis = 1)
drop_id_y_test = y_test.drop('id', axis = 1)

drop_id_y_train.head()

drop_id_x_train = X_train.drop('id', axis = 1)
drop_id_x_test = X_test.drop('id', axis = 1)


drop_id_x_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,36,Private,241998,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,50,United-States
7632,53,Private,103950,Masters,14,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States
27878,19,Private,203061,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,25,United-States
14121,20,Private,102607,HS-grad,9,Never-married,Handlers-cleaners,Own-child,White,Male,0,0,30,United-States
32345,54,State-gov,138852,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States


In [29]:
pre_processing = LabelEncoder()

# 범주형 변수를 수치형으로 변환
drop_id_y_train = pre_processing.fit_transform(drop_id_y_train)
drop_id_y_test = pre_processing.fit_transform(drop_id_y_test)

# 범주형 컬럼 확인하기
col = ["workclass", "education","marital.status","occupation","relationship","race","sex","native.country"]

for c in col:
    drop_id_x_train[c] = pre_processing.fit_transform(drop_id_x_train[c])
    drop_id_x_test[c] = pre_processing.fit_transform(drop_id_x_test[c])


drop_id_x_train

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,36,3,241998,9,13,2,2,0,4,1,0,0,50,38
7632,53,3,103950,12,14,0,9,1,4,0,0,0,40,38
27878,19,3,203061,15,10,4,12,1,4,0,0,0,25,38
14121,20,3,102607,11,9,4,5,3,4,1,0,0,30,38
32345,54,6,138852,11,9,2,9,0,4,1,0,0,40,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2669,45,3,187370,12,14,0,3,4,4,1,7430,0,70,38
17536,36,3,174308,1,7,0,13,1,4,1,0,0,40,38
6201,47,3,275361,7,12,6,7,3,4,0,0,0,35,38
27989,50,5,196504,10,16,2,9,0,4,1,0,0,23,38


In [30]:
# 학습
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(drop_id_x_train, drop_id_y_train)


In [34]:
# 학습 결과 확인하기
from sklearn.metrics import accuracy_score

pred = model.predict(drop_id_x_test)

# 정확도
accuracy = accuracy_score(pred, drop_id_y_test)
print(accuracy)

# 모델 결과 저장하기
result = pd.DataFrame({"id": y_test['id'], "income": pred})

result.head()

0.8578228159066482


Unnamed: 0,id,income
20901,20901,0
14170,14170,0
1776,1776,1
30428,30428,0
8602,8602,0
