In [59]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into exam-style train/test features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset, including the ``target`` column. It is not modified.
    target : str
        Name of the label column.
    id_name : str, optional
        Name of an existing identifier column. When left as ``""`` the index
        is materialised as a new ``"id"`` column and used instead.
    null_name : str, optional
        Sentinel that marks missing values (e.g. ``"?"``). Every cell equal to
        it becomes ``NaN``. ``""`` disables the conversion.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames hold every
        column except ``target``; the ``y_*`` frames hold the id column and
        ``target``.
    """
    if id_name == "":
        # No id column supplied: expose the current index as "id".
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = "id"

    if null_name != "":
        # replace() returns a new frame, so the caller's DataFrame is never
        # modified; an in-place masked assignment would alter it whenever an
        # explicit id_name is passed (no local copy is made on that path).
        df = df.replace(null_name, np.nan)

    # Fixed 80/20 split; the seed keeps the split reproducible.
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test
    
# Read the Adult census income CSV and build the exam-style split;
# cells equal to '?' are turned into NaN by exam_data_load.
df = pd.read_csv("../input/adult-census-income/adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')


데이터 불러오기

In [60]:
# (rows, columns) of each of the four splits, in order
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

EDA

In [61]:
# Count missing values per column
X_train.isna().sum()

id                   0
age                  0
workclass         1456
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1463
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     461
dtype: int64

In [62]:
# Drop the three columns that contain missing values, using one shared list
# so the train and test frames keep identical columns.
cols_with_missing = ['workclass', 'occupation', 'native.country']
X_train = X_train.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

In [63]:
# Feature groups
# Numeric columns
numeric_features = [
    'age', 'fnlwgt', 'education.num',
    'capital.gain', 'capital.loss', 'hours.per.week',
]

# Categorical columns
cat_features = ['education', 'marital.status', 'relationship', 'race', 'sex']

In [64]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Stack train and test (tagged by origin) so each categorical column is
# encoded against the values present in either split.
all_df = pd.concat([X_train.assign(ind="train"), X_test.assign(ind="test")])
le = LabelEncoder()
for col in cat_features:
    # fit_transform re-fits the encoder on each column in turn
    all_df[col] = le.fit_transform(all_df[col])

# Recover the training rows and discard the origin tag.
X_train = all_df.loc[all_df['ind'] == 'train'].drop(columns='ind')
X_train

Unnamed: 0,id,age,fnlwgt,education,education.num,marital.status,relationship,race,sex,capital.gain,capital.loss,hours.per.week
21851,21851,36,241998,9,13,2,0,4,1,0,0,50
7632,7632,53,103950,12,14,0,1,4,0,0,0,40
27878,27878,19,203061,15,10,4,1,4,0,0,0,25
14121,14121,20,102607,11,9,4,3,4,1,0,0,30
32345,32345,54,138852,11,9,2,0,4,1,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...
2669,2669,45,187370,12,14,0,4,4,1,7430,0,70
17536,17536,36,174308,1,7,0,1,4,1,0,0,40
6201,6201,47,275361,7,12,6,3,4,0,0,0,35
27989,27989,50,196504,10,16,2,0,4,1,0,0,23


In [65]:
# Recover the test rows from the combined frame and discard the origin tag.
X_test = all_df.loc[all_df['ind'] == 'test'].drop(columns='ind')
X_test

Unnamed: 0,id,age,fnlwgt,education,education.num,marital.status,relationship,race,sex,capital.gain,capital.loss,hours.per.week
20901,20901,58,114495,11,9,2,0,4,1,0,0,40
14170,14170,46,247043,11,9,2,0,4,1,0,0,40
1776,1776,67,103315,12,14,4,2,4,0,15831,0,72
30428,30428,18,165532,15,10,4,3,4,1,0,0,15
8602,8602,26,58039,15,10,2,0,4,1,0,0,40
...,...,...,...,...,...,...,...,...,...,...,...,...
31222,31222,22,199426,15,10,4,1,4,0,0,0,40
10861,10861,41,155106,11,9,2,0,4,1,0,0,40
8929,8929,32,153078,9,13,4,1,1,1,0,0,40
2066,2066,48,171926,14,15,2,0,4,1,15024,0,50


In [66]:
# Scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Learn each column's min/max from the training split only, then apply the
# same mapping to both splits.
scaler.fit(X_train[numeric_features])
X_train[numeric_features] = scaler.transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
X_train

Unnamed: 0,id,age,fnlwgt,education,education.num,marital.status,relationship,race,sex,capital.gain,capital.loss,hours.per.week
21851,21851,0.260274,0.156011,9,0.800000,2,0,4,1,0.000000,0.0,0.500000
7632,7632,0.493151,0.062255,12,0.866667,0,1,4,0,0.000000,0.0,0.397959
27878,27878,0.027397,0.129566,15,0.600000,4,1,4,0,0.000000,0.0,0.244898
14121,14121,0.041096,0.061343,11,0.533333,4,3,4,1,0.000000,0.0,0.295918
32345,32345,0.506849,0.085958,11,0.533333,2,0,4,1,0.000000,0.0,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...
2669,2669,0.383562,0.118910,12,0.866667,0,4,4,1,0.074301,0.0,0.704082
17536,17536,0.260274,0.110039,1,0.400000,0,1,4,1,0.000000,0.0,0.397959
6201,6201,0.410959,0.178669,7,0.733333,6,3,4,0,0.000000,0.0,0.346939
27989,27989,0.452055,0.125113,10,1.000000,2,0,4,1,0.000000,0.0,0.224490


In [67]:
# Column dtypes and non-null counts after encoding and scaling
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26048 entries, 21851 to 25716
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26048 non-null  int64  
 1   age             26048 non-null  float64
 2   fnlwgt          26048 non-null  float64
 3   education       26048 non-null  int32  
 4   education.num   26048 non-null  float64
 5   marital.status  26048 non-null  int32  
 6   relationship    26048 non-null  int32  
 7   race            26048 non-null  int32  
 8   sex             26048 non-null  int32  
 9   capital.gain    26048 non-null  float64
 10  capital.loss    26048 non-null  float64
 11  hours.per.week  26048 non-null  float64
dtypes: float64(6), int32(5), int64(1)
memory usage: 2.1 MB


In [68]:
# Column names of the test features
X_test.columns

Index(['id', 'age', 'fnlwgt', 'education', 'education.num', 'marital.status',
       'relationship', 'race', 'sex', 'capital.gain', 'capital.loss',
       'hours.per.week'],
      dtype='object')

In [69]:
# Column names of the training target frame
y_train.columns

Index(['id', 'income'], dtype='object')

In [70]:
# Encode the target: 1 when income is anything other than '<=50K', else 0
y = (y_train['income'] != '<=50K').astype(int)
# head() is explicitly positional; a bare integer slice such as y[:5] on an
# integer-labelled Series is ambiguous between label and position slicing.
y.head()

  y[:5]


21851    1
7632     0
27878    0
14121    0
32345    0
Name: income, dtype: int32

In [71]:
# Split the training data into a training part and a validation part
from sklearn.model_selection import train_test_split

# 15% of the rows go to validation; the seed keeps the split repeatable.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y,
    test_size=0.15,
    random_state=2021,
)
tuple(part.shape for part in (X_tr, X_val, y_tr, y_val))

((22140, 12), (3908, 12), (22140,), (3908,))

In [72]:
# Remove the id column from both splits before fitting
X_tr, X_val = (part.drop(columns='id') for part in (X_tr, X_val))

In [73]:
# Random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training chain.
model = RandomForestClassifier(random_state=2022).fit(X_tr, y_tr)
pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, pred)
print('accuracy score:', val_accuracy)

accuracy score: 0.8421187308085978


In [74]:
# Predict on the test data.
# pop() returns the column and removes it from X_test in place, so the call is
# guarded: re-running this cell after 'id' is already gone would otherwise
# raise KeyError. On a re-run X_test_id keeps the value from the first run.
if 'id' in X_test.columns:
    X_test_id = X_test.pop('id')
pred = model.predict(X_test)

In [75]:
# Build the submission table (id + predicted label) and write it to CSV
submission_columns = {'id': X_test_id, 'income': pred}
output = pd.DataFrame(submission_columns)
output.to_csv("000000.csv", index=False)
output.head()

Unnamed: 0,id,income
20901,20901,0
14170,14170,0
1776,1776,1
30428,30428,0
8602,8602,0


In [76]:
# Score the test predictions against the held-out labels.
from sklearn.metrics import accuracy_score

# y_test starts as a DataFrame holding 'id' and 'income'. Convert it to 0/1
# labels only while it is still a DataFrame, so re-running this cell does not
# fail on y_test['income'] after the first conversion.
if isinstance(y_test, pd.DataFrame):
    y_test = (y_test['income'] != '<=50K').astype(int)
print('accuracy score:', (accuracy_score(y_test, pred)))

accuracy score: 0.8440042990941194
