In [1]:
# 시험환경 세팅 (코드 변경 X)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def exam_data_load(df, target, id_name="", null_name=""):
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name
    
    if null_name != "":
        df[df == null_name] = np.nan
    
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)
    
    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    
    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test 
    
df = pd.read_csv("adult.csv")
X_train, X_test, y_train, y_test = exam_data_load(df, target='income', null_name='?')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26048, 15), (6513, 15), (26048, 2), (6513, 2))

In [2]:
# EDA

print(X_train.head())
y_train['income'].value_counts()

          id  age  workclass  fnlwgt     education  education.num  \
21851  21851   36    Private  241998     Bachelors             13   
7632    7632   53    Private  103950       Masters             14   
27878  27878   19    Private  203061  Some-college             10   
14121  14121   20    Private  102607       HS-grad              9   
32345  32345   54  State-gov  138852       HS-grad              9   

           marital.status         occupation   relationship   race     sex  \
21851  Married-civ-spouse       Craft-repair        Husband  White    Male   
7632             Divorced     Prof-specialty  Not-in-family  White  Female   
27878       Never-married       Tech-support  Not-in-family  White  Female   
14121       Never-married  Handlers-cleaners      Own-child  White    Male   
32345  Married-civ-spouse     Prof-specialty        Husband  White    Male   

       capital.gain  capital.loss  hours.per.week native.country  
21851             0             0              50

income
<=50K    19756
>50K      6292
Name: count, dtype: int64

In [3]:
# 결측치 처리
# X_train.isnull().sum()
# X_test.isnull().sum()

print(X_train['workclass'].head())
print(X_train['occupation'].head())
X_train['native.country'].head()

# 모두 문자열로 구성
X_train['workclass'] = X_train['workclass'].fillna("-")
X_train['occupation'] = X_train['occupation'].fillna("-")
X_train['native.country'] = X_train['native.country'].fillna("-")

21851      Private
7632       Private
27878      Private
14121      Private
32345    State-gov
Name: workclass, dtype: object
21851         Craft-repair
7632        Prof-specialty
27878         Tech-support
14121    Handlers-cleaners
32345       Prof-specialty
Name: occupation, dtype: object


In [4]:
# 전처리 작업 수행
# 문자열 -> 숫자로
from sklearn.preprocessing import LabelEncoder

col = ["workclass", "education", "marital.status", "occupation", "relationship", "race",
       "sex", "native.country"]

prepre = LabelEncoder()

for column in col:
    X_train[column] = prepre.fit_transform(X_train[column])

X_train.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
21851,21851,36,4,241998,9,13,2,3,0,4,1,0,0,50,39
7632,7632,53,4,103950,12,14,0,10,1,4,0,0,0,40,39
27878,27878,19,4,203061,15,10,4,13,1,4,0,0,0,25,39
14121,14121,20,4,102607,11,9,4,6,3,4,1,0,0,30,39
32345,32345,54,7,138852,11,9,2,10,0,4,1,0,0,40,39


In [5]:
X_train.head()
y_train.head()
# s = ["age", "fnlwgt", "education.num", "marital.status","occupation","relationship","race"]
# train_data = X_train[s]

# from sklearn.ensemble import RandomForestClassifier

# model = RandomForestClassifier()
# model.fit(train_data, y_train)

Unnamed: 0,id,income
21851,21851,>50K
7632,7632,<=50K
27878,27878,<=50K
14121,14121,<=50K
32345,32345,<=50K


In [5]:
from sklearn.preprocessing import LabelEncoder

y_train = y_train.drop(['id'], axis = 1)
y_test = y_test.drop(['id'], axis = 1)
print(y_train)

pre = LabelEncoder()
y_train = pre.fit_transform(y_train)
y_test = pre.fit_transform(y_test)


      income
21851   >50K
7632   <=50K
27878  <=50K
14121  <=50K
32345  <=50K
...      ...
2669    >50K
17536  <=50K
6201   <=50K
27989  <=50K
25716  <=50K

[26048 rows x 1 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [7]:

print(y_train, y_test)

[1 0 0 ... 0 0 0] [0 1 1 ... 0 1 1]


In [8]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)

In [9]:
# 평가

from sklearn.metrics import accuracy_score

pred = model.predict(X_train)
accuracy = accuracy_score(pred, y_train)
print(accuracy)


1.0


In [23]:
from sklearn.preprocessing import LabelEncoder

col = ["workclass", "education", "marital.status", "occupation", "relationship", "race",
       "sex", "native.country"]

prepre = LabelEncoder()

for column in col:
    X_test[column] = prepre.fit_transform(X_test[column])

X_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
20901,58,2,114495,11,9,2,3,0,4,1,0,0,40,37
14170,46,2,247043,11,9,2,13,0,4,1,0,0,40,37
1776,67,1,103315,12,14,4,3,2,4,0,15831,0,72,37
30428,18,2,165532,15,10,4,11,3,4,1,0,0,15,37
8602,26,5,58039,15,10,2,7,0,4,1,0,0,40,37


In [11]:
result = model.predict(X_test)

final = accuracy_score(result, y_test)
print(final)

0.8549055734684478


In [12]:
print(result)

[0 0 1 ... 0 1 1]


In [22]:
answer = []
for r in result:
    if r == 1:
        answer.append(">50K")
    else:
        answer.append("<=50K")

data = pd.DataFrame({'id':X_test.index, 'income': answer})

data.head()
#data.to_csv('result.csv', index = False)


Unnamed: 0,id,income
0,20901,<=50K
1,14170,<=50K
2,1776,>50K
3,30428,<=50K
4,8602,<=50K
