In [148]:
import pandas as pd
import numpy as np

In [149]:
df = pd.read_csv('human.csv', encoding='cp949')
df.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,성별,자본 이득,자본 손실,주당 시간,모국
0,H20001,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,H20002,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,H20003,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,H20004,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,H20005,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [104]:
mdf = df.copy()


In [105]:
# 결측치 처리 : 1. drop 2. SimpleImputer
columns = ['노동 계급', '직업', '모국']
mdf = mdf.drop(columns, axis = 1)

In [106]:
mdf.head()

Unnamed: 0,아이디,나이,fnlwgt,학력,교육 수,혼인 상태,관계,인종,성별,자본 이득,자본 손실,주당 시간
0,H20001,39,77516,Bachelors,13,Never-married,Not-in-family,White,Male,2174,0,40
1,H20002,50,83311,Bachelors,13,Married-civ-spouse,Husband,White,Male,0,0,13
2,H20003,38,215646,HS-grad,9,Divorced,Not-in-family,White,Male,0,0,40
3,H20004,53,234721,11th,7,Married-civ-spouse,Husband,Black,Male,0,0,40
4,H20005,28,338409,Bachelors,13,Married-civ-spouse,Wife,Black,Female,0,0,40


In [107]:
# 문자형 -> 수치형
from sklearn.preprocessing import LabelEncoder
# 1. 숫자로 변환 가능한 문자형 데이터 변환
mdf = mdf.apply(pd.to_numeric, errors="ignore")  # 변환할 수 있는 것만 변환

# 2. Label Encoding 적용 (카테고리형 변수)
label_cols = ['성별', '혼인 상태', '인종']  # 변환할 열 지정
for col in label_cols:
    mdf[col] = LabelEncoder().fit_transform(mdf[col])

# 3. One-Hot Encoding 적용 (필요할 경우)
mdf = pd.get_dummies(mdf, columns=['학력', '관계'])

print(mdf.dtypes)  # 변환 확인

아이디                   object
나이                     int64
fnlwgt                 int64
교육 수                   int64
혼인 상태                  int32
인종                     int32
성별                     int32
자본 이득                  int64
자본 손실                  int64
주당 시간                  int64
학력_ 10th                bool
학력_ 11th                bool
학력_ 12th                bool
학력_ 1st-4th             bool
학력_ 5th-6th             bool
학력_ 7th-8th             bool
학력_ 9th                 bool
학력_ Assoc-acdm          bool
학력_ Assoc-voc           bool
학력_ Bachelors           bool
학력_ Doctorate           bool
학력_ HS-grad             bool
학력_ Masters             bool
학력_ Preschool           bool
학력_ Prof-school         bool
학력_ Some-college        bool
관계_ Husband             bool
관계_ Not-in-family       bool
관계_ Other-relative      bool
관계_ Own-child           bool
관계_ Unmarried           bool
관계_ Wife                bool
dtype: object


  mdf = mdf.apply(pd.to_numeric, errors="ignore")  # 변환할 수 있는 것만 변환


In [108]:
# 여러 알고리즘 적용하여 예측율을 최대한 높이기
from sklearn.model_selection import train_test_split 
dfX = mdf.drop(['아이디','성별'], axis=1)
dfy = mdf['성별']
X_train, X_test, y_train, y_test = train_test_split(dfX, dfy, test_size=0.25, random_state=0)

In [109]:
display(X_train.shape, X_test.shape)
X_train.head()

(24420, 30)

(8141, 30)

Unnamed: 0,나이,fnlwgt,교육 수,혼인 상태,인종,자본 이득,자본 손실,주당 시간,학력_ 10th,학력_ 11th,...,학력_ Masters,학력_ Preschool,학력_ Prof-school,학력_ Some-college,관계_ Husband,관계_ Not-in-family,관계_ Other-relative,관계_ Own-child,관계_ Unmarried,관계_ Wife
26464,59,61885,8,0,2,0,0,35,False,False,...,False,False,False,False,False,False,True,False,False,False
16134,71,180733,14,4,4,0,0,20,False,False,...,True,False,False,False,False,True,False,False,False,False
4747,42,107762,14,2,4,0,0,40,False,False,...,True,False,False,False,True,False,False,False,False,False
8369,26,35917,9,2,4,0,0,40,False,False,...,False,False,False,False,True,False,False,False,False,False
5741,46,256522,2,4,4,0,0,40,False,False,...,False,False,False,False,False,False,False,True,False,False


In [110]:
from sklearn.tree import DecisionTreeClassifier

In [111]:
tree = DecisionTreeClassifier(max_depth=6, random_state=0)

In [112]:
tree.fit(X_train, y_train)

In [113]:
pred_tree = tree.predict(X_test); pred_tree

array([0, 1, 1, ..., 1, 1, 0])

In [114]:
display(tree.score(X_train, y_train))
display(tree.score(X_test, y_test))

0.8058968058968059

0.7962166809974205

In [115]:
from sklearn.svm import SVC
svm = SVC(random_state=0)
svm.fit(X_train, y_train)

In [116]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier()
mlp.fit(X_train, y_train)

In [117]:
display(tree.score(X_train, y_train))
display(tree.score(X_test, y_test))

0.8058968058968059

0.7962166809974205

In [118]:
display(svm.score(X_train, y_train))
display(svm.score(X_test, y_test))

0.6696150696150697

0.6680997420464316

In [119]:
display(mlp.score(X_train, y_train))
display(mlp.score(X_test, y_test))

0.6702702702702703

0.6692052573393932

In [120]:
best_model = tree
best_model.score(X_test, y_test)

0.7962166809974205

In [121]:
new = pd.read_csv('human_new.csv', encoding='cp949')

columns = ['노동 계급', '직업', '모국']

ndf = new.drop(columns, axis = 1)

ndf = ndf.apply(pd.to_numeric, errors="ignore")  # 변환할 수 있는 것만 변환

# 2. Label Encoding 적용 (카테고리형 변수)
label_cols = ['혼인 상태', '인종']  # 변환할 열 지정
for col in label_cols:
    ndf[col] = LabelEncoder().fit_transform(ndf[col])

# 3. One-Hot Encoding 적용 (필요할 경우)
ndf = pd.get_dummies(ndf, columns=['학력', '관계'])

ndf.head()

  ndf = ndf.apply(pd.to_numeric, errors="ignore")  # 변환할 수 있는 것만 변환


Unnamed: 0,아이디,나이,fnlwgt,교육 수,혼인 상태,인종,자본 이득,자본 손실,주당 시간,학력_ 10th,...,학력_ Masters,학력_ Preschool,학력_ Prof-school,학력_ Some-college,관계_ Husband,관계_ Not-in-family,관계_ Other-relative,관계_ Own-child,관계_ Unmarried,관계_ Wife
0,H00001,25,226802,7,4,2,0,0,40,False,...,False,False,False,False,False,False,False,True,False,False
1,H00002,38,89814,9,2,4,0,0,50,False,...,False,False,False,False,True,False,False,False,False,False
2,H00003,28,336951,12,2,4,0,0,40,False,...,False,False,False,False,True,False,False,False,False,False
3,H00004,44,160323,10,2,2,7688,0,40,False,...,False,False,False,True,True,False,False,False,False,False
4,H00005,18,103497,10,4,4,0,0,30,False,...,False,False,False,True,False,False,False,True,False,False


In [122]:
ndf.isnull().sum()

아이디                   0
나이                    0
fnlwgt                0
교육 수                  0
혼인 상태                 0
인종                    0
자본 이득                 0
자본 손실                 0
주당 시간                 0
학력_ 10th              0
학력_ 11th              0
학력_ 12th              0
학력_ 1st-4th           0
학력_ 5th-6th           0
학력_ 7th-8th           0
학력_ 9th               0
학력_ Assoc-acdm        0
학력_ Assoc-voc         0
학력_ Bachelors         0
학력_ Doctorate         0
학력_ HS-grad           0
학력_ Masters           0
학력_ Preschool         0
학력_ Prof-school       0
학력_ Some-college      0
관계_ Husband           0
관계_ Not-in-family     0
관계_ Other-relative    0
관계_ Own-child         0
관계_ Unmarried         0
관계_ Wife              0
dtype: int64

In [123]:
ndf['성별'] = tree.predict(ndf.loc[:,'나이':'관계_ Wife'])
print(ndf)

          아이디  나이  fnlwgt  교육 수  혼인 상태  인종  자본 이득  자본 손실  주당 시간  학력_ 10th  \
0      H00001  25  226802     7      4   2      0      0     40     False   
1      H00002  38   89814     9      2   4      0      0     50     False   
2      H00003  28  336951    12      2   4      0      0     40     False   
3      H00004  44  160323    10      2   2   7688      0     40     False   
4      H00005  18  103497    10      4   4      0      0     30     False   
...       ...  ..     ...   ...    ...  ..    ...    ...    ...       ...   
16276  H16277  20  216672    10      4   4      0      0     30     False   
16277  H16278  25   61956    13      4   4   4650      0     45     False   
16278  H16279  33  157216    14      4   4      0      0     40     False   
16279  H16280  68  150250    14      2   4      0   1510     30     False   
16280  H16281  37  112838    14      2   4      0      0     55     False   

       ...  학력_ Preschool  학력_ Prof-school  학력_ Some-college  관계_ Husband  

In [126]:
ndf['성별'] = ndf['성별'].map({1: 'Male', 0: 'Female'})
df_selected = ndf[['아이디', '성별']]

# 새로운 CSV 파일로 저장 (인덱스 없이)
df_selected.to_csv("아이디_성별.csv", index=False, encoding="cp949")  

print("✅ '아이디_성별.csv' 저장 완료!")

✅ '아이디_성별.csv' 저장 완료!


In [127]:
# (추가)
# 1. MinMaxScaler
# 2. StandardScaler
# 3. PolynomialFeatures
# 4. SelectKBest
# 5. One-Hot Encoding


In [157]:
# 1~4 각각의 예측율 비교3
from sklearn.preprocessing import MinMaxScaler
mm_df = df.copy()

mm_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   아이디     32561 non-null  object
 1   나이      32561 non-null  int64 
 2   노동 계급   30725 non-null  object
 3   fnlwgt  32561 non-null  int64 
 4   학력      32561 non-null  object
 5   교육 수    32561 non-null  int64 
 6   혼인 상태   32561 non-null  object
 7   직업      30718 non-null  object
 8   관계      32561 non-null  object
 9   인종      32561 non-null  object
 10  성별      32561 non-null  object
 11  자본 이득   32561 non-null  int64 
 12  자본 손실   32561 non-null  int64 
 13  주당 시간   32561 non-null  int64 
 14  모국      31978 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [154]:
obj=['학력','혼인 상태','관계','인종','성별'] 
dataP_imp[obj] = dataP_imp[obj].apply(lambda x: x.astype('category').cat.codes) 
columns = ['아아디', '노동 계급', '직업', '모국']
mm_df = mm_df.drop(columns, axis=1)
mm_df.head()

Unnamed: 0,나이,fnlwgt,학력,교육 수,혼인 상태,관계,인종,성별,자본 이득,자본 손실,주당 시간
0,39,77516,Bachelors,13,Never-married,Not-in-family,White,Male,2174,0,40
1,50,83311,Bachelors,13,Married-civ-spouse,Husband,White,Male,0,0,13
2,38,215646,HS-grad,9,Divorced,Not-in-family,White,Male,0,0,40
3,53,234721,11th,7,Married-civ-spouse,Husband,Black,Male,0,0,40
4,28,338409,Bachelors,13,Married-civ-spouse,Wife,Black,Female,0,0,40


In [155]:
scaler = MinMaxScaler()
scaler.fit(mm_df)

ValueError: could not convert string to float: ' Bachelors'