In [71]:
# 판다스 라이브러리
import pandas as pd

# Load the training and test sets; the file names are resolved relative to
# the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preview the first three rows of the training data.
print(train.head(3))

# (rows, columns) of each frame.
print(train.shape, test.shape)

# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly -- wrapping it in
# print() would append a stray "None" line to the output.
train.info()

# Number of missing values per column (train).
print(train.isnull().sum())

# Number of missing values per column (test).
print(test.isnull().sum())

# Row count per class of the target column 'income'.
print(train['income'].value_counts())

# The four snippets below demonstrate different ways of discarding missing
# data. Each one builds a new frame from `train` (no in-place option is used,
# so `train` itself is not modified) and rebinds the scratch name `df`.

# 1) Drop every row that has at least one missing value
#    (dropna() works row-wise by default, i.e. axis=0).
print("처리전:", train.shape)
df = train.dropna()
print("처리후:", df.shape)

# 2) Drop only the rows that are missing 'native.country' or 'workclass'.
df = train.dropna(subset=['native.country','workclass'])
# Printed explicitly: a bare expression that is not the last statement of a
# notebook cell is evaluated and then discarded, so the check was never shown.
print(df.isnull().sum())

# 3) Drop every column that has at least one missing value.
print("처리전:", train.shape)
df = train.dropna(axis=1)
print("처리후:", df.shape)

# 4) Drop specific columns by name.
print("처리전:", train.shape)
df = train.drop(columns=['native.country','workclass'])
print("처리후:", df.shape)

# Mode imputation for the categorical columns. The fill value is computed
# once from train and reused for test, so both frames receive the same value.
for col in ['workclass', 'native.country']:
    m = train[col].mode()[0]
    train[col] = train[col].fillna(m)
    test[col] = test[col].fillna(m)

# Printed explicitly: a bare expression in the middle of a cell is discarded.
print(train.isnull().sum())

# Treat a missing 'occupation' as its own category, labelled 'X'.
train['occupation'] = train['occupation'].fillna('X')
test['occupation'] = test['occupation'].fillna("X")
print(train.isnull().sum())

# Fill 'age' with the train mean, truncated to an integer by int().
value = int(train['age'].mean())
print("평균값:", value)
train['age'] = train['age'].fillna(value)
test['age'] = test['age'].fillna(value)

# Fill 'hours.per.week' with the train median, truncated to an integer.
value = int(train['hours.per.week'].median())
print("중앙값:", value)
train['hours.per.week'] = train['hours.per.week'].fillna(value)
test['hours.per.week'] = test['hours.per.week'].fillna(value)

# Final check; as the last expression of the cell this is shown as its output.
train.isnull().sum()

# Optional outlier removal (disabled): keep only rows with a positive age.
# print(train.shape)
# train = train[train['age']>0]
# print(train.shape)

      id   age  workclass  fnlwgt     education  education.num  \
0   3331  34.0  State-gov  177331  Some-college             10   
1  19749  58.0    Private  290661       HS-grad              9   
2   1157  48.0    Private  125933  Some-college             10   

       marital.status       occupation relationship   race     sex  \
0  Married-civ-spouse   Prof-specialty      Husband  Black    Male   
1  Married-civ-spouse     Craft-repair      Husband  White    Male   
2             Widowed  Exec-managerial    Unmarried  Black  Female   

   capital.gain  capital.loss  hours.per.week native.country income  
0          4386             0            40.0  United-States   >50K  
1             0             0            40.0  United-States  <=50K  
2             0          1669            38.0  United-States  <=50K  
(29304, 16) (3257, 15)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29304 entries, 0 to 29303
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype

id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Separate the label from the features. The full label series is kept under
# its own name (`target`) because `y_train` is rebound to the training split
# further down. The guard lets this cell be re-run after 'income' has already
# been popped out of `train` (in that case `target` from the earlier run is
# reused).
if "income" in train.columns:
    target = train.pop("income")

# Categorical (object-dtype) columns to encode. Alternatives to the explicit list:
# cols = train.select_dtypes(include='object').columns
# cols = train.columns[train.dtypes == object]
cols = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex','native.country']

# Label encoding. Each encoder is fitted on the values of train AND test:
# LabelEncoder.transform raises ValueError for a label it did not see during
# fit, so fitting on train alone fails when a category occurs only in test.
# When test holds no unseen category the resulting codes are the same as
# those from fitting on train alone.
for col in cols:
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

print(train.head())

# info() prints its own report and returns None, so it is not wrapped in print().
train.info()
test.info()

# Class balance of the full label series.
print(target.value_counts())

# Hold out 20% of the rows for validation; random_state fixes the split.
X_train, X_val, y_train, y_val = train_test_split(train,
                                                  target,
                                                  test_size=0.2,
                                                  random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Train a random forest (an ensemble of decision trees) and score the
# validation split.
# NOTE(review): `train` still contains the 'id' column here, so it is used as
# a model feature -- confirm that this is intended.
rf = RandomForestClassifier(random_state=0)
# Alternative hyper-parameters:
# rf = RandomForestClassifier(max_depth=7, n_estimators=200, random_state=0)
rf.fit(X_train, y_train)

# predict_proba returns one probability column per class; per the original
# note the column order is <=50K, >50K.
pred = rf.predict_proba(X_val)
print(pred)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29304 entries, 0 to 29303
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              29304 non-null  int64  
 1   age             29304 non-null  float64
 2   workclass       29304 non-null  int32  
 3   fnlwgt          29304 non-null  int64  
 4   education       29304 non-null  int32  
 5   education.num   29304 non-null  int64  
 6   marital.status  29304 non-null  int32  
 7   occupation      29304 non-null  int32  
 8   relationship    29304 non-null  int32  
 9   race            29304 non-null  int32  
 10  sex             29304 non-null  int32  
 11  capital.gain    29304 non-null  int64  
 12  capital.loss    29304 non-null  int64  
 13  hours.per.week  29304 non-null  float64
 14  native.country  29304 non-null  int32  
dtypes: float64(2), int32(8), int64(5)
memory usage: 2.5 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3257 entrie

In [69]:
# Evaluate on the validation split with ROC AUC.
from sklearn.metrics import roc_auc_score

# Per the original note the probability columns are ordered <=50K, >50K,
# so column 1 is the score passed to the metric.
val_scores = pred[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=val_scores)
print('roc_auc:', roc_auc)

# Class probabilities for the test set.
pred2 = rf.predict_proba(test)
print(pred2)

# Build the submission: a single 'pred' column holding the column-1
# probability, written without the index.
test_scores = pred2[:, 1]
submit = pd.Series(test_scores, name='pred').to_frame()
submit.to_csv("result.csv", index=False)

# Read the file back so the cell output shows what was written.
pd.read_csv("result.csv")

roc_auc: 0.9173016455843583
[[0.92 0.08]
 [0.99 0.01]
 [0.93 0.07]
 ...
 [0.86 0.14]
 [1.   0.  ]
 [0.89 0.11]]


Unnamed: 0,pred
0,0.08
1,0.01
2,0.07
3,0.72
4,0.03
...,...
3252,0.03
3253,0.43
3254,0.14
3255,0.00
