# part 2 chapter 2
> 머신러닝 프로세스
>> 문제 정의 -> 라이브러리/데이터 불러오기 -> EDA -> 데이터 전처리 -> 검증 데이터 나누기 -> 모델 학습 및 평가(최고 효율이 나올 때까지 데이터 전처리 단계로 가서 아래 프로세스 반복 수행) -> 예측 및 제출

In [1]:
import pandas as pd
# Fetch the train/test CSVs over HTTPS (the original used cleartext HTTP) so
# the downloaded data is encrypted and authenticated in transit.
DATA_URL = "https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch2"
train = pd.read_csv(f"{DATA_URL}/train.csv")
test = pd.read_csv(f"{DATA_URL}/test.csv")

# 머신러닝 분류 실습

## EDA

In [2]:
# Step 1 - head(): look at a sample of the data.
print(train.head(), '\n')
# Step 2 - shape (an attribute, so no parentheses): number of rows and columns.
# train has one more column than test because only train carries the target.
print(train.shape, '\n')
print(test.shape, '\n')
# Step 3 - info(): column dtypes; object columns must be encoded later.
# info() prints its report itself and returns None, so it is called directly
# instead of being wrapped in print() (which would also print "None").
train.info()
print()
test.info()
print()
# Step 4-1 - describe(): summary statistics of the numeric columns.
print(train.describe(), '\n')
print(test.describe(), '\n')
# Step 4-2 - describe(include='O'): summary statistics of the object columns.
print(train.describe(include = 'O'), '\n')
print(test.describe(include = 'O'), '\n')
# Step 5 - isnull().sum(): missing values per column; any found here are
# handled in the preprocessing stage.
print(train.isnull().sum(), '\n')
print(test.isnull().sum(), '\n')
# Step 6 - value_counts() on the target column ('income'): frequency of each
# class, to confirm the task is binary and to see whether it is imbalanced.
print(train['income'].value_counts())

      id   age  workclass  fnlwgt     education  education.num  \
0   3331  34.0  State-gov  177331  Some-college             10   
1  19749  58.0    Private  290661       HS-grad              9   
2   1157  48.0    Private  125933  Some-college             10   
3    693  58.0    Private  100313  Some-college             10   
4  12522  41.0    Private  195661  Some-college             10   

       marital.status        occupation relationship   race     sex  \
0  Married-civ-spouse    Prof-specialty      Husband  Black    Male   
1  Married-civ-spouse      Craft-repair      Husband  White    Male   
2             Widowed   Exec-managerial    Unmarried  Black  Female   
3  Married-civ-spouse   Protective-serv      Husband  White    Male   
4  Married-civ-spouse  Transport-moving      Husband  White    Male   

   capital.gain  capital.loss  hours.per.week native.country income  
0          4386             0            40.0  United-States   >50K  
1             0             0       

## 데이터 전처리
결측치, 인코딩: 필수
이상치, 스케일링: 선택
데이터 전처리는 train 데이터뿐만 아니라 test 데이터에도 동일하게 적용해야 함

### 결측치 삭제

In [3]:
# Demonstrates four ways of removing missing data. Every variant starts from
# `train` and stores its result in `df`, so `train` itself is never modified.
key_columns = ['native.country', 'workclass']

# dropna(): remove every row that has at least one missing value
# (the option the author recommends).
missing_before = train.isnull().sum()
print("처리 전\n", missing_before, "\n")
df = train.dropna(how='any')
missing_after = df.isnull().sum()
print("처리 후\n", missing_after, "\n")

# dropna(subset=[...]): remove only the rows that are missing a value in one
# of the listed columns; other columns may still contain missing values.
df = train.dropna(subset=key_columns)
print("dropna(subset = ['native.country', 'workclass'])\n", df.isnull().sum(), "\n")

# dropna(axis=1): remove every column that contains a missing value.
# Risky, because whole features are lost - use with care.
df = train.dropna(axis='columns')
print("dropna(axis=1)\n", df.isnull().sum(), "\n", "\n")

# drop([...], axis=1): remove only the named columns.
df = train.drop(columns=key_columns)
print("drop(['native.country', 'workclass'], axis = 1)\n", df.isnull().sum(), "\n")

처리 전
 id                   0
age                 12
workclass         1662
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1668
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week      13
native.country     537
income               0
dtype: int64 

처리 후
 id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64 

dropna(subset = ['native.country', 'workclass'])
 id                 0
age               12
workclass          0
fnlwgt             0
education          0
education.num      0
marital.status     0
occupation         6
relationship       0
race               0
sex       

### 결측치 채우기(범주형)

In [4]:
# Fill missing categorical values with the most frequent value (mode) of the
# corresponding training column.
mode_by_col = {}
for col in ('workclass', 'native.country'):
    mode_by_col[col] = train[col].mode()[0]
    train[col] = train[col].fillna(mode_by_col[col])

train.isnull().sum()

# Alternative strategy: treat "missing" as its own category by filling with an
# arbitrary placeholder value ('X' here).
train['occupation'] = train['occupation'].fillna('X')
train.isnull().sum()

# Apply the same preprocessing to test, reusing the fill values derived from
# the training data.
for col, fill_value in {**mode_by_col, 'occupation': 'X'}.items():
    test[col] = test[col].fillna(fill_value)

### 결측치 채우기(수치형)
> 결측치를 채울 컬럼의 평균값 또는 중앙값으로 대체

In [5]:
# Mean imputation: fill missing ages with the training mean (truncated to an
# integer), using the same value for both train and test.
age_fill = int(train['age'].mean())
print("평균값:", age_fill)
for frame in (train, test):
    frame['age'] = frame['age'].fillna(age_fill)

# Median imputation: fill missing weekly hours with the training median
# (truncated to an integer), again shared by train and test.
hours_fill = int(train['hours.per.week'].median())
print("중앙값:", hours_fill)
for frame in (train, test):
    frame['hours.per.week'] = frame['hours.per.week'].fillna(hours_fill)

# Final check: remaining missing values per column in train.
train.isnull().sum()

평균값: 38
중앙값: 40


id                0
age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64

### 이상치 처리

In [6]:
# Outlier handling for the age column.
# NOTE: the original heading mentions IQR, but no IQR is computed in this
# cell; rows are filtered with a plain threshold on age instead.
train.describe()
# Inspect the rows whose age is zero or negative in each frame.
train[train['age'].le(0)]
test[test['age'].le(0)]
print(train.shape)
# Keep only the rows with age >= 0. This removes negative ages; rows with
# age == 0 are retained by this condition.
non_negative_age = train['age'].ge(0)
train = train[non_negative_age]
print(train.shape)

(29304, 16)
(29301, 16)


### 인코딩
> train과 test의 컬럼이 다르면 머신러닝 입력 데이터로 사용 불가

In [7]:
# One-hot encoding.
# The target column is popped out of train first, so it is stored in y_train
# and is not encoded together with the features.
y_train = train.pop('income')
# pd.get_dummies(frame) automatically encodes only the columns that need it
# and returns the encoded result as a new frame.
train_oh, test_oh = pd.get_dummies(train), pd.get_dummies(test)
print(*(frame.shape for frame in (train, test, train_oh, test_oh)))
# The encoded train and test frames end up with different numbers of columns
# (compare the last two printed shapes), so they cannot be used as model
# input as they are.

(29301, 15) (3257, 15) (29301, 107) (3257, 103)


In [8]:
# Label encoding: replace each categorical (object-dtype) column with integer
# codes, in place, in both train and test.
from sklearn.preprocessing import LabelEncoder

# Equivalent ways of selecting the same columns:
#   cols = train.select_dtypes(include='O').columns
#   cols = ['workclass', 'education', 'marital.status', 'occupation',
#           'relationship', 'race', 'sex', 'native.country']
cols = train.columns[train.dtypes == 'object']

for col in cols:
    le = LabelEncoder()
    # Fit on the labels of BOTH frames. Fitting on train alone makes
    # le.transform(test[col]) raise ValueError as soon as test contains a
    # label that never occurs in train. When every test label already occurs
    # in train, the fitted classes (and therefore the codes) are the same as
    # with a train-only fit.
    le.fit(pd.concat([train[col], test[col]]))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

### 스케일링

In [9]:
# Numeric columns that the scaling examples operate on.
cols = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]


def get_data():
    """Return fresh copies of the module-level ``train`` and ``test`` frames.

    Working on copies lets a scaling experiment modify its data without
    touching the original frames.

    Returns:
        A ``(train_copy, test_copy)`` tuple of DataFrame copies.
    """
    return train.copy(), test.copy()

In [10]:
# Min-max scaling: rescale each column so that its minimum in the training
# data maps to 0 and its maximum maps to 1.
from sklearn.preprocessing import MinMaxScaler

train_copy, test_copy = get_data()
scaler = MinMaxScaler()
display(train_copy[cols].head(2))  # values before scaling
# Learn the per-column range from train only, then apply it to both frames.
scaler.fit(train_copy[cols])
train_copy[cols] = scaler.transform(train_copy[cols])
test_copy[cols] = scaler.transform(test_copy[cols])
display(train_copy[cols].head(2))  # values after scaling

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,34.0,177331,10,4386,0,40.0
1,58.0,290661,9,0,0,40.0


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,0.232877,0.112092,0.6,0.04386,0.0,0.397959
1,0.561644,0.18906,0.533333,0.0,0.0,0.397959


In [11]:
# Standard scaling: transform each column to mean 0 and standard deviation 1
# (statistics taken from the training data).
from sklearn.preprocessing import StandardScaler

train_copy, test_copy = get_data()
scaler = StandardScaler()
display(train_copy[cols].head(2))  # values before scaling
# Learn the per-column statistics from train only, then apply them to both
# frames.
scaler.fit(train_copy[cols])
train_copy[cols] = scaler.transform(train_copy[cols])
test_copy[cols] = scaler.transform(test_copy[cols])
display(train_copy[cols].head(2))  # values after scaling

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,34.0,177331,10,4386,0,40.0
1,58.0,290661,9,0,0,40.0


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,-0.335121,-0.117705,-0.031462,0.440247,-0.216056,-0.035121
1,1.42859,0.956277,-0.42043,-0.146298,-0.216056,-0.035121


In [12]:
# Robust scaling: subtract each column's median and divide by the difference
# between its first and third quartiles.
# Unlike the two experiments above, this one is applied to train/test
# themselves rather than to copies.
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
display(train[cols].head(2))  # values before scaling
# Statistics are learned from train only and reused for test.
scaled_train = scaler.fit_transform(train[cols])
scaled_test = scaler.transform(test[cols])
train[cols] = scaled_train
test[cols] = scaled_test
display(train[cols].head(2))  # values after scaling

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,34.0,177331,10,4386,0,40.0
1,58.0,290661,9,0,0,40.0


Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,-0.15,-0.008711,0.0,4386.0,0.0,0.0
1,1.05,0.941438,-0.333333,0.0,0.0,0.0


## 검증 데이터 나누기

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation. random_state is fixed so
# the split is reproducible. Note that y_train is rebound to the training
# part of the target.
X_train, X_val, y_train, y_val = train_test_split(
    train,
    y_train,
    test_size=0.2,
    random_state=0,
)
# Show the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))


((23440, 15), (5861, 15), (23440,), (5861,))

<b>실수 1: y_train의 income(target)까지 함께 원-핫 인코딩되었다</b>

## 머신러닝 학습 및 평가

### RandomForest

In [14]:
# predict_proba: probability of belonging to each label; 2-D output; used for
#   ROC AUC.
# predict: the predicted label itself (<=50K or >50K); 1-D output; used for
#   accuracy / F1 score / precision / recall.
# The column order of predict_proba is given by rf.classes_.
# Fixing random_state to 0 makes the model return the same result each run.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
pred = rf.predict_proba(X_val)

# Column 0 is the probability of '<=50K', column 1 the probability of '>50K'.
print(rf.classes_)
pred[:10]

['<=50K' '>50K']


array([[1.  , 0.  ],
       [1.  , 0.  ],
       [0.9 , 0.1 ],
       [0.63, 0.37],
       [1.  , 0.  ],
       [0.99, 0.01],
       [0.98, 0.02],
       [0.94, 0.06],
       [0.12, 0.88],
       [0.88, 0.12]])

### 머신러닝 평가 지표(분류모델)

In [15]:
# ROC AUC on the validation set.
from sklearn.metrics import roc_auc_score

# `pred` is expected to still hold predict_proba output at this point; its
# second column is used as the score for the '>50K' class.
positive_proba = pred[:, 1]
roc_auc = roc_auc_score(y_val, positive_proba)
print('roc_auc:', roc_auc)

roc_auc: 0.9173623004487484


In [16]:
# Accuracy: the closer to 1, the better.
from sklearn.metrics import accuracy_score

# Accuracy needs predicted labels, so `pred` is replaced by predict() output.
pred = rf.predict(X_val)
pred[:10]

accuracy = accuracy_score(y_true=y_val, y_pred=pred)
print('accuracy_score:', accuracy)

accuracy_score: 0.8694761986009213


In [17]:
# F1 score: the larger, the better.
from sklearn.metrics import f1_score

# The labels are strings, so the positive class is named explicitly.
f1 = f1_score(y_true=y_val, y_pred=pred, pos_label='>50K')
print('f1_score:', f1)

f1_score: 0.6926476496584973


### Light GBM

In [18]:
import lightgbm as lgb

# Train a LightGBM classifier on the same split and report the same three
# validation metrics that were computed for the random forest.
lgbmc = lgb.LGBMClassifier(random_state=0, verbose=1)
lgbmc.fit(X_train, y_train)

# ROC AUC is computed from the probability in the second predict_proba column.
pred = lgbmc.predict_proba(X_val)
positive_proba = pred[:, 1]
roc_auc = roc_auc_score(y_val, positive_proba)
print('roc_auc:', roc_auc)

# Accuracy and F1 are computed from predicted labels.
pred = lgbmc.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=pred)
print('accuracy_score:', accuracy)

f1 = f1_score(y_true=y_val, y_pred=pred, pos_label='>50K')
print('f1_score:', f1)

[LightGBM] [Info] Number of positive: 5670, number of negative: 17770
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 941
[LightGBM] [Info] Number of data points in the train set: 23440, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.241894 -> initscore=-1.142323
[LightGBM] [Info] Start training from score -1.142323
roc_auc: 0.9279535666686397
accuracy_score: 0.8771540692714553
f1_score: 0.7158642462509865


## 예측 및 결과 파일 생성

In [24]:
# Predict class probabilities for the test set with the LightGBM model.
pred = lgbmc.predict_proba(test)
pred

# Write the second probability column as a single 'pred' column, without the
# index, to the result file.
positive_proba = pred[:, 1]
submit = pd.DataFrame(data={'pred': positive_proba})
submit.to_csv('result_ex2.csv', index=False)