# 문제 정의
- 새로운 일자리를 찾을지 예측하시오.
 - 제공된 데이터 목록: hr_train.csv, hr_test.csv
 - 예측할 칼럼: target(0: 새 일자리를 찾지 않음, 1: 새 일자리를 찾음)
- 학습용 데이터(train)를 이용해 새 일자리를 찾을지 예측하는 모델을 만든 후 이를 평가용 데이터(test)에 적용해 얻은 예측값을 다음과 같은 형식의 CSV 파일로 생성하시오.

제출 파일은 다음 1개의 칼럼을 포함해야 한다.
 - pred: 예측값(이직할 확률)
 - 제출 파일명: 'result.csv'

In [34]:
# Load the data: open Colab's file-upload dialog so the CSV files can be uploaded into the runtime
# NOTE(review): the recorded output of this cell shows the files being saved as
# 'hr_train (1).csv' / 'hr_test (1).csv', while the next cell reads 'hr_train.csv' /
# 'hr_test.csv' — confirm that the copies being read are the intended ones.
from google.colab import files
uploads = files.upload()

Saving hr_train.csv to hr_train (1).csv
Saving hr_test.csv to hr_test (1).csv


In [35]:
# Load the training and evaluation tables into DataFrames
import pandas as pd

# Both files are read the same way, so unpack them from one generator
train, test = (pd.read_csv(csv_name) for csv_name in ('hr_train.csv', 'hr_test.csv'))

In [36]:
# Exploratory data analysis

# Inspect the data

# Row/column counts of both frames
print(' ===== 데이터 크기=====')
print("train:", train.shape, "test:", test.shape)

# Column dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\n===== 데이터 정보(object) =====")
train.info()

# Summary (count / unique / top / freq) of the object-dtype columns in train
print('\n ===== object 정보(train) =====')
print(train.describe(include='O'))

# Same summary for test, to compare category counts against train
print('\n ===== object 정보(test) =====')
print(test.describe(include='O'))

# Missing values per column in train
print('\n ===== 결측치 개수 (train) =====')
print(train.isnull().sum())

# Missing values per column in test
print('\n ===== 결측치 개수 (test) =====')
print(test.isnull().sum())

# Number of distinct values per column in train
print('\n ===== train카테고리별 수 =====')
print(train.nunique())

# Number of distinct values per column in test
print('\n ===== test카테고리별 수 =====')
print(test.nunique())

# Class balance of the label
print('\n ===== target 빈도 =====')
print(train['target'].value_counts())

# Descriptive statistics of the numeric columns in train
print('\n ===== 기술통계량(train) =====')
print(train.describe())

 ===== 데이터 크기=====
train: (15326, 14) test: (3832, 13)

===== 데이터 정보(object) =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11750 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15012 non-null  object 
 6   education_level         14961 non-null  object 
 7   major_discipline        13045 non-null  object 
 8   experience              15272 non-null  object 
 9   company_size            10539 non-null  object 
 10  company_type            10383 non-null  object 
 11  last_new_job            14984 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target  

In [37]:
# Preprocessing
# Missing entries are not dropped or imputed; they are replaced with a dedicated
# placeholder value so that "missing" becomes its own category.

# Apply the same placeholder fill to both frames
train, test = (frame.fillna('Missing_Value') for frame in (train, test))

# Display the remaining null counts of both frames (all zeros expected after the fill)
tuple(frame.isnull().sum() for frame in (train, test))

(enrollee_id               0
 city                      0
 city_development_index    0
 gender                    0
 relevent_experience       0
 enrolled_university       0
 education_level           0
 major_discipline          0
 experience                0
 company_size              0
 company_type              0
 last_new_job              0
 training_hours            0
 target                    0
 dtype: int64,
 enrollee_id               0
 city                      0
 city_development_index    0
 gender                    0
 relevent_experience       0
 enrolled_university       0
 education_level           0
 major_discipline          0
 experience                0
 company_size              0
 company_type              0
 last_new_job              0
 training_hours            0
 dtype: int64)

In [38]:
# Label encoding
# First extract the label: pop() removes the 'target' column from train in place
# and returns it, so train holds only feature columns from here on
target = train.pop('target')

In [39]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column of train and apply the same mapping to test.
cols = train.select_dtypes(include='O').columns
for col in cols:
  # A fresh encoder per column, fitted on the values of BOTH frames: an encoder fitted
  # on train alone makes transform(test) raise ValueError as soon as test contains a
  # category that never occurs in train. When test has no such category, the learned
  # classes (and therefore the integer codes) are the same as fitting on train only.
  le = LabelEncoder()
  le.fit(pd.concat([train[col], test[col]]))
  train[col] = le.transform(train[col])
  test[col] = le.transform(test[col])

# Preview the encoded training frame
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,30266,113,0.698,1,1,0,1,3,0,1,6,6,15
1,13254,48,0.91,1,0,3,0,6,6,4,6,0,4
2,31675,64,0.624,1,1,0,1,3,13,8,2,6,12
3,30804,6,0.924,1,1,3,0,0,21,8,2,1,258
4,18269,37,0.74,1,0,3,0,6,3,4,6,0,3


# 머신러닝 학습

In [40]:
# Hold out a validation set from the training data
from sklearn.model_selection import train_test_split

# 20% of the rows go to validation; the fixed seed keeps the partition reproducible
split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_val, y_train, y_val = train_test_split(train, target, **split_options)

dfs = [X_train, X_val, y_train, y_val]

# Print the shape of each of the four pieces, one per line
print(*(part.shape for part in dfs), sep='\n')

(12260, 13)
(3066, 13)
(12260,)
(3066,)


In [41]:
# Baseline model: random forest with default hyperparameters and a fixed seed
from sklearn.ensemble import RandomForestClassifier

# fit() returns the estimator itself, so construction and training are chained
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Class probabilities for the validation rows (one column per class)
pred = rf.predict_proba(X_val)

In [42]:
# Evaluate the baseline with ROC-AUC on the validation split
from sklearn.metrics import roc_auc_score

# The second probability column is the score for the positive class (target = 1)
positive_scores = pred[:, 1]
roc_auc = roc_auc_score(y_true=y_val, y_score=positive_scores)
roc_auc

np.float64(0.7750648526650465)

# 성능 개선

In [43]:
# Hyperparameter tuning: exhaustive search over max_depth x n_estimators,
# scored by ROC-AUC on the validation split

depths = [3,5,7]
n_estimators = [200,300,400,500]
best_auc, best_depth, best_n_estimator = 0, None, None

for depth in depths:
    for n_estimator in n_estimators:
        # Train one forest per combination (fit() returns the estimator)
        rf = RandomForestClassifier(
            random_state=0,
            max_depth=depth,
            n_estimators=n_estimator,
        ).fit(X_train, y_train)
        pred = rf.predict_proba(X_val)
        roc_auc = roc_auc_score(y_val, pred[:, 1])
        print('roc_auc of depth', depth, '& n_estimator', n_estimator,'is',roc_auc)

        # Only a strictly better score replaces the current best, so on ties
        # the earliest combination is kept
        if not roc_auc > best_auc:
            continue
        best_auc, best_depth, best_n_estimator = roc_auc, depth, n_estimator

print('best combination of depth & n_estimator is', best_depth, '&', best_n_estimator)

roc_auc of depth 3 & n_estimator 200 is 0.7733080361114624
roc_auc of depth 3 & n_estimator 300 is 0.7737651654562396
roc_auc of depth 3 & n_estimator 400 is 0.7736252756823577
roc_auc of depth 3 & n_estimator 500 is 0.7739466124432831
roc_auc of depth 5 & n_estimator 200 is 0.7798998295568947
roc_auc of depth 5 & n_estimator 300 is 0.7791734562958592
roc_auc of depth 5 & n_estimator 400 is 0.7788805072087133
roc_auc of depth 5 & n_estimator 500 is 0.7790057641610594
roc_auc of depth 7 & n_estimator 200 is 0.7815755802791708
roc_auc of depth 7 & n_estimator 300 is 0.7816449398532502
roc_auc of depth 7 & n_estimator 400 is 0.7817760499342108
roc_auc of depth 7 & n_estimator 500 is 0.7818778943720999
best combination of depth & n_estimator is 7 & 500


In [44]:
# Model selection: rebuild the forest with the best depth / tree count found
# by the search and fit it on the training split
final_params = {
    'random_state': 0,
    'max_depth': best_depth,
    'n_estimators': best_n_estimator,
}
rf = RandomForestClassifier(**final_params).fit(X_train, y_train)


# Predict on test and write the positive-class probability as the single
# 'pred' column of the submission file (no index column)
pred = rf.predict_proba(test)
submit = pd.DataFrame(data={'pred': pred[:, 1]})
submit.to_csv('result.csv', index=False)

In [45]:
# Sanity check: read the submission file back from disk to inspect what was written
pd.read_csv('result.csv')

Unnamed: 0,pred
0,0.197672
1,0.532090
2,0.596633
3,0.085661
4,0.108453
...,...
3827,0.102392
3828,0.389250
3829,0.087899
3830,0.187471


# lightgbm 확인

In [46]:
import lightgbm as lgb

# LightGBM classifier with a fixed seed, scored on the same validation split as
# the random forest (verbose=-1 is kept as-is; presumably it quiets LightGBM's logging)
lgbmc = lgb.LGBMClassifier(random_state=0, verbose=-1).fit(X_train, y_train)
pred = lgbmc.predict_proba(X_val)

# ROC-AUC from the positive-class probability column
roc_auc = roc_auc_score(y_true=y_val, y_score=pred[:, 1])
roc_auc

np.float64(0.7864269459311392)

In [48]:
# Choose LightGBM as the final model: score the test set with it and overwrite
# result.csv with its positive-class probabilities
pred = lgbmc.predict_proba(test)
submission_columns = {'pred': pred[:, 1]}
submit = pd.DataFrame(data=submission_columns)
submit.to_csv('result.csv', index=False)

In [49]:
# Read the submission file back from disk to check its contents after the LightGBM overwrite
pd.read_csv('result.csv')

Unnamed: 0,pred
0,0.251764
1,0.604601
2,0.621962
3,0.059376
4,0.109373
...,...
3827,0.275885
3828,0.395954
3829,0.130392
3830,0.079468


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[152]	valid_0's auc: 0.785262	valid_0's binary_logloss: 0.443481
depth=3, leaves=15, lr=0.05 -> AUC: 0.7853
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[67]	valid_0's auc: 0.785042	valid_0's binary_logloss: 0.443441
depth=3, leaves=15, lr=0.1 -> AUC: 0.7850
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[152]	valid_0's auc: 0.785262	valid_0's binary_logloss: 0.443481
depth=3, leaves=31, lr=0.05 -> AUC: 0.7853
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[67]	valid_0's auc: 0.785042	valid_0's binary_logloss: 0.443441
depth=3, leaves=31, lr=0.1 -> AUC: 0.7850
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[152]	valid_0's auc: 0.785262	valid_0's binary_logloss: 0.443481
depth=3, leaves=63, lr=0.05