### 기초 - Baseline

In [26]:
# 1. Problem definition
#    - Metric: ROC-AUC
#    - Label column: target
#    - Deliverable: result.csv with a single column `pred` holding the
#      predicted probability of class 1

# 2. Load libraries and data
import pandas as pd

train = pd.read_csv("hr_train.csv")
test = pd.read_csv("hr_test.csv")
# Alternative: read the files from GitHub instead of from the working directory.
# train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_train.csv")
# test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_test.csv")

# 3. Exploratory data analysis (EDA)
# Column dtypes and non-null counts.
print("===== 데이터 정보(자료형) =====")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train.info()

# Missing values per column in the training data.
print("\n ===== train 결측치 수 =====")
print(train.isnull().sum())

# Missing values per column in the test data.
print("\n ===== test 결측치 수 =====")
print(test.isnull().sum())

# Number of distinct values per column, for train and then test.
print("\n ===== train/test 카테고리별 수 =====")
print(train.nunique())
print(test.nunique())

# Class balance of the label.
print("\n ===== target 빈도 =====")
print(train['target'].value_counts())

===== 데이터 정보(자료형) =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11750 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15012 non-null  object 
 6   education_level         14961 non-null  object 
 7   major_discipline        13045 non-null  object 
 8   experience              15272 non-null  object 
 9   company_size            10539 non-null  object 
 10  company_type            10383 non-null  object 
 11  last_new_job            14984 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2),

In [34]:
# Imports used by this cell, grouped up front so its dependencies are visible.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# 4. Preprocessing
# Split the label off the features BEFORE encoding. Without this step `target`
# is undefined on a fresh kernel (NameError at the split below), and if a stale
# `target` happens to exist the label column would stay in the feature matrix
# and leak into the model. The membership guard keeps the cell re-runnable:
# on a second run `train` has already lost the column and the previously
# popped `target` is reused.
if 'target' in train.columns:
    target = train.pop('target')

# Missing values: replace every NaN with the placeholder "X" so that, for the
# categorical columns, "missing" becomes a category of its own.
train = train.fillna("X")
test = test.fillna("X")

# One-hot encode train and test together so both receive the same dummy
# columns, then split the rows back apart by position.
combined = pd.concat([train, test])
combined_dummies = pd.get_dummies(combined)
n_train = len(train)
# The concatenated index contains duplicates, so split explicitly by position.
train = combined_dummies.iloc[:n_train]
test = combined_dummies.iloc[n_train:]
# info() prints its own report and returns None; no print() wrapper needed.
train.info()

# 5. Hold out 20% of the training rows for validation
X_train, X_test, y_train, y_test = train_test_split(
    train, target, test_size=0.2, random_state=0
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# 6. Fit the model and evaluate on the hold-out rows
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
pred = rf.predict_proba(X_test)

# Column 1 of predict_proba is the probability of the second entry of
# rf.classes_ (the positive class, assuming the label takes the values 0/1).
roc_auc = roc_auc_score(y_test, pred[:, 1])
print('\n roc_auc:', roc_auc)

# 7. Predict on the test set and write the submission file
pred = rf.predict_proba(test)
submit = pd.DataFrame({'pred': pred[:, 1]})
submit.to_csv("result.csv", index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 15326 entries, 0 to 15325
Columns: 195 entries, enrollee_id to last_new_job_never
dtypes: bool(192), float64(1), int64(2)
memory usage: 3.3 MB
None
(12260, 195) (3066, 195) (12260,) (3066,)

 roc_auc: 0.7730742036233207


### 성능 개선

In [45]:
# 2. Load libraries and data
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("hr_train.csv")
test = pd.read_csv("hr_test.csv")
# Alternative: read the files from GitHub instead of from the working directory.
# train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_train.csv")
# test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part2/ch6/hr_test.csv")

# 4. Preprocessing
# Separate the label from the features before any encoding.
target = train.pop('target')

# Missing values: replace every NaN with the placeholder "X".
train = train.fillna("X")
test = test.fillna("X")

# Label encoding: stack train and test so each encoder sees every category
# that occurs in either set, then encode each object-typed column in place.
combined = pd.concat([train, test])
cols = train.select_dtypes(include='object').columns

for col in cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])

# Split the rows back apart by position (the concatenated index has duplicates).
n_train = len(train)
train = combined.iloc[:n_train]
test = combined.iloc[n_train:]

# Drop the id column (the original note says performance drops, presumably
# when the id is kept as a feature).
train = train.drop('enrollee_id', axis=1)
test = test.drop('enrollee_id', axis=1)
# info() prints its own report and returns None, so call it directly rather
# than through print(), which would emit an extra "None" line.
train.info()
test.info()

# Scaling (left disabled)
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# n_cols = train.select_dtypes(exclude='object').columns
# train = scaler.fit_transform(train)
# test = scaler.transform(test)

# 5. Hold out 20% of the training rows for validation
X_tr, X_val, y_tr, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# 6. Fit the model and evaluate on the hold-out rows
rf = RandomForestClassifier(max_depth=7, n_estimators=200, random_state=0)
rf.fit(X_tr, y_tr)
pred = rf.predict_proba(X_val)

# Column 1 of predict_proba is the probability of the second entry of
# rf.classes_ (the positive class, assuming the label takes the values 0/1).
roc_auc = roc_auc_score(y_val, pred[:, 1])
print('\n roc_auc:', roc_auc)

# 7. Predict on the test set and write the submission file
pred = rf.predict_proba(test)
submit = pd.DataFrame({'pred': pred[:, 1]})
submit.to_csv("result.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 15326 entries, 0 to 15325
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city                    15326 non-null  int32  
 1   city_development_index  15326 non-null  float64
 2   gender                  15326 non-null  int32  
 3   relevent_experience     15326 non-null  int32  
 4   enrolled_university     15326 non-null  int32  
 5   education_level         15326 non-null  int32  
 6   major_discipline        15326 non-null  int32  
 7   experience              15326 non-null  int32  
 8   company_size            15326 non-null  int32  
 9   company_type            15326 non-null  int32  
 10  last_new_job            15326 non-null  int32  
 11  training_hours          15326 non-null  int64  
dtypes: float64(1), int32(10), int64(1)
memory usage: 957.9 KB
None
<class 'pandas.core.frame.DataFrame'>
Index: 3832 entries, 0 to 3831
Data columns (tot