# pip & import

In [1]:
# %pip installs into the environment of the running kernel; !pip may target a different interpreter.
%pip install category_encoders



In [2]:
# %pip installs into the environment of the running kernel; !pip may target a different interpreter.
%pip install catboost



In [3]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
import easydict
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# 이진분류

In [4]:
# Raw application log; the preview below shows the columns resume_seq and recruitment_seq.
apply_train_df = pd.read_csv('apply_train.csv')

In [5]:
apply_train_df.head()

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112


In [6]:
# Distinct resume IDs and distinct recruitment IDs that occur in the application log.
resume_pool, recruitment_pool = (
    set(apply_train_df[id_col].unique()) for id_col in ('resume_seq', 'recruitment_seq')
)

len(resume_pool), len(recruitment_pool)

(8482, 6695)

In [7]:
# Postings each resume applied to, collected into one set per resume_seq.
_applied_sets = apply_train_df.groupby('resume_seq')['recruitment_seq'].apply(set)
df_add_negative = _applied_sets.rename('interacted_iid').reset_index()
df_add_negative

Unnamed: 0,resume_seq,interacted_iid
0,U00001,"{R06065, R05210, R05288, R04536}"
1,U00002,"{R01960, R04588, R01730, R01103, R02346, R0547..."
2,U00003,"{R03301, R01460, R04918}"
3,U00004,"{R05367, R02355, R00564, R01511, R00004, R0289..."
4,U00005,"{R00832, R03914, R00374}"
...,...,...
8477,U08478,"{R03939, R02170}"
8478,U08479,"{R03837, R04036, R01470, R02787, R04196, R02988}"
8479,U08480,"{R00803, R03152}"
8480,U08481,"{R06253, R05500, R00225}"


In [8]:
# Postings not yet applied to: the full recruitment pool minus the resume's applied set.
df_add_negative['negative_iid'] = df_add_negative['interacted_iid'].map(recruitment_pool.difference)
df_add_negative

Unnamed: 0,resume_seq,interacted_iid,negative_iid
0,U00001,"{R06065, R05210, R05288, R04536}","{R01218, R01364, R05742, R00782, R00765, R0265..."
1,U00002,"{R01960, R04588, R01730, R01103, R02346, R0547...","{R01218, R01364, R05742, R00782, R00765, R0265..."
2,U00003,"{R03301, R01460, R04918}","{R01218, R01364, R05742, R00782, R00765, R0265..."
3,U00004,"{R05367, R02355, R00564, R01511, R00004, R0289...","{R01218, R01364, R05742, R00782, R00765, R0265..."
4,U00005,"{R00832, R03914, R00374}","{R01218, R01364, R05742, R00782, R00765, R0265..."
...,...,...,...
8477,U08478,"{R03939, R02170}","{R01218, R01364, R05742, R00782, R00765, R0265..."
8478,U08479,"{R03837, R04036, R01470, R02787, R04196, R02988}","{R01218, R01364, R05742, R00782, R00765, R0265..."
8479,U08480,"{R00803, R03152}","{R01218, R01364, R05742, R00782, R00765, R0265..."
8480,U08481,"{R06253, R05500, R00225}","{R01218, R01364, R05742, R00782, R00765, R0265..."


In [9]:
# Number of postings each resume applied to (size of its applied set).
df_add_negative['interacted_iid_cnt'] = df_add_negative['interacted_iid'].map(len)
df_add_negative

Unnamed: 0,resume_seq,interacted_iid,negative_iid,interacted_iid_cnt
0,U00001,"{R06065, R05210, R05288, R04536}","{R01218, R01364, R05742, R00782, R00765, R0265...",4
1,U00002,"{R01960, R04588, R01730, R01103, R02346, R0547...","{R01218, R01364, R05742, R00782, R00765, R0265...",8
2,U00003,"{R03301, R01460, R04918}","{R01218, R01364, R05742, R00782, R00765, R0265...",3
3,U00004,"{R05367, R02355, R00564, R01511, R00004, R0289...","{R01218, R01364, R05742, R00782, R00765, R0265...",17
4,U00005,"{R00832, R03914, R00374}","{R01218, R01364, R05742, R00782, R00765, R0265...",3
...,...,...,...,...
8477,U08478,"{R03939, R02170}","{R01218, R01364, R05742, R00782, R00765, R0265...",2
8478,U08479,"{R03837, R04036, R01470, R02787, R04196, R02988}","{R01218, R01364, R05742, R00782, R00765, R0265...",6
8479,U08480,"{R00803, R03152}","{R01218, R01364, R05742, R00782, R00765, R0265...",2
8480,U08481,"{R06253, R05500, R00225}","{R01218, R01364, R05742, R00782, R00765, R0265...",3


In [10]:
# Negative sampling: for every resume, draw as many not-applied postings as it has
# applied postings, so positives and negatives end up balanced 1:1.
# Reproducibility: a dedicated seeded generator is used, and the candidate set is sorted
# first because the iteration order of a set of strings changes between interpreter runs.
_neg_rng = random.Random(42)
df_add_negative['negative_sampling'] = df_add_negative.apply(
    lambda row: _neg_rng.sample(
        sorted(row['negative_iid']),
        # never ask for more negatives than exist; a shortfall is flagged by the check cell below
        min(row['interacted_iid_cnt'], len(row['negative_iid'])),
    ),
    axis=1,
)
df_add_negative

Unnamed: 0,resume_seq,interacted_iid,negative_iid,interacted_iid_cnt,negative_sampling
0,U00001,"{R06065, R05210, R05288, R04536}","{R01218, R01364, R05742, R00782, R00765, R0265...",4,"[R03912, R04406, R04340, R01673]"
1,U00002,"{R01960, R04588, R01730, R01103, R02346, R0547...","{R01218, R01364, R05742, R00782, R00765, R0265...",8,"[R00812, R06503, R01245, R03342, R04988, R0219..."
2,U00003,"{R03301, R01460, R04918}","{R01218, R01364, R05742, R00782, R00765, R0265...",3,"[R01929, R01733, R01665]"
3,U00004,"{R05367, R02355, R00564, R01511, R00004, R0289...","{R01218, R01364, R05742, R00782, R00765, R0265...",17,"[R02197, R01546, R05196, R04307, R00262, R0191..."
4,U00005,"{R00832, R03914, R00374}","{R01218, R01364, R05742, R00782, R00765, R0265...",3,"[R00168, R04821, R01525]"
...,...,...,...,...,...
8477,U08478,"{R03939, R02170}","{R01218, R01364, R05742, R00782, R00765, R0265...",2,"[R03477, R03615]"
8478,U08479,"{R03837, R04036, R01470, R02787, R04196, R02988}","{R01218, R01364, R05742, R00782, R00765, R0265...",6,"[R03939, R01179, R02482, R02692, R00848, R04218]"
8479,U08480,"{R00803, R03152}","{R01218, R01364, R05742, R00782, R00765, R0265...",2,"[R04731, R04899]"
8480,U08481,"{R06253, R05500, R00225}","{R01218, R01364, R05742, R00782, R00765, R0265...",3,"[R00439, R05334, R04539]"


In [11]:
# Verification: every resume must have exactly as many sampled negatives as applied postings.
df_add_negative['negative_sampling_cnt'] = df_add_negative['negative_sampling'].map(len)
df_add_negative['is_error'] = (
    df_add_negative['interacted_iid_cnt'] != df_add_negative['negative_sampling_cnt']
)
df_add_negative['is_error'].sum()


0

In [12]:
# Positive examples: one row per (resume, applied posting), labelled target = 1.
df_interacted = (
    df_add_negative[['resume_seq', 'interacted_iid']]
    .explode('interacted_iid')
    .rename(columns={'interacted_iid':'recruitment_seq'})
    .assign(target=1)
    .reset_index(drop=True)
)

In [13]:
# Negative examples: one row per (resume, sampled not-applied posting), labelled target = 0.
df_negatived = (
    df_add_negative[['resume_seq', 'negative_sampling']]
    .explode('negative_sampling')
    .rename(columns={'negative_sampling':'recruitment_seq'})
    .assign(target=0)
    .reset_index(drop=True)
)

In [14]:
# Postings a user actually applied to carry target = 1.
# From the postings a user did not apply to, as many as the user applied to are sampled; those carry target = 0.
# Stack the two frames into one.
df_concat = pd.concat([df_interacted, df_negatived], axis=0)

In [15]:
# sample(frac=1) returns every row in random order, i.e. it shuffles the frame.
# A fixed random_state keeps the shuffled order identical across kernel restarts.
df_shuffle = df_concat.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
df_shuffle['recruitment_seq'].describe()
# data check 2

count     115892
unique      6695
top       R03237
freq          84
Name: recruitment_seq, dtype: object

In [17]:
# 그래서.. 이게 무슨의미인데?
## 결국은, apply_train 데이터를 조작한 것이다.
## 기존 apply_train : 실제 유저가 지원한 이력들
## negative sampling을 거친 후 apply_train(df_shuffle) : 유저가 실제 지원한 공고(50%) + 유저가 지원하지 않은 공고 중 일부(50%)
## => !!!!!!! 이젠 Boosting 모델을 적용할 수 있게 된다 !!!!!!!!!

In [18]:
# Rebind apply_train_df to the balanced, shuffled frame (positives + sampled negatives).
# NOTE(review): this replaces the raw application log that was loaded under the same name.
apply_train_df = df_shuffle

In [19]:
### df_shuffle의 resume_seq 기준으로 유의미하다고 생각되는 resume feature를 붙인다.
### df_shuffle의 recruitment_seq 기준으로 유의미하다고 생각되는 recruitment feature를 붙인다.
### 그러면 데이터 형태가 resume_seq | resume feature들... | recruitment_seq | recruitment feature들 ... | target 형태로 될 것이다.
### 여기서 id 성격인 seq 컬럼들을 드랍한다.
#### 그러면 resume feature들 .... | recruitment feature들 .... | target 이 된다.
#### 이렇게 되면 우리가 많이 봐왔던 타이타닉 이진분류 문제처럼 된다.
##### 따라서 향후 필요한 작업은 다음과 같다.
##### 1. resume feature들 / recruitment feature들을 유의미한 것들을 골라낸다
##### 2. 가장 좋은 Boost 모델을 찾는다
##### 3. 해당 Boost 모델의 가장 성능 좋은 하이퍼패러미터 값을 찾는다.

# 시작

In [None]:
# Load the remaining CSV tables (contents inferred from the file names only — TODO confirm schemas).
resume_train_df = pd.read_csv('resume.csv')
certificate_df = pd.read_csv('resume_certificate.csv')
education_train_df = pd.read_csv('resume_education.csv')
language_train_df = pd.read_csv('resume_language.csv')
company_train_df = pd.read_csv('company.csv')
recruitment_train_df = pd.read_csv('recruitment.csv')

In [21]:
apply_train_df

Unnamed: 0,resume_seq,recruitment_seq,target
0,U00038,R06337,1
1,U03392,R01416,0
2,U03303,R01903,1
3,U08346,R02626,0
4,U02335,R01500,0
...,...,...,...
115887,U00038,R05282,1
115888,U03622,R01436,0
115889,U00808,R05127,0
115890,U06610,R05677,1


In [20]:
# 여기서 feature 합치쳐서 company 말고 resume만으로 붙여넣기

# 함수 정의 & 글로벌 변수

In [None]:
def recall5(answer_df, submission_df):
    """Compute the mean Recall@5 of ``submission_df`` against ``answer_df``.

    The first column of ``answer_df`` is used as the primary key and the second
    column as the predicted item; ``submission_df`` must contain columns with the
    same two names.

    Parameters
    ----------
    answer_df : pandas.DataFrame
        Ground-truth (primary, secondary) pairs.
    submission_df : pandas.DataFrame
        Predicted (primary, secondary) pairs, exactly five rows per primary key.

    Returns
    -------
    float
        Mean, over the primary keys present in both frames, of
        ``hits / min(number of true items, 5)``. ``0.0`` when the two frames
        share no primary key (instead of NaN with a RuntimeWarning).

    Raises
    ------
    ValueError
        If a primary key does not have exactly 5 predictions, if a predicted
        value is null, or if a primary key has duplicated predictions.
    """
    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # every primary key in the submission must have exactly 5 predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if not (prediction_counts == 5).all():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # predictions must not contain nulls
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # predictions must be unique within each primary key
    # (plain iteration over the groups avoids the deprecated DataFrameGroupBy.apply pattern)
    has_duplicates = any(
        group[secondary_col].duplicated().any()
        for _, group in submission_df.groupby(primary_col)
    )
    if has_duplicates:
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # keep only the predictions whose primary key also appears in the answers
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # first 5 predicted items per primary key
    top_5_preds = {
        key: group[secondary_col].head(5).tolist()
        for key, group in submission_df.groupby(primary_col)
    }

    # true items per primary key
    true_dict = {
        key: group[secondary_col].tolist()
        for key, group in answer_df.groupby(primary_col)
    }

    individual_recalls = []
    for key, true_items in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_items) & set(top_5_preds[key]))
            # the denominator is capped at 5 so keys with more than 5 true items can still reach 1.0
            individual_recalls.append(correct_matches / min(len(true_items), 5))

    # no shared primary key: np.mean([]) would return NaN and emit a RuntimeWarning
    if not individual_recalls:
        return 0.0

    return np.mean(individual_recalls)

In [None]:
def minimum_condition(resume, company):
  """Return False when resume['degree'] is below company['education'], True otherwise."""
  # Negating the "<" test (rather than writing ">=") keeps the outcome for
  # values that compare as neither smaller nor larger, such as NaN.
  return not (resume['degree'] < company['education'])

# null 지우기

In [None]:
# NOTE(review): `find_s_survived` and `ori_test` are not defined in the cells visible here;
# this section looks carried over from another notebook — confirm the source frames.
mean_age = find_s_survived['age'].mean()

# Impute missing ages in both frames with the mean computed on the training frame.
# The result is assigned back instead of calling fillna(inplace=True) on a column selection:
# that chained in-place form is deprecated and does not modify the frame under Copy-on-Write.
find_s_survived['age'] = find_s_survived['age'].fillna(mean_age)
ori_test['age'] = ori_test['age'].fillna(mean_age)

In [None]:
# Most frequent `embarked` value of the training frame.
embarked_mode = find_s_survived['embarked'].mode().values[0]

# Impute missing values in both frames with the training mode; assign back rather than
# using the deprecated chained fillna(inplace=True), which is a no-op under Copy-on-Write.
find_s_survived['embarked'] = find_s_survived['embarked'].fillna(embarked_mode)
ori_test['embarked'] = ori_test['embarked'].fillna(embarked_mode)

# 범주형 데이터 형변환

In [None]:
# NOTE(review): the `sub_parch` helper is not defined in the cells visible here — confirm its source.
# Derive `sub_parch` from `parch` via the helper and store it as a categorical column (train, then test).
find_s_survived['sub_parch'] = find_s_survived['parch'].map(sub_parch).astype("category")

ori_test['sub_parch'] = ori_test['parch'].map(sub_parch).astype("category")

In [None]:
# Column groups of the training frame: numeric columns (including the `survived` label) and categorical columns.
# NOTE(review): 'sub_sibsp' is not created in any cell visible here — confirm it exists.
no_category_cols = ['age', 'survived', 'fare']
category_cols = ['pclass', 'gender', 'sub_parch', 'sub_sibsp', 'embarked']

# .copy() yields independent frames, so later column assignments on them neither raise
# SettingWithCopyWarning nor depend on whether the selection was a view.
train_no_category = find_s_survived[no_category_cols].copy()
train_category = find_s_survived[category_cols].copy()

train_no_category.shape, train_category.shape

In [None]:
# Column groups of the test frame; there is no `survived` label here.
# NOTE(review): 'sub_sibsp' is not created in any cell visible here — confirm it exists.
no_category_cols = ['age', 'fare']
category_cols = ['pclass', 'gender', 'sub_parch', 'sub_sibsp', 'embarked']

# .copy() yields independent frames, so later column assignments on them neither raise
# SettingWithCopyWarning nor depend on whether the selection was a view.
test_no_category = ori_test[no_category_cols].copy()
test_category = ori_test[category_cols].copy()

test_no_category.shape, test_category.shape

In [None]:
# Encode gender as 1/2 and cast the categorical columns to the `category` dtype.
# .assign builds a new frame, so nothing is written onto a slice of the source frame.
# NOTE: run once — re-running would map the already-encoded gender values to NaN.
_gender_codes = {'male':1, 'female':2}

train_category = train_category.assign(
    gender=train_category['gender'].map(_gender_codes).astype('category'),
    pclass=train_category['pclass'].astype('category'),
    sub_parch=train_category['sub_parch'].astype('category'),
)

test_category = test_category.assign(
    gender=test_category['gender'].map(_gender_codes).astype('category'),
    pclass=test_category['pclass'].astype('category'),
    sub_parch=test_category['sub_parch'].astype('category'),
)

# 수치형 스케일링

In [None]:
# Columns to be scaled
scaling_cols = ['fare','age']

# Sub-frames of the numeric data that hold only the columns to be scaled
train_scaling = train_no_category[scaling_cols]
test_scaling = test_no_category[scaling_cols]

print(f'{train_scaling.shape} / {test_scaling.shape}')
train_scaling.head(3)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Fit the scaler on the training columns only, then apply the same transform to train and test.
# NOTE(review): X_train_scaled / X_test_scaled are not referenced by the later cells visible here,
# which concatenate the unscaled train_no_category / test_no_category — confirm intent.
std = StandardScaler().fit(train_scaling)
X_train_scaled, X_test_scaled = (
    std.transform(part) for part in (train_scaling, test_scaling)
)

In [None]:
X_train_scaled.shape, X_test_scaled.shape

In [None]:
train_scaling.shape, test_scaling.shape

# 수치형, 범주형 합치기

In [None]:
# One-hot encode each categorical column with an encoder fitted on the training column only,
# collect the encoded pieces, and lay them side by side once at the end.
_train_parts = [pd.DataFrame()]
_test_parts = [pd.DataFrame()]

for col in ['pclass', 'gender', 'sub_parch', 'embarked']:
  _encoder = ce.OneHotEncoder(use_cat_names=True)
  _encoder.fit(train_category[col])
  _train_parts.append(_encoder.transform(train_category[col]))

  _encoded = _encoder.transform(test_category[col])
  _test_parts.append(_encoded)

_train_encoded = pd.concat(_train_parts, axis=1)
_test_encoded = pd.concat(_test_parts, axis=1)

In [None]:
# Numeric training columns (still including the `survived` label) side by side with the one-hot columns.
train_encoded = pd.concat([train_no_category, _train_encoded], axis=1)
train_encoded.shape

In [None]:
# Numeric test columns side by side with the one-hot columns.
test_encoded = pd.concat([test_no_category, _test_encoded], axis=1)
test_encoded.shape

In [None]:
train_encoded.isnull().sum().sum(), test_encoded.isnull().sum().sum()

In [None]:
# Separate the encoded training frame into the feature matrix and the `survived` label.
train_features = train_encoded.drop('survived', axis=1)
train_target = train_encoded['survived']

train_features.shape, train_target.shape

# K-Fold

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn import datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Cross-validation splitter: 5 stratified folds, shuffled with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(random_state=42)

auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds

print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

# XGBoost

## model1

In [None]:
from xgboost import XGBClassifier, plot_importance

In [None]:
hp = {
    "random_state" : 42
}
model = XGBClassifier(**hp)
auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds

print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

## model2

In [None]:
# Base XGBoost estimator handed to the hyper-parameter search.
hp = {
    "random_state" : 42,
    "verbosity": 0, # XGBoost's logging parameter is `verbosity`; `verbose` is not an XGBClassifier argument
}
model = XGBClassifier(**hp)

In [None]:
n_iter=50 # number of hyper-parameter combinations to sample
scoring = 'roc_auc' # metric used to rank the combinations
hp={
    "max_depth" : np.linspace(5,12,8,dtype = int), # tree depth
    "n_estimators" : np.linspace(800,1200,5, dtype = int), # number of boosting rounds
    "learning_rate" : np.logspace(-3, -1, 3)
}

# build the search; random_state makes the sampled combinations repeatable
rs1=RandomizedSearchCV(model, hp, scoring=scoring, n_iter=n_iter, n_jobs=-1, cv=skf, verbose=0, random_state=42)
# Search on the full training set: train_x / train_y are only the last fold's training
# split left behind by the preceding CV loop, i.e. an arbitrary subset of the data.
rs1.fit(train_features, train_target)

# best mean cross-validated score
rs1.best_score_

In [None]:
# Best parameter combination found by the search. This is the same dict object as
# rs1.best_params_, so keys added to it later also appear on the search object.
best_params = rs1.best_params_
best_params

In [None]:
# One row per sampled combination: its mean CV score and the three searched parameters.
# Building the frame column by column and casting to float keeps every column numeric;
# stacking the arrays through np.transpose could produce an all-object frame when the
# param_* masked arrays have object dtype (this depends on the scikit-learn version).
rs_results_df = pd.DataFrame({
    'score': rs1.cv_results_['mean_test_score'],
    'learning_rate': rs1.cv_results_['param_learning_rate'].data,
    'max_depth': rs1.cv_results_['param_max_depth'].data,
    'n_estimators': rs1.cv_results_['param_n_estimators'].data,
}).astype(float)
rs_results_df.plot(subplots=True,figsize=(10, 10))

In [None]:
# Re-create the splitter: 5 stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
best_params['random_state'] = 42
# XGBoost's logging parameter is `verbosity`; `verbose` is not an XGBClassifier argument
best_params['verbosity'] = 0

# build the estimator from the best parameter combination
best_mode = XGBClassifier(**best_params)

In [None]:
# Evaluate the tuned estimator: `best_mode` was built from the search's best parameters,
# whereas `model` still held the un-tuned base estimator. Rebinding `model` keeps the
# later cells that reference `model` pointed at the tuned, fitted estimator.
model = best_mode
auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds
print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

In [None]:
# Plot the feature importance of the estimator currently bound to `model`.
plot_importance(model)
plt.show()

# Light GBM

## model1

In [None]:
from lightgbm import LGBMClassifier, plot_importance

In [None]:
hp = {
    "random_state" : 42
}
model = LGBMClassifier(**hp)
# Reset the list that the loop appends to. Previously only an unused `accuracy_lst`
# was reset, so the reported mean also contained the scores of the preceding models.
auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds
print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

## model2

In [None]:
# Base LightGBM estimator; `hp` is reassigned to the search space in the next cell.
hp = {
    "random_state" : 42,
    "verbose": -1, # log verbosity
}
model = LGBMClassifier(**hp)

In [None]:
n_iter=50 # number of hyper-parameter combinations to sample
scoring = 'roc_auc' # metric used to rank the combinations
hp={
    "max_depth" : np.linspace(5,12,8,dtype = int), # tree depth
    "n_estimators" : np.linspace(800,1200,5, dtype = int), # number of boosting rounds
    "learning_rate" : np.logspace(-3, -1, 3)
}

# build the search; random_state makes the sampled combinations repeatable
rs1=RandomizedSearchCV(model, hp, scoring=scoring, n_iter=n_iter, n_jobs=-1, cv=skf, verbose=0, random_state=42)
# Search on the full training set: train_x / train_y are only the last fold's training
# split left behind by the preceding CV loop, i.e. an arbitrary subset of the data.
rs1.fit(train_features, train_target)

# best mean cross-validated score
rs1.best_score_

In [None]:
# Best parameter combination found by the search. This is the same dict object as
# rs1.best_params_, so keys added to it later also appear on the search object.
best_params = rs1.best_params_
best_params

In [None]:
# One row per sampled combination: its mean CV score and the three searched parameters.
# Building the frame column by column and casting to float keeps every column numeric;
# stacking the arrays through np.transpose could produce an all-object frame when the
# param_* masked arrays have object dtype (this depends on the scikit-learn version).
rs_results_df = pd.DataFrame({
    'score': rs1.cv_results_['mean_test_score'],
    'learning_rate': rs1.cv_results_['param_learning_rate'].data,
    'max_depth': rs1.cv_results_['param_max_depth'].data,
    'n_estimators': rs1.cv_results_['param_n_estimators'].data,
}).astype(float)
rs_results_df.plot(subplots=True,figsize=(10, 10))

In [None]:
# Re-create the splitter: 5 stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
# Add the fixed seed and the quiet logging level to the best parameters,
# then build the LightGBM estimator from the combined settings.
best_params.update({'random_state': 42, 'verbose': -1})

best_mode = LGBMClassifier(**best_params)

In [None]:
# Evaluate the tuned estimator: `best_mode` was built from the search's best parameters,
# whereas `model` still held the un-tuned base estimator. Rebinding `model` keeps the
# later cells that reference `model` pointed at the tuned, fitted estimator.
model = best_mode
# Reset the list that the loop appends to. Previously only an unused `accuracy_lst`
# was reset, so the reported mean also contained the scores of the preceding models.
auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds
print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

In [None]:
# Plot the feature importance of the estimator currently bound to `model`.
plot_importance(model)
plt.show()

# Catboost

## model1

In [None]:
from catboost import CatBoostClassifier

In [None]:
hp = {
    "random_state" : 42,
    "verbose" : 0 # hide the per-iteration boosting log
}
model = CatBoostClassifier(**hp)
# Reset the list that the loop appends to. Previously only an unused `accuracy_lst`
# was reset, so the reported mean also contained the scores of the preceding models.
auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds
print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

## model2

In [None]:
# Base CatBoost estimator; `hp` is reassigned to the search space in the next cell.
hp = {
    "random_state" : 42,
    "verbose": 0, # log verbosity
}
model = CatBoostClassifier(**hp)

In [None]:
n_iter=20 # number of hyper-parameter combinations to sample
scoring = 'roc_auc' # metric used to rank the combinations
hp={
    "max_depth" : np.linspace(5,12,8,dtype = int), # tree depth
    "n_estimators" : np.linspace(10,500,5, dtype = int), # number of boosting rounds
    "learning_rate" : np.logspace(-3, -1, 3)
}

# build the search; random_state makes the sampled combinations repeatable
rs1=RandomizedSearchCV(model, hp, scoring=scoring, n_iter=n_iter, n_jobs=-1, cv=skf, verbose=0, random_state=42)
# Search on the full training set: train_x / train_y are only the last fold's training
# split left behind by the preceding CV loop, i.e. an arbitrary subset of the data.
rs1.fit(train_features, train_target)

# best mean cross-validated score
rs1.best_score_

In [None]:
# Best parameter combination found by the search. This is the same dict object as
# rs1.best_params_, so keys added to it later also appear on the search object.
best_params = rs1.best_params_
best_params

In [None]:
# One row per sampled combination: its mean CV score and the three searched parameters.
# Building the frame column by column and casting to float keeps every column numeric;
# stacking the arrays through np.transpose could produce an all-object frame when the
# param_* masked arrays have object dtype (this depends on the scikit-learn version).
rs_results_df = pd.DataFrame({
    'score': rs1.cv_results_['mean_test_score'],
    'learning_rate': rs1.cv_results_['param_learning_rate'].data,
    'max_depth': rs1.cv_results_['param_max_depth'].data,
    'n_estimators': rs1.cv_results_['param_n_estimators'].data,
}).astype(float)
rs_results_df.plot(subplots=True,figsize=(10, 10))

In [None]:
# Re-create the splitter: 5 stratified folds, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
best_params['random_state'] = 42
best_params['verbose'] = 0

# Build the tuned estimator. The parameters come from the CatBoost search, so the
# estimator must be a CatBoostClassifier (it was built as an LGBMClassifier before).
best_mode = CatBoostClassifier(**best_params)

In [None]:
# Evaluate the tuned CatBoost estimator rather than the un-tuned base `model`.
# It is rebuilt here from `best_params` so this cell does not depend on which class
# `best_mode` was created with; later cells keep using the name `model`.
model = CatBoostClassifier(**best_params)
# Reset the list that the loop appends to. Previously only an unused `accuracy_lst`
# was reset, so the reported mean also contained the scores of the preceding models.
auc_lst = []

# enumerate(start=1) supplies the 1-based fold number
for n_iter, (train_index, valid_index) in enumerate(skf.split(train_features, train_target), start=1):
  # build the training / validation split of this fold
  train_x, valid_x = train_features.iloc[train_index], train_features.iloc[valid_index]
  train_y, valid_y = train_target.iloc[train_index], train_target.iloc[valid_index]

  # fit, then take the probability column 1 on the held-out part
  model.fit(train_x, train_y)
  valid_pred_proba = model.predict_proba(valid_x)[:,1]

  # evaluate: area under the ROC curve
  fpr, tpr, thresholds = roc_curve(valid_y, valid_pred_proba)
  valid_auc = auc(fpr, tpr)
  auc_lst.append(valid_auc)
  # the value is an AUC, so it is labelled as such (the label previously said accuracy)
  print(f'{n_iter} 번째 Stratified K-Fold AUC: {valid_auc}')

# final summary over the folds
print('-'*50)
print(f'교차 검증 auc: {np.mean(auc_lst)}')

In [None]:
# Horizontal bar chart of the model's feature importances, sorted ascending from the bottom bar up.
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
bar_positions = range(len(sorted_idx))

fig = plt.figure(figsize=(12, 6))
plt.barh(bar_positions, feature_importance[sorted_idx], align='center')
# label every bar with the matching column name of the training frame
plt.yticks(bar_positions, np.array(train_x.columns)[sorted_idx])
plt.title('Feature Importance')

# F1-score

In [None]:
model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
# Predicted labels and class probabilities on `train_x`.
# NOTE(review): `train_x` / `train_y` are whatever the last executed CV loop left behind
# (its final training fold), so metrics computed from these predictions are in-sample — confirm intent.
pred = model.predict(train_x)
pred_proba = model.predict_proba(train_x)

In [None]:
# Accuracy of the predictions above; the value is stored in `score` and not displayed by this cell.
score = accuracy_score(train_y, pred)

In [None]:
f1_score(train_y, pred, average="micro")

In [None]:
f1_score(train_y, pred, average="macro")

In [None]:
f1_score(train_y, pred, average="weighted")

In [None]:
f1_score(train_y, pred)

# 학습 평가

In [None]:
pred = model.predict(train_features) # predicted labels
pred_proba = model.predict_proba(train_features)[:,1] # predicted probability (column 1 of predict_proba)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# roc_curve(actual labels, predicted probabilities)
fpr, tpr, thresholds = roc_curve(train_target, pred_proba)
print(f'auc: {auc(fpr, tpr)}')

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# confusion_matrix(actual labels, predicted labels)
# normalize="true" -> each row (actual class) is rescaled to proportions
conf_mx = confusion_matrix(train_target, pred, normalize="true")
conf_mx

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(7,5))

# annot=True -> print the numeric value in each cell
# cmap -> colour map of the heat map
# linewidth -> width of the lines between cells
sns.heatmap(conf_mx, annot=True, cmap="coolwarm", linewidth=0.5)

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# iloc and loc are the accessors used to slice a pandas DataFrame
# they can select rows by a boolean condition on the index (or columns)
# Work on a copy of the training frame and attach the predicted labels as a `pred` column.
tmp_train = find_s_survived.copy()
tmp_train['pred'] = pred
tmp_train.head()

In [None]:
# False positives (FP)
# rows where the model predicted True but the actual value is False
c1 = tmp_train['survived'] == 0 # actual value is False
c2 = tmp_train['pred'] == 1 # model predicted True
c = c1 & c2

tmp_train.loc[c]

In [None]:
# False negatives (FN)
# rows where the model predicted False but the actual value is True
c1 = tmp_train['survived'] == 1 # actual value is True
c2 = tmp_train['pred'] == 0 # model predicted False
c = c1 & c2

tmp_train.loc[c]

# 예측

In [None]:
# Probability column 1 for every row of the encoded test frame.
target_pred = model.predict_proba(test_encoded)[:,1]
test_encoded.shape, target_pred.shape

In [None]:
# Write the predicted probabilities into the submission frame.
# NOTE(review): `submission` is not defined in the cells visible here — confirm where it is loaded.
submission['survived'] = target_pred
submission.head()

In [None]:
# Save the submission with a header row and without the index column.
submission.to_csv("submission.csv", header=True, index=False)