## 모델 학습

- XGBoost와 LightGBM 분류 학습을 통해 사용자가 어떤 목적지를 예약할지 예측하는 분류 모델을 생성
- 기존 데이터, country 데이터 병합, sessions 데이터 병합, country_sessions 데이터 병합의 비교를 통해 병합 데이터가 성능을 높이는 데 의미가 있는지 판단

## 결과 요약

- XGB : Private Score 0.85711, Public Score 0.85345로 성능이 나쁘지 않은 모델이 생성되었으며, 둘의 차이가 크지 않은 것을 보아 과적합이 되지 않음을 알 수 있음.
- LightGBM : Private Score 0.77494, Public Score 0.77117로 과적합은 되지 않았으나 XGB 모델보다 평가가 좋지 않음

### 병합 데이터 NDCG SCORE 비교

Clean : 0.8233732350776741<br>
Country : 0.9826664673095211<br>
Session : 0.8491297566294652<br>
Country_Session : 0.9825252250725767<br>
<br>
해당 결과로 병합한 데이터가 좀 더 성능을 높이는 데 의미가 있음을 알 수 있음



In [None]:
!mkdir -p /content/dataset/origin
!mkdir -p /content/dataset/clean

In [None]:
!git clone https://github.com/hardok00/2024-07-18-DataThon.git

Cloning into '2024-07-18-DataThon'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 27 (delta 0), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (27/27), 13.64 MiB | 10.21 MiB/s, done.


In [None]:
!unzip /content/2024-07-18-DataThon/dataset/clean_dataset.zip -d /content/dataset/clean

Archive:  /content/2024-07-18-DataThon/dataset/clean_dataset.zip
  inflating: /content/dataset/clean/test_clean.csv  
  inflating: /content/dataset/clean/train_clean.csv  
  inflating: /content/dataset/clean/train_clean_with_country.csv  
  inflating: /content/dataset/clean/train_clean_with_session.csv  
  inflating: /content/dataset/clean/train_clean_with_two_datasets.csv  


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob

In [None]:
# ndcg_score 계산 함수
# https://www.kaggle.com/code/davidgasquez/ndcg-scorer 참고

from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer

def dcg_score(y_true, y_score, k=5):
    """Return the discounted cumulative gain of the ``k`` best-scored items.

    Items are ranked by descending ``y_score``. The relevance of each of the
    first ``k`` ranked items is read from ``y_true`` and contributes
    ``(2 ** relevance - 1) / log2(rank + 1)``, with ranks starting at 1.
    """
    top_positions = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_positions)

    # Rank r (1-based) is discounted by log2(r + 1): log2(2), log2(3), ...
    discounts = np.log2(np.arange(2, len(relevance) + 2))
    gains = 2 ** relevance - 1
    return np.sum(gains / discounts)


def ndcg_score_udf(ground_truth, predictions, k=5):
    """Return the mean NDCG@k of ``predictions`` against ``ground_truth``.

    ``ground_truth`` holds integer class labels and ``predictions`` holds one
    row of per-class scores per sample. Each label is expanded into a 0/1
    relevance row, and the DCG of the predicted ranking is divided by the DCG
    of the ideal ranking (the relevance row ranked by itself).
    """
    # Fit on one more class than there are score columns, exactly as the
    # referenced Kaggle scorer does, then one-hot encode the labels.
    binarizer = LabelBinarizer()
    binarizer.fit(range(predictions.shape[1] + 1))
    relevance_rows = binarizer.transform(ground_truth)

    # Per-sample NDCG: achieved DCG over the best achievable DCG.
    per_sample = [
        float(dcg_score(row, scores, k)) / float(dcg_score(row, row, k))
        for row, scores in zip(relevance_rows, predictions)
    ]

    return np.mean(per_sample)

# Wrap the NDCG@5 metric as a scikit-learn scorer that is fed class probabilities.
# NOTE(review): `needs_proba` is deprecated in newer scikit-learn releases in favour
# of `response_method` — confirm against the installed version before upgrading.
ndcg_scorer = make_scorer(ndcg_score_udf, needs_proba=True, k=5)

In [None]:
# Create the submission CSV file
def make_csv(test_id, y_pred, le, model_name='model', top_k=5, out_dir='/content'):
  """Write the ``top_k`` most probable countries per user to a submission CSV.

  The file is written to ``{out_dir}/sub_{model_name}.csv`` with the columns
  ``id`` and ``country``; each id is repeated once per selected country, most
  probable country first.

  Args:
    test_id: Sequence of user ids, one per row of ``y_pred``. It is read by
      position, so a pandas Series with any index is accepted.
    y_pred: 2-D array-like of per-class probabilities (rows are users).
    le: Fitted label encoder; only ``inverse_transform`` is called on it.
    model_name: Suffix used in the output file name.
    top_k: Number of countries written per user (capped at the class count).
    out_dir: Directory the CSV is written to.

  Raises:
    ValueError: If ``y_pred`` is not 2-D or its row count differs from the
      number of ids.
  """
  # np.asarray drops any pandas index, so ids are always matched by position.
  ids = np.asarray(test_id)
  path = f'{out_dir}/sub_{model_name}.csv'

  if ids.size == 0:
    # Nothing to rank: still emit a header-only file.
    pd.DataFrame({'id': [], 'country': []}).to_csv(path, index=False)
    return

  proba = np.asarray(y_pred)
  if proba.ndim != 2 or proba.shape[0] != ids.shape[0]:
    raise ValueError(
      f'y_pred must be 2-D with one row per id; got shape {proba.shape} '
      f'for {ids.shape[0]} ids'
    )

  k = min(top_k, proba.shape[1])
  # Class codes per row, ordered by descending probability, truncated to k.
  top_codes = np.argsort(proba, axis=1)[:, ::-1][:, :k]
  countries = np.asarray(le.inverse_transform(top_codes.ravel()))

  sub = pd.DataFrame({'id': np.repeat(ids, k), 'country': countries})
  sub.to_csv(path, index=False)

In [None]:
# 데이터 병합

def data_labeling(train):
  """Separate the target column from the training features.

  Returns a tuple ``(features, label, n_rows)``: ``features`` is ``train``
  without the ``id``, ``age_bucket`` and ``country_destination`` columns,
  ``label`` is the ``country_destination`` column, and ``n_rows`` is the
  number of feature rows.
  """
  target = train['country_destination']
  features = train.drop(columns=['id', 'age_bucket', 'country_destination'])

  return features, target, len(features)

def data_merge(train, test):
  """Stack ``test`` underneath ``train`` and return it with the test ids.

  The rows of ``train`` come first, followed by the rows of ``test``, under a
  fresh 0..n-1 index. The ``id`` and ``age_bucket`` columns are removed from
  the stacked frame; the ``id`` column of ``test`` is returned separately.
  """
  stacked = pd.concat([train, test], axis=0, ignore_index=True)
  stacked = stacked.drop(columns=['id', 'age_bucket'])

  return stacked, test['id']

# Split date_account_created into year / month / day columns

def data_dac_vstack(dataset):
  """Expand ``date_account_created`` into ``dac_year``/``dac_month``/``dac_day``.

  Each value is converted to text, split on ``-`` and parsed as integers.
  The three new columns are added to ``dataset`` itself; the returned frame
  is a copy of it without the ``date_account_created`` column.
  """
  date_text = dataset.date_account_created.astype(str)
  ymd = np.vstack([[int(piece) for piece in stamp.split('-')] for stamp in date_text])

  for position, column in enumerate(('dac_year', 'dac_month', 'dac_day')):
    dataset[column] = ymd[:, position]

  return dataset.drop(columns=['date_account_created'])

# Split timestamp_first_active into year / month / day / hour columns

def data_tfa_vstack(dataset):
  """Expand ``timestamp_first_active`` into ``tfa_year`` .. ``tfa_hour``.

  Each value is converted to text and cut into six fixed-width integer
  fields (4, 2, 2, 2, 2, 2 characters). Only the first four fields are kept
  as columns. The new columns are added to ``dataset`` itself; the returned
  frame is a copy of it without the ``timestamp_first_active`` column.
  """
  # All six fields are parsed even though only four are stored, so a value
  # that is too short still fails in the same place.
  field_bounds = ((0, 4), (4, 6), (6, 8), (8, 10), (10, 12), (12, 14))
  stamp_text = dataset.timestamp_first_active.astype(str)
  fields = np.vstack([
    [int(stamp[start:stop]) for start, stop in field_bounds]
    for stamp in stamp_text
  ])

  for position, column in enumerate(('tfa_year', 'tfa_month', 'tfa_day', 'tfa_hour')):
    dataset[column] = fields[:, position]

  return dataset.drop(columns=['timestamp_first_active'])

# One-hot encoding

def one_hot_encoding(dataset):
  """Replace every object-typed column, plus ``signup_flow``, with dummies.

  The encoded columns are removed and their dummy columns (named
  ``{column}_{value}``) are appended after the remaining columns, one group
  per encoded column in selection order. ``dataset`` itself is not modified.

  ``signup_flow`` is encoded even when it is not object-typed (presumably a
  categorical code stored as a number — verify against the data). It is
  skipped when the frame has no such column and is never listed twice.
  """
  targets = dataset.select_dtypes(include=['object']).columns.tolist()
  if 'signup_flow' in dataset.columns and 'signup_flow' not in targets:
    targets.append('signup_flow')

  # Encode every column first and concatenate once, instead of rebuilding the
  # whole frame for each encoded feature.
  dummies = [pd.get_dummies(dataset[name], prefix=name) for name in targets]

  return pd.concat([dataset.drop(columns=targets), *dummies], axis=1)

def data_feature_engineering(train, test=None):
  """Run the full preprocessing pipeline on ``train`` (and optionally ``test``).

  The label is split off ``train`` first. When ``test`` is given, its rows
  are stacked underneath the training rows so both go through the same date
  expansion and one-hot encoding.

  Returns a tuple ``(features, label, train_shape, test_id)``, where
  ``train_shape`` is the number of training rows and ``test_id`` is ``None``
  when no ``test`` frame was passed.
  """
  features, label, train_shape = data_labeling(train)

  test_id = None
  if test is not None:
    features, test_id = data_merge(features, test)

  # Apply the column transformations in their fixed order.
  for transform in (data_dac_vstack, data_tfa_vstack, one_hot_encoding):
    features = transform(features)

  return features, label, train_shape, test_id

In [None]:
# train, test 데이터 분할

from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import train_test_split

def train_val_split(train, label, train_shape):
  """Cut the combined feature matrix back into training and test parts.

  The first ``train_shape`` rows of ``train.values`` are the training matrix
  and the remaining rows are the test matrix. ``label`` is integer-encoded
  with a fresh ``LabelEncoder``, which is not returned.

  Returns a tuple ``(X, y, X_test)``.
  """
  matrix = train.values
  encoded_label = LabelEncoder().fit_transform(label)

  return matrix[:train_shape], encoded_label, matrix[train_shape:]

def train_split(train, label):
  """Create an 80/20 hold-out split of ``train`` with encoded labels.

  ``label`` is integer-encoded with a fresh ``LabelEncoder`` and the split
  uses ``random_state=42``.

  Returns a tuple ``(X_train, X_test, y_train, y_test)``.
  """
  encoded_label = LabelEncoder().fit_transform(label)
  parts = train_test_split(train, encoded_label, test_size=0.2, random_state=42)

  return tuple(parts)

In [None]:
# Load the cleaned datasets extracted under /content/dataset/clean

train_clean_users_df = pd.read_csv('/content/dataset/clean/train_clean.csv')
test_clean_users_df = pd.read_csv('/content/dataset/clean/test_clean.csv')

# The three datasets below can only be used for prediction if the same merge is
# also applied to the test data.
country_clean_users_df = pd.read_csv('/content/dataset/clean/train_clean_with_country.csv')
sessions_clean_users_df = pd.read_csv('/content/dataset/clean/train_clean_with_session.csv')
country_sessions_clean_users_df = pd.read_csv('/content/dataset/clean/train_clean_with_two_datasets.csv')

# Names (as strings, not the DataFrames themselves) of the frames loaded above.
clean_dataset_list = ['train_clean_users_df', 'test_clean_users_df', 'country_clean_users_df', 'sessions_clean_users_df', 'country_sessions_clean_users_df']

## XGB 분류 학습

In [None]:
def xgb_train(X, y, xgb_para, X_test):
  """Fit an ``XGBClassifier`` and predict class probabilities for ``X_test``.

  ``xgb_para`` is forwarded to the classifier as keyword arguments.

  Returns a tuple ``(model, probabilities)``.
  """
  model = XGBClassifier(**xgb_para)
  model.fit(X, y)

  return model, model.predict_proba(X_test)

In [None]:
# Preprocess the training and test users together so both share the same columns.
clean_train, clean_label, clean_train_shape, clean_test_id = data_feature_engineering(train_clean_users_df, test_clean_users_df)

In [None]:
# Module-level encoder kept so make_csv can map class codes back to country names.
# train_val_split below fits its own encoder on the same labels and rebinds `y`.
le = LabelEncoder()
y = le.fit_transform(clean_label)

In [None]:
# Split the combined matrix back into training rows and test rows

X, y, X_test = train_val_split(clean_train, clean_label, clean_train_shape)

In [None]:
# Hyperparameters passed to every xgb_train call in this notebook
xgb_para = {
            'max_depth': 6,
            'learning_rate': 0.3,
            'n_estimators' : 25,
            'objective': 'multi:softprob',
            'subsample': 0.5,
            'colsample_bytree': 0.5,
            'seed': 0
            }

In [None]:
# Fit on all labelled training rows and predict probabilities for the test rows.
clean_xgb, clean_y_xgb_pred = xgb_train(X, y, xgb_para, X_test)

In [None]:
# NOTE(review): `y` holds the training labels while `clean_y_xgb_pred` holds
# predictions for the test rows. ndcg_score_udf pairs them positionally with zip(),
# so unrelated rows are compared and this value does not measure model quality.
# The hold-out evaluation further down is the meaningful score.
xgb_ndcg_score = ndcg_score_udf(y, clean_y_xgb_pred)
print(f"NDCG : {xgb_ndcg_score}")

NDCG : 0.7678214298134697


In [None]:
# Select the five most probable countries per user and save them as a submission CSV
make_csv(clean_test_id, clean_y_xgb_pred, le, clean_xgb.__class__.__name__)

### XGB Submission 제출 결과

Private Score 0.85711<br>
Public Score 0.85345

둘의 차이가 크다면 과적합 의심

In [None]:
# Hold-out evaluation on the plain cleaned training data (no test rows merged in).
# This rebinds `clean_train` to a train-only feature frame.
clean_train, clean_label, _, _ =  data_feature_engineering(train_clean_users_df)

In [None]:
# From here on `X_test` / `y_test` are the 20% hold-out split, not the Kaggle test matrix.
X_train, X_test, y_train, y_test = train_split(clean_train, clean_label)

In [None]:
clean_xgb, clean_y_xgb_pred = xgb_train(X_train, y_train, xgb_para, X_test)

In [None]:
# Predictions and labels come from the same hold-out rows, so this NDCG is valid.
xgb_ndcg_score = ndcg_score_udf(y_test, clean_y_xgb_pred)
print(f"NDCG : {xgb_ndcg_score}")

NDCG : 0.8233732350776741


## Country 병합 데이터 XGB 학습 후 NDCG Score 측정

In [None]:
# Hold-out evaluation on the training data merged with the country dataset.
# NOTE(review): the score recorded for this cell (~0.98) is suspiciously high. If the
# country columns were joined on `country_destination` they encode the label itself
# (target leakage), and the test set could not be merged the same way — verify how
# train_clean_with_country.csv was built before drawing conclusions.
country_train, country_label, _, _ =  data_feature_engineering(country_clean_users_df)

In [None]:
X_train, X_test, y_train, y_test = train_split(country_train, country_label)

In [None]:
country_xgb, country_y_xgb_pred = xgb_train(X_train, y_train, xgb_para, X_test)

In [None]:
xgb_ndcg_score = ndcg_score_udf(y_test, country_y_xgb_pred)
print(f"NDCG : {xgb_ndcg_score}")

NDCG : 0.9826664673095211


## Session 병합 데이터 XGB 학습 후 NDCG Score 측정

In [None]:
# Hold-out evaluation on the training data merged with the sessions dataset.
sessions_train, sessions_label, _, _ =  data_feature_engineering(sessions_clean_users_df)

In [None]:
# Same 80/20 split settings (random_state=42) as the other hold-out evaluations.
X_train, X_test, y_train, y_test = train_split(sessions_train, sessions_label)

In [None]:
sessions_xgb, sessions_y_xgb_pred = xgb_train(X_train, y_train, xgb_para, X_test)

In [None]:
xgb_ndcg_score = ndcg_score_udf(y_test, sessions_y_xgb_pred)
print(f"NDCG : {xgb_ndcg_score}")

NDCG : 0.8491297566294652


## Country, Session 병합 데이터 XGB 학습 후 NDCG Score 측정

In [None]:
# Hold-out evaluation on the training data merged with both the country and the
# sessions datasets.
# NOTE(review): the recorded score (~0.98) is far above the un-merged data; confirm
# the merged columns do not depend on `country_destination` (possible target leakage).
country_sessions_train, country_sessions_label, _, _ =  data_feature_engineering(country_sessions_clean_users_df)

In [None]:
X_train, X_test, y_train, y_test = train_split(country_sessions_train, country_sessions_label)

In [None]:
country_sessions_xgb, country_sessions_y_xgb_pred = xgb_train(X_train, y_train, xgb_para, X_test)

In [None]:
xgb_ndcg_score = ndcg_score_udf(y_test, country_sessions_y_xgb_pred)
print(f"NDCG : {xgb_ndcg_score}")

NDCG : 0.9825252250725767


## NDCG Score 측정 결과

Clean : 0.8233732350776741<br>
Country : 0.9826664673095211<br>
Session : 0.8491297566294652<br>
Country_Session : 0.9825252250725767

## LGBM 분류 학습

In [None]:
import lightgbm as lgb

# Rebuild the full-train / test matrices here instead of relying on notebook state.
# The hold-out cells above rebind `X_test` (and `clean_train`) to validation splits,
# so running the notebook top to bottom would otherwise predict on the wrong rows
# and produce a submission that does not line up with `clean_test_id`.
lgbm_features, lgbm_label, lgbm_train_rows, clean_test_id = data_feature_engineering(
    train_clean_users_df, test_clean_users_df
)
X, y, X_test = train_val_split(lgbm_features, lgbm_label, lgbm_train_rows)

# Hyperparameters mirror xgb_para where LightGBM has an equivalent setting.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, so row
# subsampling is likely inactive here — confirm before comparing against XGBoost.
lgbm = lgb.LGBMClassifier(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multiclass',
    num_class=len(np.unique(y)),
    subsample=0.5,
    colsample_bytree=0.5,
    random_state=0,
    num_leaves=32,
    min_data_in_leaf=20,
    max_bin=255
)

# Fit on every labelled training row and predict probabilities for the test rows.
lgbm.fit(X, y)
y_lgbm_pred = lgbm.predict_proba(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063012 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 592
[LightGBM] [Info] Number of data points in the train set: 213451, number of used features: 106
[LightGBM] [Info] Start training from score -5.981447
[LightGBM] [Info] Start training from score -5.007132
[LightGBM] [Info] Start training from score -5.304195
[LightGBM] [Info] Start training from score -4.552922
[LightGBM] [Info] Start training from score -3.749380
[LightGBM] [Info] Start training from score -4.520117
[LightGBM] [Info] Start training from score -4.321365
[LightGBM] [Info] Start training from score -0.538756
[LightGBM] [Info] Start training from score -5.635216
[LightGBM] [Info] Start training from score -6.891265
[LightGBM] [Info] Start training from score -1.230227
[LightGBM] [Info] Start training from score -3.05146

In [None]:
# NOTE(review): `y` holds the training labels while `y_lgbm_pred` comes from
# `X_test`; ndcg_score_udf pairs them positionally with zip(), so unless both refer
# to the same rows this value does not measure model quality.
lgbm_ndcg_score = ndcg_score_udf(y, y_lgbm_pred)
print(f"NDCG : {lgbm_ndcg_score}")

NDCG : 0.703719565708642


In [None]:
# Select the five most probable countries per user and save them as a submission CSV.
# The fitted label encoder `le` must be the third argument; without it the model name
# is taken as the encoder and make_csv fails when it calls `.inverse_transform`.
make_csv(clean_test_id, y_lgbm_pred, le, lgbm.__class__.__name__)

## Submission 제출 결과

Private Score 0.77494<br>
Public Score 0.77117

둘의 차이가 크다면 과적합 의심

XGB 분류 보다 성능이 낮게 나옴