In [623]:
# conda install -c conda-forge catboost

In [624]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import os

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, ndcg_score, make_scorer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.stats import randint as sp_randint

In [625]:
def set_random_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
random_seed = 1106
set_random_seed(random_seed)

In [626]:
train = pd.read_csv('train_users_2.csv')
session = pd.read_csv('sessions.csv')

## Train 전처리

In [627]:
# train 전처리
train.rename(columns={'id':'user_id'},inplace=True)
train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'], format = '%Y%m%d%H%M%S')
train['date_account_created'] = pd.to_datetime(train['date_account_created'])
train['date_first_booking'] = pd.to_datetime(train['date_first_booking'], errors='coerce')

In [628]:
train.isnull().sum()

user_id                         0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                          0
age                         87990
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
dtype: int64

In [629]:
train_user_ids = set(train['user_id'])
session_user_ids = set(session['user_id'])
# session 데이터에서 train 데이터에 존재하는 user_id만 필터링하여 사용
merged_session = session[session['user_id'].isin(train_user_ids)]
merged_session.isnull().sum()

merged_train = train[train['user_id'].isin(session_user_ids)]

### age -> age group으로 대체
- 18세 이하는 가입이 불가능하니 해당되는 나이는 0으로 대체
- 결측치는 1로 대체 
- 1000이상인 값은 2
- 1000미만 100세 초과는 3
- 18~100세 사이 4

In [630]:
merged_train.isnull().sum()

user_id                        0
date_account_created           0
timestamp_first_active         0
date_first_booking         45041
gender                         0
age                        32248
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked      302
signup_app                     0
first_device_type              0
first_browser                  0
country_destination            0
dtype: int64

In [631]:
import numpy as np

# 나이 그룹을 계산하는 함수 정의
def calculate_age_group(age):
    if np.isnan(age):
        return 1
    elif age < 18:
        return 0
    elif age >= 1000:
        return 2
    elif age > 100:
        return 3
    return 4

# apply 함수를 사용하여 age_group 계산

merged_train = merged_train.copy()
merged_train['age_group'] = merged_train['age'].apply(calculate_age_group)

merged_train['age_group'].value_counts()

age_group
4    40795
1    32248
3      590
2      131
0       51
Name: count, dtype: int64

## 결측치 처리

In [634]:
merged_train.isnull().sum()

user_id                        0
date_account_created           0
timestamp_first_active         0
date_first_booking         45041
gender                         0
age                        32248
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked      302
signup_app                     0
first_device_type              0
first_browser                  0
country_destination            0
age_group                      0
dtype: int64

In [635]:
if 'age' in merged_train.columns:
    merged_train.drop(columns=['age'], inplace=True)
merged_train.loc[:, 'first_affiliate_tracked'].fillna('untracked', inplace=True)


## Session 전처리

In [636]:
merged_session.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0


In [637]:
merged_session.isnull().sum()

user_id               0
action            51532
action_type      619860
action_detail    619860
device_type           0
secs_elapsed      73815
dtype: int64

In [638]:
merged_session=merged_session.copy()
merged_session['action'].replace('-unknown-', 'unknown', inplace=True)
merged_session['action_type'].replace('-unknown-', 'unknown', inplace=True)
merged_session['action_detail'].replace('-unknown-', 'unknown', inplace=True)
merged_session['device_type'].replace('-unknown-', 'unknown', inplace=True)

merged_session['action'].fillna('unknown', inplace=True)
merged_session['action_type'].fillna('unknown', inplace=True)
merged_session['action_detail'].fillna('unknown', inplace=True)
merged_session['secs_elapsed'].fillna(0, inplace=True)
# user_flow 컬럼 생성

merged_session['user_flow'] = merged_session['action'].astype('str') + '+' + merged_session['action_type'].astype('str') + '+' + merged_session['action_detail'].astype('str')

In [642]:
merged_session.isnull().sum()

user_id          0
action           0
action_type      0
action_detail    0
device_type      0
secs_elapsed     0
user_flow        0
dtype: int64

In [643]:
binary_session = merged_session.copy()

### action columns

- action+type+detail = user_flow로 묶고 user_id 별 모든 user_flow를 한 string으로 합치기
- train과 머지했을 때 중복을 피하기 위함
- 구분자는 쉼표

In [644]:
user_flows_concatenated = merged_session.groupby('user_id')['user_flow'].apply(lambda x: ','.join(x)).reset_index()

merged_session = pd.merge(merged_session.drop(columns=['user_flow']), user_flows_concatenated, on='user_id', how='left')

In [645]:
merged_session

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed,user_flow
0,d1mm9tcy42,lookup,unknown,unknown,Windows Desktop,319.0,"lookup+unknown+unknown,search_results+click+vi..."
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0,"lookup+unknown+unknown,search_results+click+vi..."
2,d1mm9tcy42,lookup,unknown,unknown,Windows Desktop,301.0,"lookup+unknown+unknown,search_results+click+vi..."
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0,"lookup+unknown+unknown,search_results+click+vi..."
4,d1mm9tcy42,lookup,unknown,unknown,Windows Desktop,435.0,"lookup+unknown+unknown,search_results+click+vi..."
...,...,...,...,...,...,...,...
5537952,nw9fwlyb5f,index,data,reservations,iPhone,245.0,"show+view+user_profile,show+view+p3,reviews+da..."
5537953,nw9fwlyb5f,unavailabilities,data,unavailable_dates,iPhone,286.0,"show+view+user_profile,show+view+p3,reviews+da..."
5537954,nw9fwlyb5f,notifications,submit,notifications,iPhone,830.0,"show+view+user_profile,show+view+p3,reviews+da..."
5537955,nw9fwlyb5f,search,click,view_search_results,iPhone,101961.0,"show+view+user_profile,show+view+p3,reviews+da..."


In [646]:
session_df = merged_session.copy()

### feature engineering
- user_id별 평균 세션 시간 생성
- user_id별 가장 많이 쓴 주요 device_type 생성
- user_id별 세션 총 개수 생성

In [647]:
# user_id별 평균 secs_elapsed column 생성
session_df['mean_secs_elapsed'] = session_df.groupby('user_id')['secs_elapsed'].transform('mean')
# user_id별 가장 빈도수가 높은 device_type column 생성
session_df['most_frequent_device'] = session_df.groupby('user_id')['device_type'].transform(lambda x: x.mode()[0])
# user_id별 세션 총 개수
session_df['session_count'] = session_df.groupby('user_id')['action'].transform('count')

In [648]:
len(session_df['user_id'].unique())

73815

In [651]:
session_df2 = session_df.copy()
session_df2.drop(['action', 'action_type', 'action_detail', 'device_type', 'secs_elapsed'], axis=1, inplace=True)
session_df2.drop_duplicates(subset='user_id', inplace=True)

In [652]:
session_df2

Unnamed: 0,user_id,user_flow,mean_secs_elapsed,most_frequent_device,session_count
0,d1mm9tcy42,"lookup+unknown+unknown,search_results+click+vi...",26988.417323,Windows Desktop,127
127,yo8nz8bqcq,"dashboard+view+dashboard,create+submit+create_...",23093.555556,Mac Desktop,9
136,4grx6yxeby,"verify+unknown+unknown,create+submit+create_us...",70965.250000,Windows Desktop,16
152,ncf87guaf0,"lookup+unknown+unknown,show+view+p3,search_res...",24704.605263,Windows Desktop,152
304,4rvqpxoh3h,"campaigns+unknown+unknown,active+unknown+unkno...",319.375000,iPhone,8
...,...,...,...,...,...
5537475,zxodksqpep,"edit+view+edit_profile,edit+view+edit_profile,...",46739.963636,Mac Desktop,110
5537585,mhewnxesx9,"confirm_email+click+confirm_email_link,dashboa...",12096.319328,Windows Desktop,238
5537823,6o3arsjbb4,ajax_refresh_subtotal+click+change_trip_charac...,19054.555556,Mac Desktop,18
5537841,jh95kwisub,"search+click+view_search_results,search+click+...",4554.786667,iPhone,75


## train, session merge

In [653]:
merged_df = pd.merge(merged_train, session_df2, on='user_id', how='inner')

In [654]:
merged_df

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,age_group,user_flow,mean_secs_elapsed,most_frequent_device,session_count
0,d1mm9tcy42,2014-01-01,2014-01-01 00:09:36,2014-01-04,MALE,basic,0,en,sem-non-brand,google,omg,Web,Windows Desktop,Chrome,other,4,"lookup+unknown+unknown,search_results+click+vi...",26988.417323,Windows Desktop,127
1,yo8nz8bqcq,2014-01-01,2014-01-01 00:15:58,NaT,-unknown-,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF,1,"dashboard+view+dashboard,create+submit+create_...",23093.555556,Mac Desktop,9
2,4grx6yxeby,2014-01-01,2014-01-01 00:16:39,NaT,-unknown-,basic,0,en,sem-brand,google,omg,Web,Windows Desktop,Firefox,NDF,1,"verify+unknown+unknown,create+submit+create_us...",70965.250000,Windows Desktop,16
3,ncf87guaf0,2014-01-01,2014-01-01 00:21:46,NaT,-unknown-,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF,1,"lookup+unknown+unknown,show+view+p3,search_res...",24704.605263,Windows Desktop,152
4,4rvqpxoh3h,2014-01-01,2014-01-01 00:26:19,2014-01-02,-unknown-,basic,25,en,direct,direct,untracked,iOS,iPhone,-unknown-,GB,1,"campaigns+unknown+unknown,active+unknown+unkno...",319.375000,iPhone,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73810,zxodksqpep,2014-06-30,2014-06-30 23:56:36,NaT,MALE,basic,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,NDF,4,"edit+view+edit_profile,edit+view+edit_profile,...",46739.963636,Mac Desktop,110
73811,mhewnxesx9,2014-06-30,2014-06-30 23:57:19,NaT,-unknown-,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,NDF,1,"confirm_email+click+confirm_email_link,dashboa...",12096.319328,Windows Desktop,238
73812,6o3arsjbb4,2014-06-30,2014-06-30 23:57:54,NaT,-unknown-,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,NDF,4,ajax_refresh_subtotal+click+change_trip_charac...,19054.555556,Mac Desktop,18
73813,jh95kwisub,2014-06-30,2014-06-30 23:58:22,NaT,-unknown-,basic,25,en,other,other,tracked-other,iOS,iPhone,Mobile Safari,NDF,1,"search+click+view_search_results,search+click+...",4554.786667,iPhone,75


# 액션 가설 검정

In [655]:
# merged_train, binary_session 에 booking [0,1] column 생성
merged_train['booking'] = (merged_train['country_destination'] != 'NDF').astype(int)
binary_session.loc[:,'booking'] = binary_session.loc[:,'user_id'].map(merged_train.set_index('user_id')['booking'])

In [656]:
# 유저별로 액션과 예약 여부를 그룹화하여 빈도 테이블에 해당하는 카운트 계산
frequency_table = binary_session.groupby(['action', 'booking']).size().unstack(fill_value=0)

# 2x2 빈도 테이블 출력
print("2x2 Frequency Table:")
print(frequency_table)

2x2 Frequency Table:
booking                         0     1
action                                 
10                            643  1007
11                            120   158
12                            229  1097
15                            161   405
about_us                      101    83
...                           ...   ...
weibo_signup_referral_finish    4     1
why_host                       55    19
widget                         71     4
wishlists                       0     1
zendesk_login_jwt               7     3

[332 rows x 2 columns]


In [657]:
from scipy.stats import chi2_contingency
# 각 유저가 수행한 모든 행동을 하나의 행으로 표현
df_pivot = binary_session.pivot_table(index='user_id', columns='action', aggfunc='size', fill_value=0)

# 액션의 빈도수가 5 미만인 경우 해당 액션을 제외
filtered_observed = df_pivot.copy()
for action in df_pivot.columns:
    if filtered_observed[action].sum() < 5:
        filtered_observed.drop(columns=action, inplace=True)

# 카이제곱 동질성 검정 수행
chi2, p, dof, expected = chi2_contingency(filtered_observed)

# 결과 출력
print("카이제곱 통계량:", chi2)
print("p-value:", p)
print("자유도:", dof)
print("기대 빈도:")
print(expected)

카이제곱 통계량: 79754015.33515027
p-value: 0.0
자유도: 21479874
기대 빈도:
[[1.19179267e-02 2.00799007e-03 9.57767924e-03 ... 5.34500953e-04
  5.41723939e-04 7.22298585e-05]
 [2.68153350e-02 4.51797765e-03 2.15497783e-02 ... 1.20262714e-03
  1.21887886e-03 1.62517182e-04]
 [9.23639316e-03 1.55619230e-03 7.42270141e-03 ... 4.14238239e-04
  4.19836053e-04 5.59781404e-05]
 ...
 [8.93844499e-04 1.50599255e-04 7.18325943e-04 ... 4.00875715e-05
  4.06292954e-05 5.41723939e-06]
 [1.51953565e-02 2.56018734e-03 1.22115410e-02 ... 6.81488715e-04
  6.90698022e-04 9.20930696e-05]
 [2.20481643e-02 3.71478162e-03 1.77187066e-02 ... 9.88826763e-04
  1.00218929e-03 1.33625238e-04]]


# action추가

In [667]:
action_group=binary_session['action'].unique()
detail_group=binary_session['action_detail'].unique()

booking_action = ["apply_coupon_click_success", "apply_reservation", "booking","change_currency", "coupon_code_click",
"pay", "payment_methods", "print_confirmation", "rate","receipt","guest_billing_receipt", "recent_reservations"]

booking_action_detail = ["apply_coupon","apply_coupon_click", "apply_coupon_click_success","apply_coupon_error", "booking", "book_it",
"change_availability", "change_or_alter"]
# request, message, pending

language_action = ["ajax_google_translate", "ajax_google_translate_description","ajax_google_translate_reviews","change_currency",
"languages_multiselect","country_options", "southern-europe","spoken_languages"]

language_action_detail = ["translate_listing_reviews","translations","user_languages"]




In [668]:
def calculate_booking_ratio(binary_session,column,action):
    # 해당 액션을 수행한 사용자 수 계산
    total_users_with_action = binary_session[binary_session[column] == action]['user_id'].nunique()

    # 해당 액션을 수행한 사용자 중 예약한 사용자 수 계산
    booked_users_with_action = binary_session[(binary_session[column] == action) & (binary_session['booking'] == 1)]['user_id'].nunique()

    # 예약 비율 계산
    booking_ratio = booked_users_with_action / total_users_with_action

    return total_users_with_action, booking_ratio

# 함수 호출
print('action group에서 booking')
for action in booking_action:
    action_info = calculate_booking_ratio(binary_session, 'action', action)
    print(f"{action}를 수행한 사용자의 수 : {action_info[0]}, 예약 비율: {action_info[1]}")

print('\naction detail group에서 booking')
for action in booking_action_detail:
    action_info = calculate_booking_ratio(binary_session, 'action_detail', action)
    print(f"{action}를 수행한 사용자의 수 : {action_info[0]}, 예약 비율: {action_info[1]}")


# 함수 호출
print('\naction group에서 language')
for action in language_action:
    action_info = calculate_booking_ratio(binary_session, 'action', action)
    print(f"{action}를 수행한 사용자의 수 : {action_info[0]}, 예약 비율: {action_info[1]}")

print('\naction detail group에서 language')
for action in language_action_detail:
    action_info = calculate_booking_ratio(binary_session, 'action_detail', action)
    print(f"{action}를 수행한 사용자의 수 : {action_info[0]}, 예약 비율: {action_info[1]}")


action group에서 booking
apply_coupon_click_success를 수행한 사용자의 수 : 15, 예약 비율: 0.7333333333333333
apply_reservation를 수행한 사용자의 수 : 1344, 예약 비율: 0.6614583333333334
booking를 수행한 사용자의 수 : 2, 예약 비율: 0.5
change_currency를 수행한 사용자의 수 : 281, 예약 비율: 0.5480427046263345
coupon_code_click를 수행한 사용자의 수 : 241, 예약 비율: 0.5933609958506224
pay를 수행한 사용자의 수 : 737, 예약 비율: 0.7639077340569878
payment_methods를 수행한 사용자의 수 : 1901, 예약 비율: 0.4518674381904261
print_confirmation를 수행한 사용자의 수 : 2, 예약 비율: 1.0
rate를 수행한 사용자의 수 : 192, 예약 비율: 0.3541666666666667
receipt를 수행한 사용자의 수 : 102, 예약 비율: 0.9901960784313726
guest_billing_receipt를 수행한 사용자의 수 : 3, 예약 비율: 1.0
recent_reservations를 수행한 사용자의 수 : 796, 예약 비율: 0.4585427135678392

action detail group에서 booking
apply_coupon를 수행한 사용자의 수 : 1344, 예약 비율: 0.6614583333333334
apply_coupon_click를 수행한 사용자의 수 : 180, 예약 비율: 0.6055555555555555
apply_coupon_click_success를 수행한 사용자의 수 : 15, 예약 비율: 0.7333333333333333
apply_coupon_error를 수행한 사용자의 수 : 171, 예약 비율: 0.5906432748538012
booking를 수행한 사용자의

In [669]:
binary_session[binary_session['action']=='actionspoken_languages']
for i in action_group:
    if 'spoken_languages' in i:
        print(i)

spoken_languages


In [670]:
# 사용자별로 예약 액션 횟수를 계산합니다.
booking_action_count = binary_session[binary_session['action'].isin(booking_action)].groupby('user_id')['action'].count().reset_index()
booking_action_count.rename(columns={'action': 'booking_action_count'}, inplace=True)
# 결과를 merged_df에 추가합니다.
merged_df = merged_df.merge(booking_action_count, on='user_id', how='left')

merged_df['booking_action_count'] = merged_df['booking_action_count'].fillna(0).astype(int)



In [671]:
booking_detail_count = binary_session[binary_session['action_detail'].isin(booking_action_detail)].groupby('user_id')['action_detail'].count().reset_index()
booking_detail_count.rename(columns={'action_detail': 'booking_action_detail_count'}, inplace=True)
# 결과를 merged_df에 추가합니다.
merged_df = merged_df.merge(booking_detail_count, on='user_id', how='left')

merged_df['booking_action_detail_count'] = merged_df['booking_action_detail_count'].fillna(0).astype(int)

In [672]:
# 사용자별로 언어 액션 횟수를 계산합니다.
booking_action_count = binary_session[binary_session['action'].isin(language_action)].groupby('user_id')['action'].count().reset_index()
booking_action_count.rename(columns={'action': 'language_action_count'}, inplace=True)
# 결과를 merged_df에 추가합니다.
merged_df = merged_df.merge(booking_action_count, on='user_id', how='left')

merged_df['language_action_count'] = merged_df['language_action_count'].fillna(0).astype(int)



In [673]:
booking_detail_count = binary_session[binary_session['action_detail'].isin(language_action_detail)].groupby('user_id')['action_detail'].count().reset_index()
booking_detail_count.rename(columns={'action_detail': 'language_action_detail_count'}, inplace=True)
# 결과를 merged_df에 추가합니다.
merged_df = merged_df.merge(booking_detail_count, on='user_id', how='left')

merged_df['language_action_detail_count'] = merged_df['language_action_detail_count'].fillna(0).astype(int)




# Modeling

In [725]:
# NDCG metric 이용하여 확인
X = merged_df.drop(['user_id', 'date_first_booking', 'country_destination'], axis=1)
y = merged_df['country_destination']

cat_features = ['gender', 'signup_method', 'language', 'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'user_flow',
                'most_frequent_device']

# 훈련 데이터와 테스트 데이터로 나누기
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_seed)

for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


model = CatBoostClassifier(iterations=50, 
                            learning_rate=0.1, 
                            depth=6, 
                            loss_function='MultiClass',
                            cat_features=cat_features,
                            random_seed=random_seed)

model.fit(X_train, y_train)

# 각 나라에 대해서 확률 값을 나타냄 
y_pred_probs = model.predict_proba(X_test, verbose=100)




lb = LabelBinarizer()
y_train_bin = lb.fit_transform(y_train)

ndcg1 = ndcg_score(lb.transform(y_test), y_pred_probs,k=5)

print("NDCG Score:", ndcg1)




# 가장 높은 확률 5개의 index를 추출
top_5_indices = np.argsort(y_pred_probs, axis=1)[:, ::-1][:, :5]

# 모델에서 사용한 카테고리 목록
classes = model.classes_

# 순서대로 예측한 나라 목록
top_5_classes = np.array(classes)[top_5_indices]

# airbnb식 평가 지표 - 이론적인 ndcg와 다름(relevance값이 binary이므로 정답 나라의 위치만 중요, 나머지는 계산 x)
# 또한 iDCG의 경우도 정답 나라가 1순위에 오는 것이 이상적이므로 항상 1의 값을 가짐.
# 그렇다면 n개의 예측을 평가 지표로써 사용하기 위해 각 ndcg값을 더한 후 n개로 나눠주어 점수 계산

def calculate_ndcg(predicted_answers, true_answers):
    ndcg=0
    for predicted, true in zip(predicted_answers, true_answers):
        relevance = [1 if ans == true else 0 for ans in predicted]
        dcg = np.sum([(2**rel - 1) / np.log2(idx + 2) for idx, rel in enumerate(relevance)])
        ndcg += dcg
    return ndcg/len(true_answers)


ndcg = calculate_ndcg(top_5_classes, y_test)
print("NDCG:", ndcg)


0:	learn: 1.9984641	total: 416ms	remaining: 20.4s
1:	learn: 1.7844008	total: 727ms	remaining: 17.5s
2:	learn: 1.6357260	total: 1.33s	remaining: 20.9s
3:	learn: 1.5252326	total: 1.88s	remaining: 21.6s
4:	learn: 1.4415239	total: 2.45s	remaining: 22s
5:	learn: 1.3750408	total: 2.83s	remaining: 20.7s
6:	learn: 1.3206396	total: 3.37s	remaining: 20.7s
7:	learn: 1.2762406	total: 3.76s	remaining: 19.7s
8:	learn: 1.2374495	total: 4.3s	remaining: 19.6s
9:	learn: 1.2054721	total: 4.75s	remaining: 19s
10:	learn: 1.1775674	total: 5.25s	remaining: 18.6s
11:	learn: 1.1538280	total: 5.82s	remaining: 18.4s
12:	learn: 1.1333677	total: 6.22s	remaining: 17.7s
13:	learn: 1.1156370	total: 6.86s	remaining: 17.6s
14:	learn: 1.1006777	total: 7.45s	remaining: 17.4s
15:	learn: 1.0870906	total: 7.99s	remaining: 17s
16:	learn: 1.0758080	total: 8.53s	remaining: 16.6s
17:	learn: 1.0658003	total: 9.2s	remaining: 16.4s
18:	learn: 1.0569593	total: 9.9s	remaining: 16.2s
19:	learn: 1.0491044	total: 10.6s	remaining: 15.9s

In [726]:
for i in top_5_classes:
    print(i)

['US' 'NDF' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'ES']
['NDF' 'US' 'other' 'FR' 'ES']
['US' 'NDF' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['US' 'NDF' 'other' 'FR' 'IT']
['NDF' 'US' 'other' 'FR' 'IT']
['NDF' '

In [696]:
feature_importance = model.feature_importances_
feature_names = X_train.columns

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance score in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [None]:
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, palette='viridis')

plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.show()

In [678]:
y_test.index

Index([54860, 33007, 45521, 14309, 48817, 55287, 52346, 15321, 25429, 51067,
       ...
       19520, 64808, 41666,  6565, 37813,  8138,  9890, 57688, 47350, 14877],
      dtype='int64', length=14763)

# test 데이터 형식으로 변환

In [679]:
# y_test.index에 해당하는 유저 ID와 예측된 나라들을 추출
user_ids = merged_df.loc[y_test.index, 'user_id']

# 데이터프레임으로 변환하여 출력
output_data = []
for user_id, countries in zip(user_ids, top_5_classes):
    for country in countries:
        output_data.append({'id': user_id, 'country': country})

output_df = pd.DataFrame(output_data)
print(output_df)

               id country
0      5nnomvo7dx      US
1      5nnomvo7dx     NDF
2      5nnomvo7dx   other
3      5nnomvo7dx      FR
4      5nnomvo7dx      IT
...           ...     ...
73810  vukjz4f059     NDF
73811  vukjz4f059      US
73812  vukjz4f059   other
73813  vukjz4f059      FR
73814  vukjz4f059      GB

[73815 rows x 2 columns]


# TF - IDF

In [680]:
merged_df2 = merged_df.copy()

In [681]:
# tf-idf
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df2['user_flow'])

# TF-IDF를 기존 데이터프레임에 추가
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
merged_df_with_tfidf = pd.concat([merged_df2, tfidf_df], axis=1)

# TF-IDF를 적용한 데이터에서 불필요한 열 제거
merged_df_with_tfidf.drop(['user_flow'], axis=1, inplace=True)

In [682]:
merged_df_with_tfidf

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,why_host,widget,wishlist,wishlist_content_update,wishlist_note,wishlists,your_listings,your_reservations,your_trips,zendesk_login_jwt
0,d1mm9tcy42,2014-01-01,2014-01-01 00:09:36,2014-01-04,MALE,basic,0,en,sem-non-brand,google,...,0.0,0.0,0.0,0.251403,0.0,0.0,0.000000,0.0,0.000000,0.0
1,yo8nz8bqcq,2014-01-01,2014-01-01 00:15:58,NaT,-unknown-,basic,0,en,direct,direct,...,0.0,0.0,0.0,0.152975,0.0,0.0,0.000000,0.0,0.000000,0.0
2,4grx6yxeby,2014-01-01,2014-01-01 00:16:39,NaT,-unknown-,basic,0,en,sem-brand,google,...,0.0,0.0,0.0,0.057185,0.0,0.0,0.000000,0.0,0.000000,0.0
3,ncf87guaf0,2014-01-01,2014-01-01 00:21:46,NaT,-unknown-,basic,0,en,direct,direct,...,0.0,0.0,0.0,0.078482,0.0,0.0,0.000000,0.0,0.000000,0.0
4,4rvqpxoh3h,2014-01-01,2014-01-01 00:26:19,2014-01-02,-unknown-,basic,25,en,direct,direct,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73810,zxodksqpep,2014-06-30,2014-06-30 23:56:36,NaT,MALE,basic,0,en,sem-brand,google,...,0.0,0.0,0.0,0.023646,0.0,0.0,0.055504,0.0,0.047922,0.0
73811,mhewnxesx9,2014-06-30,2014-06-30 23:57:19,NaT,-unknown-,basic,0,en,direct,direct,...,0.0,0.0,0.0,0.142619,0.0,0.0,0.000000,0.0,0.000000,0.0
73812,6o3arsjbb4,2014-06-30,2014-06-30 23:57:54,NaT,-unknown-,basic,0,en,direct,direct,...,0.0,0.0,0.0,0.136141,0.0,0.0,0.000000,0.0,0.000000,0.0
73813,jh95kwisub,2014-06-30,2014-06-30 23:58:22,NaT,-unknown-,basic,25,en,other,other,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0


In [729]:
X = merged_df_with_tfidf.drop(['user_id', 'date_account_created', 'timestamp_first_active', 'date_first_booking', 'country_destination'], axis=1)
y = merged_df_with_tfidf['country_destination']

cat_features = ['gender', 'signup_method', 'language', 'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser',
                'most_frequent_device']


# 훈련 데이터와 테스트 데이터로 나누기
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=random_seed)
for train_index, test_index in sss.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]


model = CatBoostClassifier(iterations=50, 
                            learning_rate=0.1,
                            depth=6, 
                            loss_function='MultiClass',
                            cat_features=cat_features,
                            random_seed=random_seed)

model.fit(X_train, y_train, verbose=10)
y_pred_probs = model.predict_proba(X_test)


# 가장 높은 확률 5개의 index를 추출
top_5_indices = np.argsort(y_pred_probs, axis=1)[:, ::-1][:, :5]

# 모델에서 사용한 카테고리 목록
classes = model.classes_

# 순서대로 예측한 나라 목록
top_5_classes = np.array(classes)[top_5_indices]


ndcg = calculate_ndcg(top_5_classes, y_test)
print("NDCG:", ndcg)


0:	learn: 1.9640580	total: 922ms	remaining: 45.2s
10:	learn: 1.1200507	total: 9.34s	remaining: 33.1s
20:	learn: 0.9785864	total: 18.9s	remaining: 26.1s
30:	learn: 0.9383355	total: 28.8s	remaining: 17.6s
40:	learn: 0.9244258	total: 39.5s	remaining: 8.67s
49:	learn: 0.9175047	total: 49.8s	remaining: 0us
NDCG: 0.85215943871506


In [None]:
feature_importance = model.feature_importances_
feature_names = X_train.columns

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance score in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
feature_importance_df[:15]

In [None]:
# 예측된 상위 5개의 나라 확인
top5_pred_indices = np.argsort(y_pred_probs, axis=1)[:, ::-1][:, :5]

# Convert top 5 indices to country labels
top5_pred_countries = model.classes_[top5_pred_indices]

print("Top 5 Predicted Countries:")
print(top5_pred_countries)

In [None]:
feature_importance = model.feature_importances_
feature_names = X_train.columns

# Create a DataFrame to store feature importances
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance score in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [None]:
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df[:20], palette='viridis')

plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importances')
plt.show()

# test airbnb 

In [706]:
test = pd.read_csv('test_users.csv')
session_test = pd.read_csv('sessions.csv')

In [707]:
# train 전처리
test.rename(columns={'id':'user_id'},inplace=True)
test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'], format = '%Y%m%d%H%M%S')
test['date_account_created'] = pd.to_datetime(test['date_account_created'])
test['date_first_booking'] = pd.to_datetime(test['date_first_booking'], errors='coerce')

In [708]:

test['age_group'] = test['age'].apply(calculate_age_group)
if 'age' in test.columns:
    test.drop(columns=['age'], inplace=True)
test.loc[:, 'first_affiliate_tracked'].fillna('untracked', inplace=True)

session_test['action'].replace('-unknown-', 'unknown', inplace=True)
session_test['action_type'].replace('-unknown-', 'unknown', inplace=True)
session_test['action_detail'].replace('-unknown-', 'unknown', inplace=True)
session_test['device_type'].replace('-unknown-', 'unknown', inplace=True)

session_test['action'].fillna('unknown', inplace=True)
session_test['action_type'].fillna('unknown', inplace=True)
session_test['action_detail'].fillna('unknown', inplace=True)
session_test['secs_elapsed'].fillna(0, inplace=True)


session_test['user_flow'] = session_test['action'].astype('str') + '+' + session_test['action_type'].astype('str') + '+' + session_test['action_detail'].astype('str')

user_flows_concatenated = session_test.groupby('user_id')['user_flow'].apply(lambda x: ','.join(x)).reset_index()
merged_session = pd.merge(session_test.drop(columns=['user_flow']), user_flows_concatenated, on='user_id', how='left')

# user_id별 평균 secs_elapsed column 생성
merged_session['mean_secs_elapsed'] = merged_session.groupby('user_id')['secs_elapsed'].transform('mean')
# user_id별 가장 빈도수가 높은 device_type column 생성
merged_session['most_frequent_device'] = merged_session.groupby('user_id')['device_type'].transform(lambda x: x.mode()[0])
# user_id별 세션 총 개수
merged_session['session_count'] = merged_session.groupby('user_id')['action'].transform('count')

session_df2 = merged_session.copy()
session_df2.drop(['action', 'action_type', 'action_detail', 'device_type', 'secs_elapsed'], axis=1, inplace=True)
session_df2.drop_duplicates(subset='user_id', inplace=True)

test_df = pd.merge(test, session_df2, on='user_id', how='inner')


In [710]:

booking_action = ["apply_coupon_click_success", "apply_reservation", "booking","change_currency", "coupon_code_click",
"pay", "payment_methods", "print_confirmation", "rate","receipt","guest_billing_receipt", "recent_reservations"]

booking_action_detail = ["apply_coupon","apply_coupon_click", "apply_coupon_click_success","apply_coupon_error", "booking", "book_it",
"change_availability", "change_or_alter"]
# request, message, pending

language_action = ["ajax_google_translate", "ajax_google_translate_description","ajax_google_translate_reviews","change_currency",
"languages_multiselect","country_options", "southern-europe","spoken_languages"]

language_action_detail = ["translate_listing_reviews","translations","user_languages"]

# 사용자별로 예약 액션 횟수를 계산합니다.
booking_action_count = session_df[session_df['action'].isin(booking_action)].groupby('user_id')['action'].count().reset_index()
booking_action_count.rename(columns={'action': 'booking_action_count'}, inplace=True)
test_df = test_df.merge(booking_action_count, on='user_id', how='left')
test_df['booking_action_count'] = test_df['booking_action_count'].fillna(0).astype(int)


booking_detail_count = session_df[session_df['action_detail'].isin(booking_action_detail)].groupby('user_id')['action_detail'].count().reset_index()
booking_detail_count.rename(columns={'action_detail': 'booking_action_detail_count'}, inplace=True)
test_df = test_df.merge(booking_detail_count, on='user_id', how='left')
test_df['booking_action_detail_count'] = test_df['booking_action_detail_count'].fillna(0).astype(int)


booking_action_count = session_df[session_df['action'].isin(language_action)].groupby('user_id')['action'].count().reset_index()
booking_action_count.rename(columns={'action': 'language_action_count'}, inplace=True)
test_df = test_df.merge(booking_action_count, on='user_id', how='left')
test_df['language_action_count'] = test_df['language_action_count'].fillna(0).astype(int)

booking_detail_count = session_df[session_df['action_detail'].isin(language_action_detail)].groupby('user_id')['action_detail'].count().reset_index()
booking_detail_count.rename(columns={'action_detail': 'language_action_detail_count'}, inplace=True)
test_df = test_df.merge(booking_detail_count, on='user_id', how='left')
test_df['language_action_detail_count'] = test_df['language_action_detail_count'].fillna(0).astype(int)



In [727]:
test_df

Unnamed: 0,user_id,date_account_created,timestamp_first_active,date_first_booking,gender,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,...,first_browser,age_group,user_flow,mean_secs_elapsed,most_frequent_device,session_count,booking_action_count,booking_action_detail_count,language_action_count,language_action_detail_count
0,5uwns89zht,2014-07-01,2014-07-01 00:00:06,NaT,FEMALE,facebook,0,en,direct,direct,...,Mobile Safari,4,"show+view+user_profile,search+click+view_searc...",14898.375000,unknown,8.0,0,0,0,0
1,jtl0dijy2j,2014-07-01,2014-07-01 00:00:51,NaT,-unknown-,basic,0,en,direct,direct,...,Mobile Safari,1,"dashboard+view+dashboard,login+view+login_page...",13164.157895,unknown,19.0,0,0,0,0
2,xx0ulgorjt,2014-07-01,2014-07-01 00:01:48,NaT,-unknown-,basic,0,en,direct,direct,...,Chrome,1,"index+view+view_search_results,index+view+view...",16820.258621,Windows Desktop,58.0,0,0,0,0
3,6c6puo6ix0,2014-07-01,2014-07-01 00:02:15,NaT,-unknown-,basic,0,en,direct,direct,...,IE,1,"personalize+data+wishlist_content_update,heade...",11181.909091,Windows Desktop,11.0,0,0,0,0
4,czqhjk3yfe,2014-07-01,2014-07-01 00:03:05,NaT,-unknown-,basic,0,en,direct,direct,...,Safari,1,message_to_host_change+click+message_to_host_c...,23895.947368,Mac Desktop,19.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61663,cv0na2lf5a,2014-09-30,2014-09-30 23:52:32,NaT,-unknown-,basic,0,en,direct,direct,...,IE,4,"confirm_email+click+confirm_email_link,authent...",27249.806452,Windows Desktop,93.0,0,0,1,0
61664,zp8xfonng8,2014-09-30,2014-09-30 23:53:06,NaT,-unknown-,basic,23,ko,direct,direct,...,-unknown-,1,"index+view+view_reservations,campaigns+unknown...",2580.900000,Android Phone,20.0,0,0,0,0
61665,fa6260ziny,2014-09-30,2014-09-30 23:54:08,NaT,-unknown-,basic,0,de,direct,direct,...,Firefox,1,"index+view+view_search_results,my+view+user_wi...",10555.089744,Windows Desktop,78.0,0,0,0,0
61666,87k0fy4ugm,2014-09-30,2014-09-30 23:54:30,NaT,-unknown-,basic,0,en,sem-brand,google,...,Safari,1,"show+unknown+unknown,update+submit+update_list...",23471.200000,Mac Desktop,15.0,0,0,0,0


# 머신 학습 및 결과

In [711]:
# NDCG metric 이용하여 확인
X_train = merged_df.drop(['user_id', 'date_first_booking', 'country_destination'], axis=1)
y_train = merged_df['country_destination']

X_test = test_df.drop(['user_id', 'date_first_booking'], axis=1)

cat_features = ['gender', 'signup_method', 'language', 'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'user_flow',
                'most_frequent_device']


model = CatBoostClassifier(iterations=1000, 
                            learning_rate=0.03, 
                            depth=6, 
                            loss_function='MultiClass',
                            cat_features=cat_features,
                            random_seed=random_seed)

model.fit(X_train, y_train)

# 각 나라에 대해서 확률 값을 나타냄 
y_pred_probs = model.predict_proba(X_test, verbose=100)




lb = LabelBinarizer()
y_train_bin = lb.fit_transform(y_train)

# 가장 높은 확률 5개의 index를 추출
top_5_indices = np.argsort(y_pred_probs, axis=1)[:, ::-1][:, :5]

# 모델에서 사용한 카테고리 목록
classes = model.classes_

# 순서대로 예측한 나라 목록
top_5_classes = np.array(classes)[top_5_indices]


0:	learn: 1.9973407	total: 776ms	remaining: 38s
1:	learn: 1.7792888	total: 1.4s	remaining: 33.6s
2:	learn: 1.6314999	total: 2.02s	remaining: 31.7s
3:	learn: 1.5234047	total: 2.69s	remaining: 30.9s
4:	learn: 1.4394149	total: 3.44s	remaining: 30.9s
5:	learn: 1.3714065	total: 4.13s	remaining: 30.3s
6:	learn: 1.3173616	total: 4.66s	remaining: 28.6s
7:	learn: 1.2720868	total: 5.31s	remaining: 27.9s
8:	learn: 1.2341907	total: 5.84s	remaining: 26.6s
9:	learn: 1.2025339	total: 6.47s	remaining: 25.9s
10:	learn: 1.1752808	total: 6.99s	remaining: 24.8s
11:	learn: 1.1516351	total: 7.57s	remaining: 24s
12:	learn: 1.1322075	total: 7.89s	remaining: 22.5s
13:	learn: 1.1147804	total: 8.67s	remaining: 22.3s
14:	learn: 1.0993778	total: 9.41s	remaining: 22s
15:	learn: 1.0856962	total: 10.1s	remaining: 21.5s
16:	learn: 1.0744723	total: 10.6s	remaining: 20.6s
17:	learn: 1.0646793	total: 11.3s	remaining: 20.1s
18:	learn: 1.0560071	total: 12.1s	remaining: 19.7s
19:	learn: 1.0483761	total: 12.9s	remaining: 19.

In [718]:
top_5_classes

array([['NDF', 'US', 'other', 'FR', 'IT'],
       ['NDF', 'US', 'other', 'FR', 'IT'],
       ['NDF', 'US', 'other', 'FR', 'IT'],
       ...,
       ['NDF', 'US', 'other', 'FR', 'IT'],
       ['NDF', 'US', 'other', 'FR', 'IT'],
       ['US', 'NDF', 'other', 'FR', 'IT']], dtype=object)

In [721]:
# 데이터프레임으로 변환하여 출력
output_data = []
for user_id, countries in zip(test_df['user_id'], top_5_classes):
    for country in countries:
        output_data.append({'id': user_id, 'country': country})

output_df = pd.DataFrame(output_data)
output_df.to_csv("test.csv",index=False)
#print(output_df)

# 결과

Score: 0.86318  
Public score: 0.85902

# TF-IDF

In [None]:
merged_df2 = merged_df.copy()