In [119]:
# !pip install shap

In [120]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shap

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, ndcg_score, make_scorer
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from scipy.stats import randint as sp_randint

In [2]:
def set_random_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
random_seed = 1106
set_random_seed(random_seed)

In [3]:
train = pd.read_csv('train_users_2.csv')
session = pd.read_csv('sessions.csv')

## train 전처리


In [4]:
# train 전처리
train.rename(columns={'id':'user_id'},inplace=True)
train['timestamp_first_active'] = pd.to_datetime(train['timestamp_first_active'], format = '%Y%m%d%H%M%S')
train['date_account_created'] = pd.to_datetime(train['date_account_created'])
train['date_first_booking'] = pd.to_datetime(train['date_first_booking'], errors='coerce')
train['first_affiliate_tracked'].fillna('untracked', inplace=True)

### age 전처리

In [5]:
train2 = train.copy()

In [6]:
# 나머지는 동일하게 처리하되 결측치는 평균으로 대체

# 18세 이하는 0으로
train2['age'] = train2['age'].apply(lambda x: 0 if x <= 18 else x)

# 1000이상인 값은 태어난 년도로 간주하여 계정 생성연도를 추출하여 나이를 채운다
current_year = pd.to_datetime(train2['date_account_created']).dt.year
birth_year = current_year - train2['age']
train2['age'] = np.where(train2['age'] > 1000, np.where(birth_year > 0, birth_year, 0), train2['age'])

# 결측치는 평균으로 대체
train2['age'].fillna(train2['age'].mean(), inplace=True)

## session 전처리

In [8]:
# user_id가 존재하지 않는 것은 삭제 -> train이랑 머지가 불가능한 데이터
session.dropna(subset='user_id', inplace=True)

session['action'].replace('-unknown-', 'unknown', inplace=True)
session['action_type'].replace('-unknown-', 'unknown', inplace=True)
session['action_detail'].replace('-unknown-', 'unknown', inplace=True)
session['device_type'].replace('-unknown-', 'unknown', inplace=True)

session['action'].fillna('unknown', inplace=True)
session['action_type'].fillna('unknown', inplace=True)
session['action_detail'].fillna('unknown', inplace=True)

session['secs_elapsed'].fillna(0, inplace=True)

# user_flow 컬럼 생성
session['user_flow'] = session['action'].astype('str') + '+' + session['action_type'].astype('str') + '+' + session['action_detail'].astype('str')

In [9]:
# user별 user_flow 합치기
user_flows_concatenated = session.groupby('user_id')['user_flow'].apply(lambda x: ','.join(x)).reset_index()

session = pd.merge(session.drop(columns=['user_flow']), user_flows_concatenated, on='user_id', how='left')

In [10]:
session_df = session.copy()

## feature engineering

In [11]:
# user_id별 평균 secs_elapsed column 생성
session_df['mean_secs_elapsed'] = session_df.groupby('user_id')['secs_elapsed'].transform('mean')
# user_id별 가장 빈도수가 높은 device_type column 생성
session_df['most_frequent_device'] = session_df.groupby('user_id')['device_type'].transform(lambda x: x.mode()[0])
# user_id별 세션 총 개수
session_df['session_count'] = session_df.groupby('user_id')['action'].transform('count')

In [12]:
session_df.drop(['action', 'action_type', 'action_detail', 'device_type', 'secs_elapsed'], axis=1, inplace=True)

In [13]:
session_df.drop_duplicates(subset='user_id', inplace=True)

## Train Merge

In [14]:
merged_df = pd.merge(train2, session_df, on='user_id', how='inner')

In [23]:
merged_df.shape

(73815, 20)

In [28]:
# tfidf
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_df['user_flow'])

# TF-IDF를 기존 데이터프레임에 추가
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_merged = pd.concat([merged_df, tfidf_df], axis=1)

# TF-IDF를 적용한 데이터에서 불필요한 열 제거
tfidf_merged.drop(['user_flow'], axis=1, inplace=True)

In [29]:
tfidf_merged.shape

(73815, 449)

# Test 전처리

In [16]:
test = pd.read_csv('test_users.csv')

In [17]:
test.isnull().sum()

id                             0
date_account_created           0
timestamp_first_active         0
date_first_booking         62096
gender                         0
age                        28876
signup_method                  0
signup_flow                    0
language                       0
affiliate_channel              0
affiliate_provider             0
first_affiliate_tracked       20
signup_app                     0
first_device_type              0
first_browser                  0
dtype: int64

In [18]:
# test 전처리
test.rename(columns={'id':'user_id'},inplace=True)
test['timestamp_first_active'] = pd.to_datetime(test['timestamp_first_active'], format = '%Y%m%d%H%M%S')
test['date_account_created'] = pd.to_datetime(test['date_account_created'])
test['date_first_booking'] = pd.to_datetime(test['date_first_booking'], errors='coerce')
test['first_affiliate_tracked'].fillna('untracked', inplace=True)

In [19]:
test2 = test.copy()

In [20]:
#나머지는 동일하게 처리하되 결측치는 평균으로 대체

# 18세 이하는 0으로
test2['age'] = test2['age'].apply(lambda x: 0 if x <= 18 else x)

# 1000이상인 값은 태어난 년도로 간주하여 계정 생성연도를 추출하여 나이를 채운다
current_year = pd.to_datetime(test2['date_account_created']).dt.year
birth_year = current_year - test2['age']
test2['age'] = np.where(test2['age'] > 1000, np.where(birth_year > 0, birth_year, 0), test2['age'])

# 결측치는 평균으로 대체
test2['age'].fillna(test2['age'].mean(), inplace=True)

In [24]:
test2.shape

(62096, 15)

In [25]:
# test merge
merged_test = pd.merge(test2, session_df, on='user_id', how='inner')

In [26]:
merged_test.shape

(61668, 19)

In [30]:
# tfidf
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(merged_test['user_flow'])

# TF-IDF를 기존 데이터프레임에 추가
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_test = pd.concat([merged_test, tfidf_df], axis=1)

# TF-IDF를 적용한 데이터에서 불필요한 열 제거
tfidf_test.drop(['user_flow'], axis=1, inplace=True)

In [31]:
tfidf_test.shape

(61668, 457)

## train / test column matching

In [32]:
print("train set:", tfidf_merged.shape)
print("test set:", tfidf_test.shape)

train set: (73815, 449)
test set: (61668, 457)


In [35]:
# 서로에게 없는 컬럼 확인
train_columns_only = set(column for column in tfidf_merged.columns if column != 'country_destination') - set(tfidf_test.columns)

test_columns_only = set(tfidf_test.columns) - set(tfidf_merged.columns)

In [36]:
print("train columns only:", len(train_columns_only))
print("test columns only:", len(test_columns_only))

train columns only: 49
test columns only: 58


In [37]:
# 서로에게 없는 것은 예측에 사용할 수 없으므로 삭제
tfidf_merged_processed = tfidf_merged.drop(columns=train_columns_only)
tfidf_test_processed = tfidf_test.drop(columns=test_columns_only)

In [39]:
# train의 user_id 이외에는 동일
print("train set:", tfidf_merged_processed.shape)
print("test set:", tfidf_test_processed.shape)

train set: (73815, 400)
test set: (61668, 399)


In [44]:
ids = tfidf_test_processed['user_id']
len(ids)

61668

In [45]:
tfidf_test_processed.drop(['user_id','date_first_booking'], axis=1, inplace=True)

In [46]:
print("train set:", tfidf_merged_processed.shape)
print("test set:", tfidf_test_processed.shape)

train set: (73815, 400)
test set: (61668, 397)


## Modeling

In [40]:
cat_features3 = ['gender', 'signup_method', 'language', 'affiliate_channel', 'affiliate_provider',
                'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser',
                'most_frequent_device']

In [47]:
X = tfidf_merged_processed.drop(['user_id', 'date_first_booking', 'country_destination'], axis=1)
y = tfidf_merged_processed['country_destination']

In [48]:
n_splits = 3

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

model = CatBoostClassifier(iterations=100, 
                           learning_rate=0.1, 
                           depth=6, 
                           loss_function='MultiClass',
                           cat_features=cat_features3,
                           random_seed=random_seed,
                           eval_metric='MultiClass',
                           verbose=20)

ndcg_scores = []

lb = LabelBinarizer()
lb.fit(y)

for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    
    model.fit(X_train_fold, y_train_fold, eval_set=(X_test_fold, y_test_fold))
    
    y_pred_probs_fold = model.predict_proba(X_test_fold)
    # k=5
    ndcg_fold = ndcg_score(lb.transform(y_test_fold), y_pred_probs_fold, k=5)
    ndcg_scores.append(ndcg_fold)

mean_ndcg = np.mean(ndcg_scores)
print("Mean NDCG Score:", mean_ndcg)

0:	learn: 1.9778115	test: 1.9791817	best: 1.9791817 (0)	total: 893ms	remaining: 1m 28s
20:	learn: 0.9877456	test: 0.9925559	best: 0.9925559 (20)	total: 18.3s	remaining: 1m 8s
40:	learn: 0.9299216	test: 0.9398013	best: 0.9398013 (40)	total: 36.6s	remaining: 52.6s
60:	learn: 0.9171119	test: 0.9313468	best: 0.9313468 (60)	total: 55.2s	remaining: 35.3s
80:	learn: 0.9087066	test: 0.9277485	best: 0.9277485 (80)	total: 1m 14s	remaining: 17.5s
99:	learn: 0.9022379	test: 0.9249172	best: 0.9249172 (99)	total: 1m 32s	remaining: 0us

bestTest = 0.9249172202
bestIteration = 99

0:	learn: 1.9720698	test: 1.9733008	best: 1.9733008 (0)	total: 903ms	remaining: 1m 29s
20:	learn: 0.9823973	test: 0.9907683	best: 0.9907683 (20)	total: 20.8s	remaining: 1m 18s
40:	learn: 0.9265822	test: 0.9411513	best: 0.9411513 (40)	total: 39.4s	remaining: 56.6s
60:	learn: 0.9135375	test: 0.9331463	best: 0.9331463 (60)	total: 59.2s	remaining: 37.8s
80:	learn: 0.9041464	test: 0.9290678	best: 0.9290678 (80)	total: 1m 17s	rema

In [51]:
pred_y_test = model.predict_proba(tfidf_test_processed)

In [53]:
pred_y_test.shape

(61668, 12)

In [57]:
top_k = 5  

cts = []

for i in range(len(pred_y_test)):
    sorted_indices = np.argsort(pred_y_test[i])[::-1][:top_k]  
    sorted_indices_array = np.array(sorted_indices) 
    top_countries = lb.classes_[sorted_indices_array] 
    cts.append(top_countries)


cts_flat = [country for sublist in cts for country in sublist]

submission_df = pd.DataFrame({'id': np.repeat(ids, 5), 'country': cts_flat})

In [62]:
# 예측 결과 저장
submission_df.to_csv('sub1.csv', index=False)

In [69]:
feature_importances = model.feature_importances_

features = X.columns

importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importances})

importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df

Unnamed: 0,Feature,Importance
3,age,29.821394
72,booking_request,15.056176
2,gender,4.668679
88,change_trip_characteristics,3.225038
205,message_post,3.131205
...,...,...
164,host_guarantee,0.000000
163,hospitality,0.000000
162,home_safety_terms,0.000000
161,home_safety_landing,0.000000


# Choosing the best model

In [63]:
X2 = tfidf_merged_processed.drop(['user_id', 'date_first_booking', 'country_destination'], axis=1)
y2 = tfidf_merged_processed['country_destination']

In [64]:
n_splits = 3

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

model_2 = CatBoostClassifier(iterations=100, 
                           learning_rate=0.1, 
                           depth=6, 
                           loss_function='MultiClass',
                           cat_features=cat_features3,
                           random_seed=random_seed,
#                            eval_metric='MultiClass',
                           verbose=20)

ndcg_scores = []

lb = LabelBinarizer()
lb.fit(y2)

for train_index, test_index in skf.split(X2, y2):
    X_train_fold, X_test_fold = X2.iloc[train_index], X2.iloc[test_index]
    y_train_fold, y_test_fold = y2.iloc[train_index], y2.iloc[test_index]
    
    model_2.fit(X_train_fold, y_train_fold, eval_set=(X_test_fold, y_test_fold))
    
    y_pred_probs_fold = model.predict_proba(X_test_fold)
    # k=5
    ndcg_fold = ndcg_score(lb.transform(y_test_fold), y_pred_probs_fold)
    ndcg_scores.append(ndcg_fold)

mean_ndcg = np.mean(ndcg_scores)
print("Mean NDCG Score:", mean_ndcg)

0:	learn: 1.9778115	test: 1.9791817	best: 1.9791817 (0)	total: 834ms	remaining: 1m 22s
20:	learn: 0.9877456	test: 0.9925559	best: 0.9925559 (20)	total: 17.9s	remaining: 1m 7s
40:	learn: 0.9299216	test: 0.9398013	best: 0.9398013 (40)	total: 35.7s	remaining: 51.4s
60:	learn: 0.9171119	test: 0.9313468	best: 0.9313468 (60)	total: 52.8s	remaining: 33.7s
80:	learn: 0.9087066	test: 0.9277485	best: 0.9277485 (80)	total: 1m 10s	remaining: 16.6s
99:	learn: 0.9022379	test: 0.9249172	best: 0.9249172 (99)	total: 1m 27s	remaining: 0us

bestTest = 0.9249172202
bestIteration = 99

0:	learn: 1.9720698	test: 1.9733008	best: 1.9733008 (0)	total: 851ms	remaining: 1m 24s
20:	learn: 0.9823973	test: 0.9907683	best: 0.9907683 (20)	total: 18.8s	remaining: 1m 10s
40:	learn: 0.9265822	test: 0.9411513	best: 0.9411513 (40)	total: 36.5s	remaining: 52.6s
60:	learn: 0.9135375	test: 0.9331463	best: 0.9331463 (60)	total: 54.4s	remaining: 34.8s
80:	learn: 0.9041464	test: 0.9290678	best: 0.9290678 (80)	total: 1m 12s	rema

In [65]:
pred_y_test = model.predict_proba(tfidf_test_processed)

In [66]:
top_k = 5  # 상위 5개 국가만

cts = []

for i in range(len(pred_y_test)):
    sorted_indices = np.argsort(pred_y_test[i])[::-1][:top_k]  
    sorted_indices_array = np.array(sorted_indices)  
    top_countries = lb.classes_[sorted_indices_array]  
    cts.append(top_countries)


cts_flat = [country for sublist in cts for country in sublist]

submission_df = pd.DataFrame({'id': np.repeat(ids, 5), 'country': cts_flat})

In [68]:
# eval_metric이나 k값을 지정하지 않은 예측 결과는 동일했다.
submission_df.to_csv('sub2.csv', index=False)

### Submission result
- sub1.csv :
    - Public : 0.83144, Private : 0.83354