<a href="https://colab.research.google.com/github/karellen-kim/training-search/blob/main/2_pairwise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [62]:
import pandas as pd
import numpy as np

np.random.seed(42)

# 쿼리 5개 정도 추가
queries = ["best pizza", "cheap sushi", "vegan burger", "romantic restaurant", "craft beer bar"]

# 각 쿼리마다 10개 문서 생성
doc_pool = {}

for query in queries:
    docs = []
    for i in range(10):
        review = np.round(np.random.normal(loc=4.0, scale=0.4), 2)
        review = np.clip(review, 3.0, 5.0)
        distance = np.round(np.random.exponential(scale=1.0), 2)
        distance = np.clip(distance, 0.1, 5.0)
        docs.append({
            "doc_name": f"{query}_doc{i+1}",
            "review_score": review,
            "distance_km": distance
        })
    doc_pool[query] = docs

def calc_click_score(review, distance, alpha=1.0, beta=1.5):
    """
    리뷰는 높을수록, 거리는 짧을수록 클릭 확률이 높은 score.
    """
    return review * alpha - distance * beta

pairs = []

for query, docs in doc_pool.items():
    # 클릭 점수 계산
    for doc in docs:
        doc['click_score'] = calc_click_score(doc['review_score'], doc['distance_km'])

    # 상위 12개 문서(top_docs)와 나머지(rest_docs) 분리
    sorted_docs = sorted(docs, key=lambda d: d['click_score'], reverse=True)
    top_docs = sorted_docs[:5]
    rest_docs = sorted_docs[5:]

    # top_doc × rest_doc 조합으로 pair 생성
    for top_doc in top_docs:
        for rest_doc in rest_docs:
            # top_doc > rest_doc → label = 1
            pairs.append({
                "query": query,
                "doc1_name": top_doc['doc_name'],
                "doc2_name": rest_doc['doc_name'],
                "review_diff": top_doc['review_score'] - rest_doc['review_score'],
                "dist_diff": top_doc['distance_km'] - rest_doc['distance_km'],
                "label": 1
            })

            # 반대 쌍도 생성 → label = 0
            pairs.append({
                "query": query,
                "doc1_name": rest_doc['doc_name'],
                "doc2_name": top_doc['doc_name'],
                "review_diff": rest_doc['review_score'] - top_doc['review_score'],
                "dist_diff": rest_doc['distance_km'] - top_doc['distance_km'],
                "label": 0
            })

df = pd.DataFrame(pairs)
df.head(5)

Unnamed: 0,query,doc1_name,doc2_name,review_diff,dist_diff,label
0,best pizza,best pizza_doc5,best pizza_doc10,1.4,-0.24,1
1,best pizza,best pizza_doc10,best pizza_doc5,-1.4,0.24,0
2,best pizza,best pizza_doc5,best pizza_doc2,0.69,-0.81,1
3,best pizza,best pizza_doc2,best pizza_doc5,-0.69,0.81,0
4,best pizza,best pizza_doc5,best pizza_doc1,0.43,-1.22,1


In [63]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

X = df[['review_diff', 'dist_diff']]
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False
)
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [64]:
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

for i in range(len(y_pred)):
    doc1 = df.iloc[X_test.index[i]]['doc1_name']
    doc2 = df.iloc[X_test.index[i]]['doc2_name']
    print(f"[{doc1} > {doc2}] 예측 확률: {y_pred_prob[i]:.2f}, 실제: {y_test.iloc[i]}")

[vegan burger_doc9 > vegan burger_doc7] 예측 확률: 0.89, 실제: 1
[best pizza_doc5 > best pizza_doc4] 예측 확률: 1.00, 실제: 1
[cheap sushi_doc7 > cheap sushi_doc1] 예측 확률: 0.01, 실제: 0
[cheap sushi_doc9 > cheap sushi_doc8] 예측 확률: 1.00, 실제: 1
[vegan burger_doc1 > vegan burger_doc7] 예측 확률: 0.98, 실제: 1
[romantic restaurant_doc8 > romantic restaurant_doc10] 예측 확률: 0.00, 실제: 0
[romantic restaurant_doc7 > romantic restaurant_doc9] 예측 확률: 0.00, 실제: 0
[romantic restaurant_doc10 > romantic restaurant_doc4] 예측 확률: 0.99, 실제: 1
[best pizza_doc6 > best pizza_doc5] 예측 확률: 0.00, 실제: 0
[vegan burger_doc5 > vegan burger_doc2] 예측 확률: 1.00, 실제: 1
[craft beer bar_doc6 > craft beer bar_doc8] 예측 확률: 0.99, 실제: 1
[craft beer bar_doc2 > craft beer bar_doc3] 예측 확률: 1.00, 실제: 1
[craft beer bar_doc9 > craft beer bar_doc3] 예측 확률: 0.99, 실제: 1
[craft beer bar_doc5 > craft beer bar_doc8] 예측 확률: 0.04, 실제: 1
[cheap sushi_doc7 > cheap sushi_doc9] 예측 확률: 0.00, 실제: 0
[craft beer bar_doc7 > craft beer bar_doc10] 예측 확률: 0.99, 실제: 1
[roma

In [68]:
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

# 예측 확률 & 클래스
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Accuracy
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy = {acc:.4f}") # 정답을 맞춘 비율

# AUC (확률 예측 품질)
auc = roc_auc_score(y_test, y_pred_prob)
print(f"AUC = {auc:.4f}") # 예측 확률이 잘 정렬되어 있음

# Log Loss
loss = log_loss(y_test, y_pred_prob)
print(f"Log Loss = {loss:.4f}") # 예측 값이 정답에 가까울 수록 0

Accuracy = 0.9867
AUC = 0.9986
Log Loss = 0.0549
