<a href="https://colab.research.google.com/github/karellen-kim/training-search/blob/main/1_pointwise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [None]:
# Synthetic data: seed NumPy's legacy global RNG so every run produces the same rows.
np.random.seed(42)

# Configuration
queries = [
    "best pizza", "cheap sushi", "cozy cafe", "vegan burger", "romantic restaurant",
    "24hr diner", "late night ramen", "fast delivery food", "healthy lunch", "craft beer bar"
]
n_queries = len(queries)
docs_per_query = 20
rows = []
doc_id = 1

# Weights of the linear click score: review_score * weight_review - distance_km * weight_distance
weight_review = 1.0
weight_distance = 1.5  # distance is penalised more heavily than reviews are rewarded
click_threshold = 3.5  # a document is labelled clicked when click_score is strictly above this

for qid, query in enumerate(queries, start=1):
    for i in range(docs_per_query):
        # Keep the draw order (review first, then distance): the seeded random
        # stream, and therefore every generated row, depends on it.
        review = np.round(np.random.normal(loc=4.0, scale=0.4), 2)
        review = np.clip(review, 3.0, 5.0)

        distance = np.round(np.random.exponential(scale=1.0), 2)
        distance = np.clip(distance, 0.1, 5.0)

        click_score = review * weight_review - distance * weight_distance

        rows.append({
            "query_id": qid,
            "query": query,
            "doc_id": doc_id,  # globally unique across queries, starting at 1
            "doc_name": f"Doc {doc_id}",
            "review_score": review,
            "distance_km": distance,
            "click_score": click_score,
            "clicked": int(click_score > click_threshold),
        })
        doc_id += 1

# Build the DataFrame once from the accumulated records
df = pd.DataFrame(rows)

# Inspect the first rows
df.head(10)

Unnamed: 0,query_id,query,doc_id,doc_name,review_score,distance_km,click_score,clicked
0,1,best pizza,1,Doc 1,4.2,1.32,2.22,0
1,1,best pizza,2,Doc 2,3.94,0.91,2.575,0
2,1,best pizza,3,Doc 3,3.91,0.1,3.76,1
3,1,best pizza,4,Doc 4,3.91,2.01,0.895,0
4,1,best pizza,5,Doc 5,4.63,0.1,4.48,1
5,1,best pizza,6,Doc 6,4.31,3.5,-0.94,0
6,1,best pizza,7,Doc 7,3.81,0.2,3.51,1
7,1,best pizza,8,Doc 8,4.22,0.2,3.92,1
8,1,best pizza,9,Doc 9,4.1,0.57,3.245,0
9,1,best pizza,10,Doc 10,3.23,0.34,2.72,0


In [None]:
# Features are the two observable document signals; the target is the binary click label.
feature_columns = ['review_score', 'distance_km']
X = df[feature_columns]
y = df['clicked']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Pointwise ranking is modelled as plain binary classification: every
# (query, document) row is scored independently with a click probability.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
)

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [None]:
# Peek at two held-out feature rows; the index labels are the row labels of the original df.
X_test.head(2)

Unnamed: 0,review_score,distance_km
95,3.71,0.18
15,4.04,0.19


In [None]:
# Keep only column 1 of predict_proba, used here as the click probability.
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Two-decimal view for display only; y_pred_prob itself keeps full precision.
rounded = y_pred_prob.round(2)
rounded

array([0.13, 0.98, 1.  , 0.03, 0.01, 0.  , 0.99, 0.  , 0.98, 0.91, 0.  ,
       0.01, 0.  , 0.  , 0.97, 0.  , 0.17, 0.91, 0.  , 1.  , 0.96, 0.  ,
       0.  , 0.  , 0.44, 0.47, 0.01, 1.  , 0.99, 0.  , 0.  , 1.  , 0.01,
       0.  , 0.  , 0.  , 0.  , 0.01, 0.5 , 0.5 ], dtype=float32)

In [None]:
# Recover the full rows of the held-out documents from df via X_test's index
# and attach the predicted click probability as a new column.
test_df = df.loc[X_test.index].assign(predicted_prob=y_pred_prob)

# Rank by predicted probability, highest first
test_df_sorted = test_df.sort_values('predicted_prob', ascending=False)
test_df_sorted.head(5)

Unnamed: 0,query_id,query,doc_id,doc_name,review_score,distance_km,click_score,clicked,predicted_prob
30,2,cheap sushi,31,Doc 31,4.13,0.1,3.98,1,0.998866
68,4,vegan burger,69,Doc 69,4.02,0.1,3.87,1,0.997898
67,4,vegan burger,68,Doc 68,3.99,0.1,3.84,1,0.997768
18,1,best pizza,19,Doc 19,3.99,0.1,3.84,1,0.997768
69,4,vegan burger,70,Doc 70,4.99,0.33,4.495,1,0.994161


In [None]:
# Three hand-written documents to score.
# NOTE: review_score 2.2 is below the 3.0 floor the generated training data was
# clipped to, so the third prediction is an extrapolation.
X_features = pd.DataFrame({
    'review_score': [4.8, 4.2, 2.2],
    'distance_km': [0.2, 5.0, 0.1],
})

# Predicted click probability for each row
pred_probs = model.predict_proba(X_features)[:, 1]
pred_probs

array([0.9983158 , 0.00765158, 0.5039054 ], dtype=float32)