<a href="https://colab.research.google.com/github/karellen-kim/training-search/blob/main/3_listwise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
import numpy as np

np.random.seed(42)

# 예시 쿼리 3개, 문서당 5개
queries = ["best pizza", "cheap sushi", "vegan burger"]
docs = []

for qid, query in enumerate(queries):
    for i in range(5):
        review = np.round(np.random.normal(4.0, 0.3), 2)
        review = np.clip(review, 3.0, 5.0)
        distance = np.round(np.random.exponential(1.0), 2)
        distance = np.clip(distance, 0.1, 5.0)
        relevance = int(review * 1.0 - distance * 1.5)  # 새로운 relevance 계산
        docs.append({
            'query': query,
            'qid': qid,
            'doc_id': f"{query}_doc{i}",
            'review_score': review,
            'distance_km': distance,
            'relevance': relevance
        })

df = pd.DataFrame(docs)
df.head(5)

Unnamed: 0,query,qid,doc_id,review_score,distance_km,relevance
0,best pizza,0,best pizza_doc0,4.15,1.32,2
1,best pizza,0,best pizza_doc1,3.96,0.91,2
2,best pizza,0,best pizza_doc2,3.93,0.1,3
3,best pizza,0,best pizza_doc3,3.93,2.01,0
4,best pizza,0,best pizza_doc4,4.47,0.1,4


In [22]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# 2. 쿼리 단위로 분할 (query group 유지)
unique_qids = df['qid'].unique()
train_qids, test_qids = train_test_split(unique_qids, test_size=0.4, random_state=42)

train_df = df[df['qid'].isin(train_qids)].copy()
test_df = df[df['qid'].isin(test_qids)].copy()

# 3. 학습 데이터 준비
X_train = train_df[['review_score', 'distance_km']]
y_train = train_df['relevance']
group_train = train_df.groupby('qid').size().values

train_data = lgb.Dataset(X_train, label=y_train, group=group_train)

# 4. 모델 파라미터 설정
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],
    'learning_rate': 0.1,
    'num_leaves': 15,
    'min_data_in_leaf': 1,
    'verbose': -1
}

# 5. 모델 학습
model = lgb.train(params, train_data, num_boost_round=100)

# 6. 테스트 예측
X_test = test_df[['review_score', 'distance_km']]
y_pred = model.predict(X_test)

test_df = test_df.copy()
test_df['pred'] = y_pred

# 7. 쿼리별 정렬된 결과 보기
for qid in test_df['qid'].unique():
    query_text = test_df.loc[test_df['qid'] == qid, 'query'].iloc[0]
    print(f"\n[Query ID {qid} - '{query_text}']")
    display(
        test_df[test_df['qid'] == qid][['doc_id', 'relevance', 'pred']]
        .sort_values(by='pred', ascending=False)
    )


[Query ID 0 - 'best pizza']


Unnamed: 0,doc_id,relevance,pred
4,best pizza_doc4,4,0.15836
2,best pizza_doc2,3,0.016721
0,best pizza_doc0,2,-0.2
1,best pizza_doc1,2,-0.2
3,best pizza_doc3,0,-0.2



[Query ID 1 - 'cheap sushi']


Unnamed: 0,doc_id,relevance,pred
7,cheap sushi_doc2,3,0.15836
8,cheap sushi_doc3,3,0.15836
6,cheap sushi_doc1,3,0.016721
9,cheap sushi_doc4,2,0.016721
5,cheap sushi_doc0,-1,-0.2
