In [1]:
import pandas as pd    
import numpy as np
import pickle
from tqdm import tqdm
import json

from constants import *
from constants_id import *
from indexing import BasicInvertedIndex
from document_preprocessor import RegexTokenizer
from ranker import Ranker, BM25
from l2r import L2RFeatureExtractor, L2RRanker
from relevance import map_score, ndcg_score

In [2]:
document_preprocessor = RegexTokenizer('\\w+')
stopwords = set()
with open(STOPWORD_PATH, "r") as f:
    for word in f:
        stopwords.add(word.strip())

title_index = BasicInvertedIndex()
title_index.load(PAPER_TITLE_INDEX)
abstract_index = BasicInvertedIndex()
abstract_index.load(PAPER_ABSTRACT_INDEX)

100%|██████████| 199860/199860 [00:04<00:00, 44539.22it/s]
100%|██████████| 45129/45129 [00:20<00:00, 2182.56it/s] 


In [None]:
# print("Load docid list")
# with open(DOCID_LIST_PATH, 'rb') as f:
#     docid_list = pickle.load(f)

In [None]:
# print("Load categories")
# with open(DOC_CATEGORY_INFO_PATH, 'rb') as f:
#     doc_category_info = pickle.load(f)
# with open(RECOG_CATEGORY_PATH, 'rb') as f:
#     recognized_categories = pickle.load(f)

In [None]:
# print("Load year release")
# with open(DOCID_TO_YEAR_RELEASE_PATH, 'rb') as f:
#     docid_to_yr = pickle.load(f)

In [None]:
# print("Load citation")
# with open(DOCID_TO_CITATION_PATH, 'rb') as f:
#     docid_to_citation = pickle.load(f)

In [None]:
# print("Load network features")
# with open(DOCID_TO_NETWORK_FEATURES_PATH, 'rb') as f:
#     docid_to_network_features = pickle.load(f)

In [8]:
docid_list = []
with open(PAPER_DATA_PATH, 'r') as f:
    for i, line in enumerate(tqdm(f, total=TOTAL_PAPER_COUNT)):
        doc = json.loads(line)
        if doc['abstract'] == '' or doc['n_citation'] <=20:
            continue
        
        docid_list.append(doc['id'])

with open(DOCID_LIST_PATH, 'wb') as f:
    pickle.dump(docid_list, f, protocol=pickle.HIGHEST_PROTOCOL)

  2%|▏         | 131051/6404472 [00:06<04:52, 21453.78it/s]


KeyboardInterrupt: 

In [3]:
print("Load categories")
with open(ID_CATEGORY_INFO_PATH, 'rb') as f:
    doc_category_info = pickle.load(f)
with open(RECOG_CATEGORY_PATH, 'rb') as f:
    recognized_categories = pickle.load(f)
print("Load year release")
with open(ID_TO_YEAR_RELEASE_PATH, 'rb') as f:
    docid_to_yr = pickle.load(f)
print("Load citation")
with open(ID_TO_CITATION_PATH, 'rb') as f:
    docid_to_citation = pickle.load(f)
print("Load network features")
with open(ID_TO_NETWORK_FEATURES_PATH, 'rb') as f:
    docid_to_network_features = pickle.load(f)

Load categories
Load year release
Load citation
Load network features


In [4]:
print("Initializing Feature Extractor")
feature_extractor = L2RFeatureExtractor(abstract_index, title_index,
                doc_category_info, document_preprocessor, stopwords,
                recognized_categories, docid_to_network_features, docid_to_yr, docid_to_citation)

Initializing Feature Extractor


In [5]:
print("Initializing Ranker")
BM25scorer = BM25(abstract_index)
BM25Ranker = Ranker(abstract_index, document_preprocessor, stopwords, BM25scorer)

l2rRanker = L2RRanker(document_preprocessor, stopwords, BM25Ranker, feature_extractor)

# with open(BM25_RANKER_PATH, 'wb') as f:
#     pickle.dump(BM25Ranker, f, protocol=pickle.HIGHEST_PROTOCOL)
# with open(L2R_RANKER_PATH, 'wb') as f:
#     pickle.dump(l2rRanker, f, protocol=pickle.HIGHEST_PROTOCOL)

Initializing Ranker


In [6]:
id_col = 'docid'
test_rel_df = pd.read_csv("dataset/paper_test_data.csv")
query_list = test_rel_df['query'].unique()

boosting_type_list = ["gbdt", "rf"]
importance_type_list = ["split", "gain"]
n_estimators_list = [20, 50]
max_depth_list = [3, 8]
result_all = dict()
for i, boosting_type in enumerate(boosting_type_list):
    for j, importance_type in enumerate(importance_type_list):
        for k, n_estimators in enumerate(n_estimators_list):
            for l, max_depth in enumerate(max_depth_list):
                params = {
                    'boosting_type': boosting_type,
                    'importance_type': importance_type,
                    'n_estimators': n_estimators,
                    'max_depth': max_depth,
                }
                it = i*8 + j*4 + k*2 + l
                result_all[it] = params
                print("Iteration: ", it)
                print(params)
                l2rRanker.model.re_init(params=params)
                l2rRanker.train("dataset/paper_train_data.csv")

                map_list = []
                ndcg_list = []
                for query in tqdm(query_list):
                    rank_result = l2rRanker.query(query)
                    actual_rel = []
                    rel_selected_df = test_rel_df[test_rel_df['query'] == query]
                    rel_docid_list = list(rel_selected_df[id_col])

                    for result in rank_result:
                        if result[0] in rel_docid_list:
                            actual_rel.append(rel_selected_df[rel_selected_df[id_col] == result[0]].iloc[0]["rel"])
                        else:
                            actual_rel.append(1)

                    ideal_rel = sorted(actual_rel, reverse=True)
                    ndcg_list.append(ndcg_score(actual_rel, ideal_rel))
                print("mean NDCG: ", np.mean(ndcg_list))
                result_all[it]['mean_ndcg'] = np.mean(ndcg_list)

Iteration:  0
{'boosting_type': 'gbdt', 'importance_type': 'split', 'n_estimators': 20, 'max_depth': 3}


100%|██████████| 4520/4520 [00:00<00:00, 5887.17it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027342 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:54<00:00,  1.17s/it]


mean NDCG:  0.5092554354503779
Iteration:  1
{'boosting_type': 'gbdt', 'importance_type': 'split', 'n_estimators': 20, 'max_depth': 8}


100%|██████████| 4520/4520 [00:00<00:00, 5898.69it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:48<00:00,  1.13s/it]


mean NDCG:  0.493414045244047
Iteration:  2
{'boosting_type': 'gbdt', 'importance_type': 'split', 'n_estimators': 50, 'max_depth': 3}


100%|██████████| 4520/4520 [00:00<00:00, 5882.58it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:48<00:00,  1.13s/it]


mean NDCG:  0.5119943292331592
Iteration:  3
{'boosting_type': 'gbdt', 'importance_type': 'split', 'n_estimators': 50, 'max_depth': 8}


100%|██████████| 4520/4520 [00:00<00:00, 5869.10it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029900 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:53<00:00,  1.16s/it]


mean NDCG:  0.4933150767078196
Iteration:  4
{'boosting_type': 'gbdt', 'importance_type': 'gain', 'n_estimators': 20, 'max_depth': 3}


100%|██████████| 4520/4520 [00:00<00:00, 5907.44it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:50<00:00,  1.15s/it]


mean NDCG:  0.5092554354503779
Iteration:  5
{'boosting_type': 'gbdt', 'importance_type': 'gain', 'n_estimators': 20, 'max_depth': 8}


100%|██████████| 4520/4520 [00:00<00:00, 5906.48it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:51<00:00,  1.15s/it]


mean NDCG:  0.493414045244047
Iteration:  6
{'boosting_type': 'gbdt', 'importance_type': 'gain', 'n_estimators': 50, 'max_depth': 3}


100%|██████████| 4520/4520 [00:00<00:00, 5898.47it/s]


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013371 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:51<00:00,  1.15s/it]


mean NDCG:  0.5119943292331592
Iteration:  7
{'boosting_type': 'gbdt', 'importance_type': 'gain', 'n_estimators': 50, 'max_depth': 8}


100%|██████████| 4520/4520 [00:00<00:00, 5868.38it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029901 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 149/149 [02:52<00:00,  1.15s/it]


mean NDCG:  0.4933150767078196
Iteration:  8
{'boosting_type': 'rf', 'importance_type': 'split', 'n_estimators': 20, 'max_depth': 3}


100%|██████████| 4520/4520 [00:00<00:00, 5748.06it/s]
[LightGBM] [Fatal] Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /__w/1/s/lightgbm-python/src/boosting/rf.hpp, line 36 .



LightGBMError: Check failed: (config->bagging_freq > 0 && config->bagging_fraction < 1.0f && config->bagging_fraction > 0.0f) || (config->feature_fraction < 1.0f && config->feature_fraction > 0.0f) at /__w/1/s/lightgbm-python/src/boosting/rf.hpp, line 36 .


In [6]:
id_col = 'docid'
params = {'n_estimators': 20, 'max_depth': 3}
print(params)
l2rRanker.model.re_init(params=params)
l2rRanker.train("dataset/paper_train_data.csv")

for path in ["dataset/paper_train_data.csv", "dataset/paper_test_data.csv"]:
    test_rel_df = pd.read_csv(path)
    query_list = test_rel_df['query'].unique()
    map_list = []
    ndcg_list = []
    for query in tqdm(query_list):
        rank_result = l2rRanker.query(query)
        actual_rel = []
        rel_selected_df = test_rel_df[test_rel_df['query'] == query]
        rel_docid_list = list(rel_selected_df[id_col])

        for result in rank_result:
            if result[0] in rel_docid_list:
                actual_rel.append(rel_selected_df[rel_selected_df[id_col] == result[0]].iloc[0]["rel"])
            else:
                actual_rel.append(1)

        ideal_rel = sorted(actual_rel, reverse=True)
        ndcg_list.append(ndcg_score(actual_rel, ideal_rel))
    print(path)
    print("mean NDCG: ", np.mean(ndcg_list))

{'n_estimators': 20, 'max_depth': 3}


100%|██████████| 4520/4520 [00:00<00:00, 5378.67it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2490
[LightGBM] [Info] Number of data points in the train set: 4520, number of used features: 14


100%|██████████| 154/154 [03:24<00:00,  1.33s/it]


dataset/paper_train_data.csv
mean NDCG:  0.39899261914284045


100%|██████████| 149/149 [02:54<00:00,  1.17s/it]

dataset/paper_test_data.csv
mean NDCG:  0.5092554354503779





In [6]:
params = {'n_estimators': 20, 'max_depth': 3}

with open(L2R_RANKER_PATH, 'rb') as f:
    l2rRanker = pickle.load(f)

l2rRanker.train("dataset/paper_all_data.csv")
with open(L2R_RANKER_FITTED_PATH, 'wb') as f:
    pickle.dump(l2rRanker, f, protocol=pickle.HIGHEST_PROTOCOL)

100%|██████████| 5650/5650 [00:01<00:00, 5623.48it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2498
[LightGBM] [Info] Number of data points in the train set: 5650, number of used features: 14


In [8]:
paper_data_df = pd.read_csv(f'{SCRACTCH_PATH}/paper_author_org/paper_level_edited.csv')

In [9]:
query = 'learning to rank lightgbm'
query_number = 10

rank_result_list = l2rRanker.query(query)
rank_result_df = pd.DataFrame(rank_result_list, columns=['docid','score'])
rank_result_df = pd.merge(paper_data_df, rank_result_df)
rank_result_df = rank_result_df.sort_values('score', ascending=False)[:query_number].drop(columns='score')
rank_result_df = rank_result_df.fillna('-')
rank_result_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,title,abstract,year,author,org,n_citation,docid
196907,1295745,1295745,Kernelized Subspace Ranking For Saliency Detec...,"In this paper, we propose a novel saliency met...",2016,Tiantian Wang; Lihe Zhang; Huchuan Lu; Chong S...,"Dalian Univ Technol, Sch Informat & Commun Eng...",102,4222231.0
96017,632899,632899,"The Singular Value Decomposition, Applications...",The singular value decomposition (SVD) is not ...,2015,zhihua zhang,-,47,2062995.0
145139,953281,953281,Motor imagery based brain-computer interface: ...,This article contains a new method to improvin...,2022,Said Abenna; Mohammed Nahid; Abderrahim Bajit,"Hassan II Univ, Fac Sci & Technol, Casablanca,...",44,3108240.0
237365,1561782,1561782,Sign rank versus VC dimension,This work studies the maximum possible sign ra...,2016,Noga Mordechai Alon; shay moran; amir yehudayoff,External Organizations; Algorithms and Complex...,40,5090405.0
31290,206073,206073,Adaptive affinity matrix learning for dimensio...,Conventional graph-based dimensionality reduct...,2023,Junran He; Xiaozhao Fang; Peipei Kang; Lin Jia...,"School of Computer Science and Technology, Gua...",42,671793.0
235039,1546398,1546398,Cognitive Diversity: A Measurement of Dissimil...,"In the context of computing and informatics, C...",2019,D. Frank Hsu; Bruce S. Kristal; Yuhan Hao; Chr...,"Fordham Univ, Dept Comp & Informat Sci, Lab In...",37,5040528.0
44749,294343,294343,A Low Rank Structural Large Margin Method For ...,Cross-modal retrieval is a classic research to...,2013,Xinyan Lu; Fei Wu; Siliang Tang; Zhongfei Zhan...,"Zhejiang Univ, Coll Comp Sci, Hangzhou, Zhejia...",54,958947.0
139511,917163,917163,Hyperspectral image denoising with bilinear lo...,•A bilinear low rank matrix factorization (BLR...,2019,Huixin Fan; Jie Li; Qiangqiang Yuan; Xinxin Li...,"School of Geodesy and Geomatics, Wuhan Univers...",43,2991144.0
21082,141004,141004,Nonconvex Regularizations for Feature Selectio...,Feature selection in learning to rank has rece...,2015,Léa Laporte; Rémi Flamary; Stéphane Canu; Séba...,"Univ Toulouse, Inst Rech Informat Toulouse, CN...",106,459294.0
37960,249546,249546,Fuzzy Ranking: Theory And Applications,The rank ordering of samples is widely used in...,2000,A Flaig; Ke Barner; Gr Arce,"Univ Delaware, Dept Elect & Comp Engn, Newark,...",25,813818.0
