In [1]:
!git clone https://github.com/katjakon/xmlc-knowledge.git

fatal: destination path 'xmlc-knowledge' already exists and is not an empty directory.


In [2]:
import sys
# Make the cloned repository importable (retriever, reranker, utils).
# Guarded so that re-running the cell does not stack duplicate entries on sys.path.
# NOTE(review): assumes the clone above landed in /content -- confirm the working directory.
if '/content/xmlc-knowledge' not in sys.path:
    sys.path.insert(0, '/content/xmlc-knowledge')

In [3]:
!pip install datasets faiss-cpu



In [4]:

from datasets import Dataset
from torch.utils.data import DataLoader
import pandas as pd
import faiss
from tqdm import tqdm
import torch
from sentence_transformers import SentenceTransformer
import pickle
from transformers import pipeline

import networkx as nx
from retriever import Retriever
from reranker import BGEReranker
from utils import k_hop_neighbors, get_pref_label, precision_at_k, recall_at_k, f1_at_k, strip_uri, get_label_mapping

from statistics import mean, stdev, median

In [5]:
from google.colab import drive
# Mount Google Drive; the input files and the output directory used in this
# notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Run the models on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [30]:
# Read in text data
# data_df: validation titles with their gold labels (columns shown by
# data_df.head(): title, label-idn, label_list).
# gnd: pickled object that is later passed as the graph to the label-mapping
# and neighbour-lookup helpers.
data_path = "/content/drive/MyDrive/MASTER-THESIS/title_validate.feather"
gnd_path = "/content/drive/MyDrive/MASTER-THESIS/gnd.pickle"
data_df = pd.read_feather(data_path)
# NOTE: unpickling can execute arbitrary code -- only load this file from a
# trusted location. The context manager closes the file handle, which the
# bare open() call used before never did.
with open(gnd_path, "rb") as gnd_file:
    gnd = pickle.load(gnd_file)

In [8]:
# Model identifier handed to Retriever(...) in a later cell.
retriever_model_str = 'BAAI/bge-m3'

In [33]:
# Preview the first rows of the validation data.
data_df.head()

Unnamed: 0,title,label-idn,label_list
0,Theoretica chimica acta a journal for structur...,"[040674886, 04185098X]","[Zeitschrift, Theoretische Chemie]"
1,Adressbuch deutscher Chemiker,"[004713532, 040098362, 040118827, 040118894, 0...","[Gesellschaft Deutscher Chemiker, Chemiker, De..."
2,Österreichischer Amtskalender zusammengestell...,"[040052982, 040432718, 040678709, 041133935, 0...","[Behörde, Österreich, Ortsverzeichnis, Einrich..."
3,Die Angestellten-Versicherung Zeitschrift der ...,"[040674886, 041424298]","[Zeitschrift, Angestelltenversicherung]"
4,Die Arbeiten des Statistischen Bundesamtes im ...,"[004017331, 040064328, 041289463, 042571316]",[Deutschland (Bundesrepublik) / Statistisches ...




In [10]:
# Build the retriever from the model identifier on the selected device.
retriever = Retriever(retriever_model_str, device=DEVICE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [31]:
# Derive the label strings and a mapping from the GND object; both are handed
# to retriever.retrieve() below as labels= and mapping=.
# NOTE(review): the exact structure of the mapping is defined in utils.py -- confirm there.
label_strings, label_mapping = get_label_mapping(gnd)

In [35]:
# Retrieve the top 10 label candidates for every title.
# `idns` holds one candidate list per title and is scored against
# data_df["label-idn"] in the next cell.
# NOTE: the name `sim` is rebound to a dict of reranker results in a later
# cell, so this value is only available until then.
sim, idns = retriever.retrieve(
    mapping=label_mapping,
    labels=label_strings,
    texts=data_df["title"].tolist(),
    top_k=10,
    batch_size=512)

Batches:   0%|          | 0/677 [00:00<?, ?it/s]

Batches:   0%|          | 0/144 [00:00<?, ?it/s]

In [36]:
# Score the raw retriever output against the gold labels.
# One list of per-title scores for each cut-off k = 1..5.
recall_dict = {k: [] for k in range(1, 6)}
precision_dict = {k: [] for k in range(1, 6)}
f1_dict = {k: [] for k in range(1, 6)}

for k in range(1, 6):
    for preds_i, golds_i in zip(idns, data_df["label-idn"]):
        metric_args = dict(y_pred=preds_i, y_true=golds_i, k=k)
        recall_dict[k].append(recall_at_k(**metric_args))
        precision_dict[k].append(precision_at_k(**metric_args))
        f1_dict[k].append(f1_at_k(**metric_args))

# All predictions
rec_all, prec_all, f1_all = [], [], []
for preds_i, golds_i in zip(idns, data_df["label-idn"]):
    metric_args = dict(y_pred=preds_i, y_true=golds_i, k=10)
    rec_all.append(recall_at_k(**metric_args))
    prec_all.append(precision_at_k(**metric_args))
    f1_all.append(f1_at_k(**metric_args))

# Report the mean over all titles, first for k=10 and then for k = 1..5.
print(f"Recall@10: {mean(rec_all)}")
print(f"Precision@10: {mean(prec_all)}")
print(f"F1@10: {mean(f1_all)}")
print("=====================================")
for k in range(1, 6):
    print(f"Recall@{k}: {mean(recall_dict[k])}")
    print(f"Precision@{k}: {mean(precision_dict[k])}")
    print(f"F1@{k}: {mean(f1_dict[k])}")
    print("-----------------")

Recall@10: 0.2087849311222917
Precision@10: 0.049753815518487704
F1@10: 0.0749121185914911
Recall@1: 0.11876325800626579
Precision@1: 0.23761917101979024
F1@1: 0.14479354937273317
-----------------
Recall@2: 0.148644801226148
Precision@2: 0.1578785853598658
F1@2: 0.13794223509415238
-----------------
Recall@3: 0.16504405840030664
Precision@3: 0.12045536172979264
F1@3: 0.12555278348279367
-----------------
Recall@4: 0.1760265505418435
Precision@4: 0.09852152920798156
F1@4: 0.11440346127027007
-----------------
Recall@5: 0.1837184176037238
Precision@5: 0.08363725637283652
F1@5: 0.10463609410291426
-----------------


In [37]:
# Extend every retrieved candidate list with neighbours from the GND graph.
# NOTE(review): k=2 is presumably the hop radius (the results are later saved
# as "validate-2hop") -- confirm against retriever.py.
idn_plus_neighbors = retriever.get_neighbors(idns, graph=gnd, k=2)

rec_all = []
prec_all = []
f1_all = []
for preds_i, golds_i in zip(idn_plus_neighbors, data_df["label-idn"]):
    # Each expanded list is scored in full, so the cut-off is its own length.
    n_preds = len(preds_i)
    rec_all.append(recall_at_k(y_pred=preds_i, y_true=golds_i, k=n_preds))
    prec_all.append(precision_at_k(y_pred=preds_i, y_true=golds_i, k=n_preds))
    f1_all.append(f1_at_k(y_pred=preds_i, y_true=golds_i, k=n_preds))

# These scores cover the whole expanded candidate list, not a top-10 cut-off,
# so they are labelled "@all" rather than "@10".
print(f"Recall@all: {mean(rec_all)}")
print(f"Precision@all: {mean(prec_all)}")
print(f"F1@all: {mean(f1_all)}")

Recall@10: 0.27523913462844307
Precision@10: 0.021553963037750062
F1@10: 0.03828521734400748


In [38]:
# Model identifier for the reranker, and the reranker itself on the selected device.
reranker_str = 'BAAI/bge-reranker-v2-m3'
reranker = BGEReranker(reranker_str, device=DEVICE)


In [39]:

# Flatten the per-title candidate lists into one record per (title, candidate)
# pair. The three lists stay aligned: entry j of each describes the same pair.
pair_dict = {"pair": [], "label-idn": [], "title-idx": []}

titles_with_candidates = zip(data_df.itertuples(), idn_plus_neighbors)
for idx, (row, idn_i_list) in tqdm(enumerate(titles_with_candidates), total=len(data_df)):
    title_i = row.title
    for idn_i in idn_i_list:
        # The reranker input is the title next to the candidate's preferred label.
        pair_dict["pair"].append((title_i, get_pref_label(gnd, idn_i)))
        pair_dict["label-idn"].append(idn_i)
        pair_dict["title-idx"].append(idx)

# Total number of pairs collected above.
c = len(pair_dict["pair"])

100%|██████████| 73319/73319 [00:04<00:00, 15432.70it/s]


In [40]:
# Wrap the pair records in a datasets.Dataset so they can be tokenized with map().
ds = Dataset.from_dict(pair_dict)

In [41]:
def tokenize(example):
    """Tokenize the "pair" entries of a batch with the reranker's tokenizer.

    Intended for ``Dataset.map(..., batched=True)``: ``example["pair"]`` then
    holds the pairs of one map batch. Inputs are truncated to 64 tokens.

    NOTE(review): ``padding=True`` presumably pads only to the longest sequence
    of the batch it is called on, so rows from different map batches may differ
    in length. The DataLoader batch size (1000) divides the map batch size
    (2000) and shuffling is off, which keeps each DataLoader batch inside one
    map batch -- confirm before changing either batch size.
    """
    return reranker.tokenizer(
        example["pair"],
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors='pt',
    )

In [42]:
# Tokenize all pairs in map batches of 2000, then expose only the columns the
# reranking loop reads, in torch format.
ds = ds.map(tokenize, batched=True, batch_size=2000)
ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label-idn', 'title-idx'])

Map:   0%|          | 0/3066748 [00:00<?, ? examples/s]

In [43]:
# Iterate over the pairs in their original order (no shuffling), 1000 per batch.
dataloader = DataLoader(ds, batch_size=1000, shuffle=False)

In [44]:
# Reranker results, one entry per (title, candidate) pair; the three lists stay aligned.
sim = {
    "title-idx": [],
    "label-idn": [],
    "score": []
}

# Pure inference: no gradients are needed. This is harmless if
# reranker.similarities already disables them internally.
with torch.no_grad():
    for batch in tqdm(dataloader):
        scores = reranker.similarities(
            batch["input_ids"].to(DEVICE),
            batch["attention_mask"].to(DEVICE)
        )
        # Store plain Python ints. Extending with the tensor itself would
        # append one 0-d tensor object per row.
        sim["title-idx"].extend(batch["title-idx"].tolist())
        sim["label-idn"].extend(batch["label-idn"])
        sim["score"].extend(scores.tolist())

100%|██████████| 3067/3067 [1:52:31<00:00,  2.20s/it]


In [45]:
# Collect the reranker results in a frame and make the title index an integer column.
df = pd.DataFrame(sim).astype({"title-idx": int})

In [46]:
# Score the reranked candidates: for every title, order its candidates by
# reranker score (best first) and evaluate the top k for k = 1..5.
recall_dict = {k: [] for k in range(1, 6)}
precision_dict = {k: [] for k in range(1, 6)}
f1_dict = {k: [] for k in range(1, 6)}

# A single groupby pass replaces re-filtering the full frame once per title,
# which scanned every row for every title.
pairs_by_title = df.groupby("title-idx", sort=False)
for idx, df_i in tqdm(pairs_by_title, total=pairs_by_title.ngroups):
    df_i = df_i.sort_values(by="score", ascending=False)
    pred = df_i["label-idn"].tolist()
    # title-idx is the positional row number of the title in data_df.
    gold = data_df["label-idn"].iloc[idx]
    for k in range(1, 6):
        recall_dict[k].append(recall_at_k(y_pred=pred, y_true=gold, k=k))
        precision_dict[k].append(precision_at_k(y_pred=pred, y_true=gold, k=k))
        f1_dict[k].append(f1_at_k(y_pred=pred, y_true=gold, k=k))

# Report the mean over all titles for each cut-off.
for k in range(1, 6):
    print(f"Recall@{k}: {mean(recall_dict[k])}")
    print(f"Precision@{k}: {mean(precision_dict[k])}")
    print(f"F1@{k}: {mean(f1_dict[k])}")
    print("-----------------")

100%|██████████| 73319/73319 [02:31<00:00, 482.60it/s]


Recall@1: 0.13909799632945988
Precision@1: 0.30622348913651304
F1@1: 0.1746737950077627
-----------------
Recall@2: 0.1798777225619678
Precision@2: 0.20952277035966121
F1@2: 0.17439916200835606
-----------------
Recall@3: 0.20252845909711598
Precision@3: 0.16199075273803515
F1@3: 0.16241740565949184
-----------------
Recall@4: 0.2174033273622458
Precision@4: 0.1328611955973213
F1@4: 0.14952434047412028
-----------------
Recall@5: 0.22763364989019924
Precision@5: 0.11264747200589206
F1@5: 0.13742467662596472
-----------------


In [47]:
# Output directory on the mounted Drive for the reranker results.
OUT = "/content/drive/MyDrive/MASTER-THESIS"

In [48]:
import os

In [49]:
# Persist the reranker scores (title-idx, label-idn, score) so the long
# reranking run does not have to be repeated.
df.to_feather(os.path.join(OUT, "validate-2hop.feather"))

In [50]:
# Show the reranker results (the notebook renders a truncated view of the frame).
df

Unnamed: 0,title-idx,label-idn,score
0,0,041837908,-1.254775
1,0,040057283,-5.863499
2,0,042525799,-10.970883
3,0,040118894,-10.626570
4,0,040437930,-6.481873
...,...,...,...
3066743,73318,956449158,-11.012754
3066744,73318,94139803X,-7.073450
3066745,73318,04059758X,-7.778935
3066746,73318,961363053,0.882374
