# Find cluster related terms using LIME

## Load the data

In [1]:
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'
src_dir = 'src'

In [2]:
version = 'v2'

In [3]:
import os 
import sys

In [4]:
sys.path.append(os.path.join(root_dir, src_dir))

In [5]:
corpus_filename = f'wikidata_corpus_{version}.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [6]:
chunks_filename = f'wikidata_chunks_{version}.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [7]:
from training import TrainingCorpus

In [8]:
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the model

Check if GPU is available

In [9]:
import torch
print(torch.cuda.is_available())

True


In [10]:
from model import BertModel

Using TensorFlow backend.


In [11]:
model_dir = f'models/wikidata_bert_{version}'
model_dir_path = os.path.join(root_dir, data_dir, model_dir)

In [12]:
model = BertModel(model_dir_path, batch_size=128, use_cuda=True)

All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


---

## Find relevant terms for each cluster label using LIME

### Instantiate TermFinder

In [13]:
from termfinder import LimeTermFinder

In [14]:
term_finder = LimeTermFinder(model, corpus)

### Retrieve predicted labels for each instance in the corpus

In [15]:
import numpy as np

In [16]:
label_to_data_idx_dict = model.label_to_data_idx(corpus)

In [17]:
len(label_to_data_idx_dict)

195

### Retrieve relevant terms using LimeTermFinder

In [18]:
from tqdm.notebook import tqdm

In [19]:
df_data = []

In [20]:
for label_idx, data_idxs in tqdm(label_to_data_idx_dict.items()):
    for data_idx in tqdm(data_idxs, desc=f'Relevant terms for entity {label_idx}', leave=False):
        relevant_terms = term_finder.get_relevant_terms(data_idx, label_idx)

        if relevant_terms:
            for term, weight in relevant_terms.items():
                dict_entry = {'label': corpus.labels[label_idx], 
                              'term': term, 
                              'weight': weight, 
                              'data_id': corpus.docs[data_idx]}
                df_data.append(dict_entry)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=195.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 51'), FloatProgress(value=0.0, max=13.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 53'), FloatProgress(value=0.0, max=23.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 190'), FloatProgress(value=0.0, max=21.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 36'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 97'), FloatProgress(value=0.0, max=11.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 93'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 182'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 35'), FloatProgress(value=0.0, max=25.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 148'), FloatProgress(value=0.0, max=12.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 52'), FloatProgress(value=0.0, max=26.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 127'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 75'), FloatProgress(value=0.0, max=11.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 155'), FloatProgress(value=0.0, max=15.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 181'), FloatProgress(value=0.0, max=22.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 131'), FloatProgress(value=0.0, max=11.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 5'), FloatProgress(value=0.0, max=18.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 120'), FloatProgress(value=0.0, max=24.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 96'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 165'), FloatProgress(value=0.0, max=21.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 94'), FloatProgress(value=0.0, max=24.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 186'), FloatProgress(value=0.0, max=26.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 57'), FloatProgress(value=0.0, max=11.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 7'), FloatProgress(value=0.0, max=23.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 116'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 143'), FloatProgress(value=0.0, max=11.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 4'), FloatProgress(value=0.0, max=28.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 177'), FloatProgress(value=0.0, max=11.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 194'), FloatProgress(value=0.0, max=10.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 171'), FloatProgress(value=0.0, max=17.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 0'), FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 8'), FloatProgress(value=0.0, max=22.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 17'), FloatProgress(value=0.0, max=26.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 70'), FloatProgress(value=0.0, max=13.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 48'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 161'), FloatProgress(value=0.0, max=15.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 162'), FloatProgress(value=0.0, max=16.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 67'), FloatProgress(value=0.0, max=21.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 60'), FloatProgress(value=0.0, max=16.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 37'), FloatProgress(value=0.0, max=17.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 125'), FloatProgress(value=0.0, max=18.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 68'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 65'), FloatProgress(value=0.0, max=19.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 81'), FloatProgress(value=0.0, max=21.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 18'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 2'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 1'), FloatProgress(value=0.0, max=11.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 108'), FloatProgress(value=0.0, max=8.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 22'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 39'), FloatProgress(value=0.0, max=10.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 174'), FloatProgress(value=0.0, max=9.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 49'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 169'), FloatProgress(value=0.0, max=8.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 119'), FloatProgress(value=0.0, max=16.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 3'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 80'), FloatProgress(value=0.0, max=24.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 82'), FloatProgress(value=0.0, max=22.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 34'), FloatProgress(value=0.0, max=28.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 176'), FloatProgress(value=0.0, max=7.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 27'), FloatProgress(value=0.0, max=13.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 163'), FloatProgress(value=0.0, max=20.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 62'), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 166'), FloatProgress(value=0.0, max=26.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 107'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 189'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 136'), FloatProgress(value=0.0, max=28.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 137'), FloatProgress(value=0.0, max=17.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 187'), FloatProgress(value=0.0, max=15.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 90'), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 28'), FloatProgress(value=0.0, max=25.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 138'), FloatProgress(value=0.0, max=24.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 40'), FloatProgress(value=0.0, max=23.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 29'), FloatProgress(value=0.0, max=18.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 26'), FloatProgress(value=0.0, max=14.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 78'), FloatProgress(value=0.0, max=29.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 83'), FloatProgress(value=0.0, max=13.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 192'), FloatProgress(value=0.0, max=12.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 58'), FloatProgress(value=0.0, max=10.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 149'), FloatProgress(value=0.0, max=16.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 188'), FloatProgress(value=0.0, max=9.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 31'), FloatProgress(value=0.0, max=27.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 178'), FloatProgress(value=0.0, max=12.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 115'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 16'), FloatProgress(value=0.0, max=14.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 19'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 15'), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 46'), FloatProgress(value=0.0, max=29.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 130'), FloatProgress(value=0.0, max=6.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 111'), FloatProgress(value=0.0, max=16.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 179'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 151'), FloatProgress(value=0.0, max=28.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 74'), FloatProgress(value=0.0, max=23.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 85'), FloatProgress(value=0.0, max=14.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 99'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 88'), FloatProgress(value=0.0, max=24.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 193'), FloatProgress(value=0.0, max=7.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 191'), FloatProgress(value=0.0, max=12.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 92'), FloatProgress(value=0.0, max=27.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 12'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 13'), FloatProgress(value=0.0, max=10.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 112'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 157'), FloatProgress(value=0.0, max=6.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 170'), FloatProgress(value=0.0, max=28.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 14'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 168'), FloatProgress(value=0.0, max=6.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 72'), FloatProgress(value=0.0, max=15.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 114'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 139'), FloatProgress(value=0.0, max=17.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 146'), FloatProgress(value=0.0, max=20.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 42'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 6'), FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 175'), FloatProgress(value=0.0, max=21.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 56'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 25'), FloatProgress(value=0.0, max=29.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 173'), FloatProgress(value=0.0, max=15.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 38'), FloatProgress(value=0.0, max=12.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 71'), FloatProgress(value=0.0, max=19.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 98'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 41'), FloatProgress(value=0.0, max=11.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 20'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 95'), FloatProgress(value=0.0, max=24.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 144'), FloatProgress(value=0.0, max=18.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 76'), FloatProgress(value=0.0, max=20.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 109'), FloatProgress(value=0.0, max=8.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 23'), FloatProgress(value=0.0, max=12.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 50'), FloatProgress(value=0.0, max=19.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 123'), FloatProgress(value=0.0, max=19.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 118'), FloatProgress(value=0.0, max=27.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 145'), FloatProgress(value=0.0, max=14.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 140'), FloatProgress(value=0.0, max=24.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 104'), FloatProgress(value=0.0, max=11.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 11'), FloatProgress(value=0.0, max=11.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 134'), FloatProgress(value=0.0, max=6.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 142'), FloatProgress(value=0.0, max=8.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 91'), FloatProgress(value=0.0, max=21.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 133'), FloatProgress(value=0.0, max=23.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 159'), FloatProgress(value=0.0, max=26.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 147'), FloatProgress(value=0.0, max=24.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 47'), FloatProgress(value=0.0, max=11.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 24'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 103'), FloatProgress(value=0.0, max=25.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 158'), FloatProgress(value=0.0, max=29.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 54'), FloatProgress(value=0.0, max=15.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 113'), FloatProgress(value=0.0, max=25.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 69'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 30'), FloatProgress(value=0.0, max=21.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 172'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 100'), FloatProgress(value=0.0, max=14.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 61'), FloatProgress(value=0.0, max=18.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 79'), FloatProgress(value=0.0, max=27.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 77'), FloatProgress(value=0.0, max=7.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 45'), FloatProgress(value=0.0, max=23.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 124'), FloatProgress(value=0.0, max=24.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 117'), FloatProgress(value=0.0, max=10.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 150'), FloatProgress(value=0.0, max=6.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 122'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 121'), FloatProgress(value=0.0, max=22.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 86'), FloatProgress(value=0.0, max=26.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 141'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 110'), FloatProgress(value=0.0, max=13.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 55'), FloatProgress(value=0.0, max=14.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 105'), FloatProgress(value=0.0, max=11.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 32'), FloatProgress(value=0.0, max=12.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 33'), FloatProgress(value=0.0, max=29.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 128'), FloatProgress(value=0.0, max=27.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 63'), FloatProgress(value=0.0, max=9.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 129'), FloatProgress(value=0.0, max=12.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 180'), FloatProgress(value=0.0, max=15.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 154'), FloatProgress(value=0.0, max=17.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 84'), FloatProgress(value=0.0, max=23.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 167'), FloatProgress(value=0.0, max=24.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 9'), FloatProgress(value=0.0, max=21.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 10'), FloatProgress(value=0.0, max=28.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 106'), FloatProgress(value=0.0, max=20.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 21'), FloatProgress(value=0.0, max=27.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 126'), FloatProgress(value=0.0, max=27.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 102'), FloatProgress(value=0.0, max=22.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 66'), FloatProgress(value=0.0, max=27.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 44'), FloatProgress(value=0.0, max=6.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 64'), FloatProgress(value=0.0, max=12.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 153'), FloatProgress(value=0.0, max=7.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 89'), FloatProgress(value=0.0, max=18.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 132'), FloatProgress(value=0.0, max=15.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 59'), FloatProgress(value=0.0, max=19.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 152'), FloatProgress(value=0.0, max=16.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 101'), FloatProgress(value=0.0, max=14.0), HTML(value='')…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Build a DataFrame out of `df_data`

In [21]:
df_data[:3]

[{'label': 'Q1754',
  'term': 'sweden',
  'weight': 0.42651232272320827,
  'data_id': 0},
 {'label': 'Q1754',
  'term': 'stockholm',
  'weight': 0.4007281724283034,
  'data_id': 0},
 {'label': 'Q1754',
  'term': 'city',
  'weight': 0.22206449681787813,
  'data_id': 0}]

In [22]:
import pandas as pd

In [23]:
relevant_terms_df = pd.DataFrame(df_data)

In [24]:
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q1754,sweden,0.426512,0
1,Q1754,stockholm,0.400728,0
2,Q1754,city,0.222064,0
3,Q1754,capital,0.218597,0
4,Q1754,capital,0.606604,1


---

## Save retrieved terms to a file

In [25]:
terms_dir = 'terms'
filename = f'relevant_terms_wikidata_bert_{version}.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)

In [26]:
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---