# Find cluster-related terms using LIME

## Load the data

In [1]:
# Project layout. `root_dir` is a relative path, so it is interpreted against
# the kernel's current working directory.
root_dir = '../..'
src_dir = 'src'        # joined onto root_dir when extending sys.path
data_dir = 'data'      # joined onto root_dir for every data/model/output path
corpus_dir = 'corpus'  # joined onto data_dir for the corpus and chunks files

In [2]:
import os 
import sys

In [3]:
# Make the project's source directory importable (presumably where the
# `training`, `model` and `termfinder` modules imported below live).
# The path is made absolute so later working-directory changes cannot break
# imports, and the membership check keeps sys.path free of duplicate entries
# when this cell is re-run.
src_path = os.path.abspath(os.path.join(root_dir, src_dir))
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
# Corpus file: <root_dir>/<data_dir>/<corpus_dir>/abstracts_corpus.json
corpus_filename = 'abstracts_corpus.json'
corpus_filepath = os.path.join(
    os.path.join(root_dir, data_dir, corpus_dir),
    corpus_filename,
)

In [5]:
# Chunks file sits next to the corpus file, in the same corpus directory.
chunks_filename = 'abstracts_chunks.json'
chunks_path_parts = (root_dir, data_dir, corpus_dir, chunks_filename)
chunks_filepath = os.path.join(*chunks_path_parts)

In [6]:
from training import TrainingCorpus

In [7]:
# Create an empty TrainingCorpus and populate it from the two JSON files
# whose paths were built above: the corpus first, then its chunks.
# NOTE(review): presumably load_chunks() expects the corpus to be loaded
# already — confirm against training.py before reordering these calls.
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the model

Check if GPU is available

In [8]:
import torch

# Report whether PyTorch can see a CUDA device before the model is loaded.
cuda_available = torch.cuda.is_available()
print(cuda_available)

True


In [9]:
from model import BertModel

Using TensorFlow backend.


In [10]:
# Model directory: <root_dir>/<data_dir>/models/abstracts_bert
model_dir = 'models/abstracts_bert'
model_dir_path = os.path.join(os.path.join(root_dir, data_dir), model_dir)

In [11]:
# Load the BERT model stored under `model_dir_path`.
# CUDA is requested only when PyTorch actually reports a device, rather than
# unconditionally, so the notebook is not tied to GPU machines.
# NOTE(review): assumes BertModel falls back to CPU when use_cuda=False, and
# that from_tf=False selects a non-TensorFlow checkpoint — confirm in model.py.
# batch_size=512 is passed through unchanged.
model = BertModel(model_dir_path, batch_size=512, use_cuda=torch.cuda.is_available(), from_tf=False)

---

## Find relevant terms for each cluster label using LIME

### Instantiate TermFinder

In [12]:
from termfinder import LimeTermFinder

In [13]:
# Build the LIME term finder from the loaded model and corpus; its
# get_relevant_terms() method is called once per document further down.
term_finder = LimeTermFinder(model, corpus)

### Retrieve predicted labels for each instance in the corpus

In [14]:
import numpy as np

In [15]:
# Mapping: label index -> iterable of corpus document indices for that label
# (it is iterated with .items() below, and the indices are used to look up
# corpus.docs).
# NOTE(review): per the section heading these are the model's *predicted*
# labels — confirm in model.py.
label_to_data_idx_dict = model.label_to_data_idx(corpus)

In [16]:
# Number of distinct label indices present in the mapping.
len(label_to_data_idx_dict)

4

### Retrieve relevant terms using LimeTermFinder

In [17]:
from tqdm.notebook import tqdm

In [18]:
# Accumulator of row dicts (keys: label, term, weight, data_id), one per
# relevant term; converted to a DataFrame later in the notebook.
df_data = []

In [19]:
# Collect one record per (document, relevant term) pair for every label.
#
# The accumulator is reset here, in the same cell that fills it, so that
# re-running this (expensive) cell cannot silently append a second copy of
# every row to `df_data`.
df_data = []

for label_idx, data_idxs in tqdm(label_to_data_idx_dict.items()):
    for data_idx in tqdm(data_idxs, desc=f'Relevant terms for entity {label_idx}', leave=False):
        relevant_terms = term_finder.get_relevant_terms(data_idx, label_idx)

        # Nothing to record when the finder returns None or an empty mapping.
        if not relevant_terms:
            continue

        # Both lookups are constant for this document, so do them once
        # instead of once per term.
        label = corpus.labels[label_idx]
        data_id = corpus.docs[data_idx]

        df_data.extend(
            {'label': label,
             'term': term,
             'weight': weight,
             'data_id': data_id}
            for term, weight in relevant_terms.items()
        )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 0'), FloatProgress(value=0.0, max=513.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 3'), FloatProgress(value=0.0, max=493.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 2'), FloatProgress(value=0.0, max=497.0), HTML(value=''))…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)






Build a DataFrame out of `df_data`

In [20]:
# Peek at the first three collected records to sanity-check their structure.
df_data[:3]

[{'label': 'cs',
  'term': 'architecture_description_language',
  'weight': 0.016073527165445283,
  'data_id': 0},
 {'label': 'cs',
  'term': 'software_based_systems',
  'weight': 0.01522997463515223,
  'data_id': 0},
 {'label': 'cs',
  'term': 'concurrent_systems',
  'weight': 0.0140515597026716,
  'data_id': 0}]

In [21]:
import pandas as pd

In [22]:
# Build the table from the collected records.
# The columns are named explicitly (in the same order as the record keys) so
# that the frame, and the CSV written from it, keeps its schema even when
# `df_data` is empty; without this an empty list yields a column-less frame.
relevant_terms_columns = ['label', 'term', 'weight', 'data_id']
relevant_terms_df = pd.DataFrame(df_data, columns=relevant_terms_columns)

In [23]:
# Preview the first rows of the resulting table.
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,cs,architecture_description_language,0.016074,0
1,cs,software_based_systems,0.01523,0
2,cs,concurrent_systems,0.014052,0
3,cs,compositional_rely/guarantee_reasoning,0.009831,0
4,cs,components,0.009536,0


---

## Save retrieved terms to a file

In [24]:
# Output file: <root_dir>/<data_dir>/terms/relevant_terms_abstracts_bert.csv
filename = 'relevant_terms_abstracts_bert.csv'
terms_dir = 'terms'
filepath = os.path.join(*(root_dir, data_dir, terms_dir, filename))

In [25]:
# Persist the table as UTF-8 CSV without the positional index column.
# to_csv() does not create missing parent directories, so make sure the
# output directory exists first (a no-op when it is already there).
output_dir = os.path.dirname(filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---