# Find cluster-related terms using LIME

## Load the data

In [1]:
# Path components used throughout the notebook.
# `root_dir` is the base that every other path is joined onto.
root_dir = '../..'

# Joined directly under `root_dir`.
src_dir = 'src'      # added to sys.path so project modules can be imported
data_dir = 'data'    # parent of the corpus directory

# Joined under `data_dir`.
corpus_dir = 'corpus'

In [2]:
import os 
import sys

In [3]:
# Make the project's source directory importable.
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
# Corpus file: <root_dir>/<data_dir>/<corpus_dir>/<corpus_filename>
corpus_filename = 'alaska_corpus_noisy.json'
corpus_filepath = os.path.join(
    root_dir,
    data_dir,
    corpus_dir,
    corpus_filename,
)

In [5]:
# Chunks file, stored in the same directory as the corpus file:
# <root_dir>/<data_dir>/<corpus_dir>/<chunks_filename>
chunks_filename = 'alaska_chunks_noisy.json'
chunks_filepath = os.path.join(
    root_dir,
    data_dir,
    corpus_dir,
    chunks_filename,
)

In [6]:
from training import TrainingCorpus

In [7]:
# Create an empty TrainingCorpus, then populate it from the two files
# configured above: first the corpus itself, then its chunks.
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the model

Check if GPU is available

In [8]:
import torch
# Prints True when PyTorch reports that CUDA is available on this machine.
print(torch.cuda.is_available())

True


In [9]:
from model import BertModel

Using TensorFlow backend.


In [10]:
# Model directory, located under the data directory:
# <root_dir>/<data_dir>/<model_dir>
model_dir = 'models/alaska_bert_noisy'
model_dir_path = os.path.join(
    root_dir,
    data_dir,
    model_dir,
)

In [11]:
# Load the model from `model_dir_path`.
# `use_cuda` follows the actual CUDA availability instead of being hard-coded
# to True, so the notebook can still be run on a machine without a GPU
# (assumes BertModel runs on CPU when use_cuda=False - TODO confirm).
model = BertModel(
    model_dir_path,
    batch_size=512,
    use_cuda=torch.cuda.is_available(),
    from_tf=False,
)

---

## Find relevant terms for each cluster label using LIME

### Instantiate TermFinder

In [12]:
from termfinder import LimeTermFinder

In [13]:
# Build the LIME-based term finder from the loaded model and corpus.
term_finder = LimeTermFinder(model, corpus)

### Retrieve predicted labels for each instance in the corpus

In [14]:
import numpy as np

In [15]:
# Mapping produced by the model for this corpus; the loop further down
# iterates it as (label index -> collection of data indices).
label_to_data_idx_dict = model.label_to_data_idx(corpus)

In [16]:
# Number of entries (labels) in the mapping.
len(label_to_data_idx_dict)

12

### Retrieve relevant terms using LimeTermFinder

In [17]:
from tqdm.notebook import tqdm

In [18]:
# Accumulator for the per-term records (one dict each) that are later
# turned into a DataFrame.
df_data = []

In [19]:
# Collect one record per (instance, relevant term) pair.
# The accumulator is (re)initialised here so that re-running this cell starts
# from scratch instead of appending duplicates to a previous run's results.
df_data = []

for label_idx, data_idxs in tqdm(label_to_data_idx_dict.items()):
    for data_idx in tqdm(data_idxs, desc=f'Relevant terms for entity {label_idx}', leave=False):
        relevant_terms = term_finder.get_relevant_terms(data_idx, label_idx)

        # Nothing to record when the finder returns no terms (None or empty).
        if not relevant_terms:
            continue

        # Both lookups are the same for every term of this instance,
        # so evaluate them once rather than once per term.
        label = corpus.labels[label_idx]
        data_id = corpus.docs[data_idx]

        df_data.extend(
            {'label': label,
             'term': term,
             'weight': weight,
             'data_id': data_id}
            for term, weight in relevant_terms.items()
        )

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 7'), FloatProgress(value=0.0, max=332.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 3'), FloatProgress(value=0.0, max=233.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 2'), FloatProgress(value=0.0, max=223.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 4'), FloatProgress(value=0.0, max=155.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 6'), FloatProgress(value=0.0, max=411.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 10'), FloatProgress(value=0.0, max=124.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 0'), FloatProgress(value=0.0, max=117.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 11'), FloatProgress(value=0.0, max=112.0), HTML(value='')…

HBox(children=(HTML(value='Relevant terms for entity 1'), FloatProgress(value=0.0, max=91.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 8'), FloatProgress(value=0.0, max=80.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 9'), FloatProgress(value=0.0, max=236.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 5'), FloatProgress(value=0.0, max=57.0), HTML(value='')))




Build a DataFrame out of `df_data`

In [20]:
# Peek at the first three collected records.
df_data[:3]

[{'label': 'ENTITY#44',
  'term': 'nikon_d3200_dslr_camera',
  'weight': 0.554291608423336,
  'data_id': 0},
 {'label': 'ENTITY#44',
  'term': 'nikon_d3200',
  'weight': 0.2209094099413281,
  'data_id': 1},
 {'label': 'ENTITY#44',
  'term': '55-200/4',
  'weight': 0.03125963275115239,
  'data_id': 1}]

In [21]:
import pandas as pd

In [22]:
# One row per collected record.
# Columns are listed explicitly (in the same order as the record keys) so the
# frame - and the CSV written from it - keeps its header even when no
# relevant terms were found and `df_data` is empty.
relevant_terms_df = pd.DataFrame(
    df_data,
    columns=['label', 'term', 'weight', 'data_id'],
)

In [23]:
# Preview the first rows of the table.
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,nikon_d3200_dslr_camera,0.554292,0
1,ENTITY#44,nikon_d3200,0.220909,1
2,ENTITY#44,55-200/4,0.03126,1
3,ENTITY#44,new_zealand_prices,0.026744,1
4,ENTITY#44,nikon_d3200,0.306589,2


---

## Save retrieved terms to a file

In [24]:
# Output CSV location: <root_dir>/<data_dir>/<terms_dir>/<filename>
filename = 'relevant_terms_alaska_bert_noisy.csv'
terms_dir = 'terms'
filepath = os.path.join(
    root_dir,
    data_dir,
    terms_dir,
    filename,
)

In [25]:
# Make sure the target directory exists; to_csv does not create missing
# parent directories and would otherwise fail after the long LIME run.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Write the table as UTF-8 CSV without the positional index column.
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---