# Find cluster-related terms using LIME

## Load the data

In [1]:
# Directory names used to build every path in this notebook.
# `root_dir` is a relative path, so it is resolved against the current working directory.
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'
src_dir = 'src'

In [2]:
import os 
import sys

In [3]:
# Add <root_dir>/<src_dir> to the module search path; presumably the project
# modules imported further down (e.g. `training`) live there — TODO confirm.
sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
# `dataset_name` is interpolated into the corpus, chunks and model paths below;
# `test_name` only appears in the name of the CSV written at the end.
dataset_name = 'alaska'
test_name = 'sampling_records_test1'

In [5]:
# Path of the corpus JSON file: <root_dir>/<data_dir>/<corpus_dir>/<dataset_name>_corpus.json
corpus_filename = f'{dataset_name}_corpus.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [6]:
# Path of the chunks JSON file, stored in the same directory as the corpus file.
chunks_filename = f'{dataset_name}_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [7]:
from training import TrainingCorpus

In [8]:
# Build an empty TrainingCorpus, then populate it from the corpus file and the chunks file.
# NOTE(review): presumably load() must run before load_chunks() — confirm against TrainingCorpus.
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the classification model

Check whether a GPU is available.

In [9]:
import torch
print(torch.cuda.is_available())

True


In [10]:
from model import BertModel

Using TensorFlow backend.


In [11]:
# Directory holding the saved model for this dataset: <root_dir>/<data_dir>/models/<dataset_name>_bert
model_dir = f'models/{dataset_name}_bert'
model_dir_path = os.path.join(root_dir, data_dir, model_dir)

In [12]:
# Only request CUDA when a GPU is actually present, matching how the embedding
# model's device is chosen later in this notebook, instead of hard-coding True.
# NOTE(review): assumes BertModel accepts use_cuda=False for CPU inference — confirm.
model = BertModel(model_dir_path, batch_size=512, use_cuda=torch.cuda.is_available(), from_tf=True)

All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


---

## Sample data

### Load the embedding model

In [13]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the encoder from the same directory as the classifier.
# from_tf=True asks transformers to initialise the PyTorch model from TensorFlow weights.
tokenizer = AutoTokenizer.from_pretrained(model_dir_path)
embedding_model = AutoModel.from_pretrained(model_dir_path, from_tf=True)
# Assign the return value to `_` so the cell does not display it as output.
_ = embedding_model.to(device)

All TF 2.0 model weights were used when initializing DistilBertModel.

All the weights of DistilBertModel were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertModel for predictions without further training.


### Get embeddings

In [15]:
def get_batch(data, batch_size):
    """Yield consecutive slices of `data`, each holding at most `batch_size` items.

    Args:
        data: A sliceable sequence that supports `len()` (e.g. a list of texts).
        batch_size: Maximum number of items per slice; must be a positive integer.

    Yields:
        `data[i: i + batch_size]` for `i = 0, batch_size, 2 * batch_size, ...`.
        The last slice may be shorter than `batch_size`.

    Raises:
        ValueError: If `batch_size` is not positive. Because this is a
            generator, the error surfaces when iteration starts, not when
            `get_batch` is called.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject it explicitly (a zero step already made range() raise ValueError).
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    for i in range(0, len(data), batch_size):
        yield data[i: i + batch_size]

In [16]:
import numpy as np

In [17]:
# Flatten every document into one whitespace-joined string of its tokens.
texts = [' '.join(corpus.get_tokens(doc_id)) for doc_id in corpus.docs]
batch_size = 512
embedding_chunks = []

# Inference only: no gradients are needed anywhere in the loop.
with torch.no_grad():
    for text_batch in get_batch(texts, batch_size):
        model_inputs = tokenizer(text_batch, padding=True, truncation=True, return_tensors='pt')
        model_inputs.to(device)

        # Keep only the hidden state at sequence position 0 of each text.
        first_token_states = embedding_model(**model_inputs)['last_hidden_state'][:, 0]
        embedding_chunks.append(first_token_states.cpu().numpy())

# Stack the per-batch arrays along the first axis into a single array.
embeddings = np.concatenate(embedding_chunks, axis=0)
embeddings.shape

(2171, 768)

### Free memory

In [18]:
# Drop the notebook's reference to the embedding model, then ask PyTorch to
# release the cached GPU memory it no longer needs.
del embedding_model
torch.cuda.empty_cache()

### Sample records using K-means

In [19]:
# Presumably maps each label index to the indices of the records assigned to
# that label — TODO confirm against BertModel.label_to_data_idx.
label_to_idx_map = model.label_to_data_idx(corpus)

In [20]:
from sample import KMeansSampler

# Draw a sample of records for every label in `label_to_idx_map`.
# NOTE(review): min_size/max_size presumably bound the per-label sample size — confirm.
# NOTE(review): no random seed is set in this notebook; if KMeansSampler is
# stochastic the sampled records may differ between runs — confirm.
sampler = KMeansSampler(corpus, embeddings, label_to_idx_map, min_size=50, max_size=100)
label_to_sample_idx_map = {l: sampler.sample_data(l) for l in label_to_idx_map}

## Find relevant terms for each cluster label using LIME

### Instantiate TermFinder

In [21]:
from termfinder import LimeTermFinder

In [22]:
# The term finder is built from the classification model and the corpus loaded above.
term_finder = LimeTermFinder(model, corpus)

### Retrieve relevant terms using LimeTermFinder

In [23]:
from tqdm.notebook import tqdm

In [24]:
# Accumulator for the result rows (one dict per relevant term of a sampled
# record); it is filled by the extraction loop and later turned into a DataFrame.
df_data = []

In [25]:
import time
import datetime

# Start from an empty list so that re-running this cell does not append a
# second copy of every row on top of the results of a previous run.
df_data = []

start_time = time.time()

# Outer bar: one step per label; inner bar: one step per sampled record of that label.
for label_idx, data_idxs in tqdm(label_to_sample_idx_map.items()):
    for data_idx in tqdm(data_idxs, desc=f'Relevant terms for entity {label_idx}', leave=False):
        relevant_terms = term_finder.get_relevant_terms(data_idx, label_idx)

        # Skip records for which nothing came back (None or an empty mapping).
        if not relevant_terms:
            continue

        # One output row per (term, weight) pair, tagged with the label name
        # and the id of the record the term was extracted from.
        for term, weight in relevant_terms.items():
            df_data.append({'label': corpus.labels[label_idx],
                            'term': term,
                            'weight': weight,
                            'data_id': corpus.docs[data_idx]})

end_time = time.time()

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 11'), FloatProgress(value=0.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 7'), FloatProgress(value=0.0, max=99.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 4'), FloatProgress(value=0.0, max=99.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 8'), FloatProgress(value=0.0, max=94.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 10'), FloatProgress(value=0.0, max=91.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 6'), FloatProgress(value=0.0, max=84.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 14'), FloatProgress(value=0.0, max=89.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 17'), FloatProgress(value=0.0, max=88.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 0'), FloatProgress(value=0.0, max=89.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 18'), FloatProgress(value=0.0, max=75.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 19'), FloatProgress(value=0.0, max=63.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 3'), FloatProgress(value=0.0, max=63.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 12'), FloatProgress(value=0.0, max=54.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 15'), FloatProgress(value=0.0, max=63.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 5'), FloatProgress(value=0.0, max=70.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 13'), FloatProgress(value=0.0, max=64.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 9'), FloatProgress(value=0.0, max=45.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 16'), FloatProgress(value=0.0, max=45.0), HTML(value=''))…

HBox(children=(HTML(value='Relevant terms for entity 2'), FloatProgress(value=0.0, max=48.0), HTML(value='')))

HBox(children=(HTML(value='Relevant terms for entity 1'), FloatProgress(value=0.0, max=52.0), HTML(value='')))




In [26]:
# Duration of the extraction loop in seconds, rendered through timedelta's string form.
delta_t = end_time - start_time
elapsed_time = f'{datetime.timedelta(seconds=delta_t)}'
print(f'Elapsed time: {elapsed_time}')

Elapsed time: 1:03:45.711075


Build a DataFrame out of `df_data`

In [27]:
df_data[:3]

[{'label': 'ENTITY#44',
  'term': 'd3200_24_2_mp_digital_slr_camera_kit_body',
  'weight': 0.9252412343054901,
  'data_id': 83},
 {'label': 'ENTITY#44',
  'term': 'd3200',
  'weight': 0.8703311308888413,
  'data_id': 160},
 {'label': 'ENTITY#44',
  'term': 'nikon_d3200',
  'weight': 0.752942383134718,
  'data_id': 12}]

In [28]:
import pandas as pd

In [29]:
# One row per dict in `df_data`; the dict keys (label, term, weight, data_id) become the columns.
relevant_terms_df = pd.DataFrame(df_data)

In [30]:
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,d3200_24_2_mp_digital_slr_camera_kit_body,0.925241,83
1,ENTITY#44,d3200,0.870331,160
2,ENTITY#44,nikon_d3200,0.752942,12
3,ENTITY#44,nikon_d3200_dslr_camera,0.742707,80
4,ENTITY#44,lens,0.040852,80


---
## Save retrieved terms to a file

In [31]:
# Output CSV path: <root_dir>/<data_dir>/terms/relevant_terms_<dataset_name>_bert_<test_name>.csv
terms_dir = 'terms'
filename = f'relevant_terms_{dataset_name}_bert_{test_name}.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)
# Bare expression on the last line so the notebook displays the resolved path.
filepath

'../../data/terms/relevant_terms_alaska_bert_sampling_records_test1.csv'

In [32]:
# Create the output directory first: to_csv does not create missing parent
# directories, so the save would otherwise fail when the directory is absent.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# index=False keeps the DataFrame's integer index out of the CSV.
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---