# Find cluster-related terms using LIME

## Load the data

In [1]:
# Path fragments combined with os.path.join to locate the sources, corpus, model and outputs.
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'
src_dir = 'src'

In [2]:
# Version tag interpolated into the corpus, chunks, model and output file names.
version = 'v2'

In [3]:
import os 
import sys

In [4]:
# Make the project's source directory importable
# (presumably where `training`, `model` and `termfinder` live -- confirm).
sys.path.append(os.path.join(root_dir, src_dir))

In [5]:
# Location of the versioned corpus JSON file.
corpus_filename = f'wikidata_corpus_{version}.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [6]:
# Location of the versioned chunks JSON file, stored next to the corpus file.
chunks_filename = f'wikidata_chunks_{version}.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [7]:
from training import TrainingCorpus

In [8]:
# Create the corpus object, then load the corpus file followed by the chunks file.
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the model

In [9]:
from model import BertModel

Using TensorFlow backend.


In [10]:
# Location of the versioned model directory under the data folder.
model_dir = f'models/wikidata_bert_{version}'
model_dir_path = os.path.join(root_dir, data_dir, model_dir)

In [11]:
# Instantiate the model wrapper from the saved model directory.
model = BertModel(model_dir_path)

All TF 2.0 model weights were used when initializing DistilBertForSequenceClassification.

All the weights of DistilBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DistilBertForSequenceClassification for predictions without further training.


---

## Find relevant terms for each cluster label using LIME

### Define a utility function for multicore processing

In [12]:
from termfinder import LimeTermFinder

In [13]:
def get_relevant_terms_mp(model, corpus, input_data):
    """Collect LIME term weights for a batch of (label, document) index pairs.

    A single ``LimeTermFinder`` is built from ``model`` and ``corpus`` and
    queried once per pair in ``input_data``.

    Args:
        model: Model object handed to ``LimeTermFinder``.
        corpus: Corpus object handed to ``LimeTermFinder``; its ``labels`` and
            ``docs`` sequences are indexed to fill the output records.
        input_data: Iterable of ``(label_idx, data_idx)`` pairs.

    Returns:
        A flat list of dicts with the keys ``'label'``, ``'term'``,
        ``'weight'`` and ``'data_id'``, one per term reported for each pair.
        Pairs for which the finder returns a falsy result (``None`` or an
        empty mapping) contribute no records.
    """
    finder = LimeTermFinder(model, corpus)
    records = []

    for label_idx, data_idx in input_data:
        term_weights = finder.get_relevant_terms(data_idx, label_idx)

        # Nothing was reported for this pair: move on to the next one.
        if not term_weights:
            continue

        records.extend(
            {'label': corpus.labels[label_idx],
             'term': term,
             'weight': weight,
             'data_id': corpus.docs[data_idx]}
            for term, weight in term_weights.items()
        )

    return records

### Split the data into multiple batches

In [14]:
import numpy as np

In [15]:
# Mapping of label index -> data indices, consumed below as (label_idx, data_idxs) items.
# Presumably groups documents by the label the model assigns them -- confirm in BertModel.
label_to_data_idx_dict = model.label_to_data_idx(corpus)

In [16]:
# Flatten the label -> data indices mapping into one (label_idx, data_idx) row per pair.
input_data = np.array([
    (label_idx, data_idx)
    for label_idx, data_idxs in label_to_data_idx_dict.items()
    for data_idx in data_idxs
])

Get the number of available physical CPU cores

In [17]:
import psutil

In [18]:
# logical=False reports physical cores only (hyper-threads are not counted).
psutil.cpu_count(logical=False)

22

Set the number of parallel jobs

In [19]:
# Number of joblib workers; also the number of batches the input is split into.
lime_jobs = 12

Split the input pairs into one batch per job

In [20]:
# One batch per worker; np.array_split allows batches of unequal size.
input_slices = np.array_split(input_data, lime_jobs)

Finally, find relevant terms using `LIME`

In [21]:
from joblib import Parallel, delayed

In [None]:
# Run one task per input batch; each task returns its own list of term records.
terms_list_tmp = Parallel(n_jobs=lime_jobs, verbose=10, batch_size=1)(
    delayed(get_relevant_terms_mp)(model, corpus, input_batch)
    for input_batch in input_slices
)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed: 219.8min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed: 219.8min remaining: 659.3min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed: 219.8min remaining: 307.7min
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed: 219.8min remaining: 157.0min
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed: 219.8min remaining: 73.3min


Build a DataFrame out of `terms_list_tmp`

In [None]:
# Concatenate the per-batch record lists into a single flat list.
df_data = []
for batch_records in terms_list_tmp:
    df_data.extend(batch_records)

In [None]:
# Peek at the first three records.
df_data[:3]

In [None]:
import pandas as pd

In [None]:
# Build the DataFrame from the flattened list of record dicts.
relevant_terms_df = pd.DataFrame(df_data)

In [None]:
# Preview the first rows.
relevant_terms_df.head()

---

## Save retrieved terms to a file

In [None]:
# Output location for the versioned CSV of relevant terms.
terms_dir = 'terms'
filename = f'relevant_terms_wikidata_bert_{version}.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)

In [None]:
# Create the output directory first so the save cannot fail on a missing
# folder after the long-running LIME computation.
os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)
# Write the terms as UTF-8 CSV without the DataFrame index column.
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---

## Check for pending joblib processes

In [None]:
from multiprocessing import active_children

In [None]:
# Lists the live child processes of this kernel process.
# NOTE(review): this only reports children tracked by `multiprocessing`;
# confirm that it actually sees the joblib (loky) workers.
active_children()

---