# Find cluster-related terms using LIME

## Load the data

In [1]:
# Root of the repository, expressed relative to the working directory
# (presumably the notebook's own folder -- confirm before moving the file).
root_dir = '../..'

# Source tree that gets added to the import path, joined onto `root_dir`.
src_dir = 'src'

# Data folder under `root_dir`, and the corpus sub-folder inside it.
data_dir = 'data'
corpus_dir = 'corpus'

In [2]:
import os 
import sys

In [3]:
# Make the project's source directory importable. The membership check keeps
# the cell idempotent: re-running it no longer appends duplicate entries.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
# Location of the corpus JSON file: <root_dir>/<data_dir>/<corpus_dir>/<corpus_filename>.
corpus_filename = 'alaska_corpus.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [5]:
# Location of the chunks JSON file, stored in the same folder as the corpus file.
chunks_filename = 'alaska_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [6]:
from training import TrainingCorpus

In [7]:
# Create an empty TrainingCorpus, then populate it from the corpus file and
# the chunks file (in that order).
alaska_corpus = TrainingCorpus()
alaska_corpus.load(corpus_filepath)
alaska_corpus.load_chunks(chunks_filepath)

---

## Load the model

In [8]:
from model import TensorflowTfIdfModel

Using TensorFlow backend.


In [9]:
# Sub-folder of the data directory that holds the trained model artifacts.
models_dir = 'models'

In [10]:
# Location of the saved model file (.h5): <root_dir>/<data_dir>/<models_dir>/<model_filename>.
model_filename = 'alaska_nn_tfidf_model.h5'
model_filepath = os.path.join(root_dir, data_dir, models_dir, model_filename)

In [11]:
# Location of the vectorizer file that accompanies the model, stored in the
# same models folder. The name is a plain string literal: there is nothing to
# interpolate, so it needs no f-prefix.
vectorizer_filename = 'alaska_tfidf_vectorizer.pkl'
vectorizer_filepath = os.path.join(root_dir, data_dir, models_dir, vectorizer_filename)

In [12]:
# Build the model wrapper from the saved model file and the vectorizer file.
nn_model = TensorflowTfIdfModel(model_filepath, vectorizer_filepath)

---

## Find relevant terms for each cluster label using LIME

### Define utility function for multicore processing

In [13]:
from termfinder import LimeTermFinder

In [14]:
def get_relevant_terms_mp(model, corpus, input_data):
    """Collect LIME-relevant terms for one batch of (label, document) pairs.

    Intended to be run as a single parallel job: one ``LimeTermFinder`` is
    created for the batch and queried once per pair.

    Parameters
    ----------
    model
        Model handed to ``LimeTermFinder``.
    corpus
        Corpus handed to ``LimeTermFinder``; must expose ``labels`` and
        ``docs``, indexable by label index and data index respectively.
    input_data
        Iterable of ``(label_idx, data_idx)`` pairs.

    Returns
    -------
    list of dict
        One record per (pair, term) with the keys ``'label'``, ``'term'``,
        ``'weight'`` and ``'data_id'``. Pairs for which the finder returns
        an empty/falsy result contribute no records.
    """
    finder = LimeTermFinder(model, corpus)
    records = []

    for label_idx, data_idx in input_data:
        term_weights = finder.get_relevant_terms(data_idx, label_idx)

        # Nothing relevant found for this pair: move on to the next one.
        if not term_weights:
            continue

        records.extend(
            {'label': corpus.labels[label_idx],
             'term': term,
             'weight': weight,
             'data_id': corpus.docs[data_idx]}
            for term, weight in term_weights.items()
        )

    return records

### Split the data into multiple batches

In [15]:
import numpy as np

In [16]:
# Ask the model for a mapping over the corpus; the next cell iterates it as
# label index -> collection of data indices.
label_to_data_idx_dict = nn_model.label_to_data_idx(alaska_corpus)

In [17]:
# Flatten the label -> data-indices mapping into one array of
# (label_idx, data_idx) pairs, so it can be split into batches.
input_data = np.array(
    [
        (label_idx, data_idx)
        for label_idx, data_idxs in label_to_data_idx_dict.items()
        for data_idx in data_idxs
    ]
)

Get the number of physical CPU cores (hyper-threads are not counted)

In [18]:
import psutil

In [19]:
# Physical cores only: logical=False leaves hyper-threads out of the count.
psutil.cpu_count(logical=False)

22

Set the number of parallel jobs

In [20]:
# Number of parallel workers; also used as the number of batches the input
# is split into, so each worker receives one batch.
lime_jobs = 12

Compute batches

In [21]:
# Split the (label_idx, data_idx) pairs into `lime_jobs` batches;
# np.array_split tolerates a length that is not evenly divisible.
input_slices = np.array_split(input_data, lime_jobs)

Finally, find relevant terms using `LIME`

In [22]:
from joblib import Parallel, delayed

In [23]:
# Run one job per batch across `lime_jobs` workers. batch_size=1 dispatches
# the tasks one at a time instead of letting joblib group them.
# The result is a list with one list of term records per batch.
terms_list_tmp = Parallel(n_jobs=lime_jobs, verbose=10, batch_size=1)(
    delayed(get_relevant_terms_mp)(nn_model, alaska_corpus, batch)
    for batch in input_slices
)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed: 11.2min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed: 11.3min remaining: 33.8min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed: 11.3min remaining: 15.8min
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed: 11.4min remaining:  8.1min
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed: 11.7min remaining:  3.9min
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed: 11.8min finished


Build a DataFrame out of `terms_list_tmp`

In [26]:
# Flatten the per-batch result lists into a single list of term records.
df_data = [record for sublist in terms_list_tmp for record in sublist]

In [27]:
# Peek at the first three records to sanity-check their structure.
df_data[:3]

[{'label': 'ENTITY#44',
  'term': 'nikon_d3200_dslr_camera',
  'weight': 0.8744028530235662,
  'data_id': 0},
 {'label': 'ENTITY#44',
  'term': 'nikon_d3200',
  'weight': 0.8809640176133363,
  'data_id': 1},
 {'label': 'ENTITY#44',
  'term': 'nikon_d3200',
  'weight': 0.8871804648013564,
  'data_id': 2}]

In [28]:
import pandas as pd

In [29]:
# One row per record; the dict keys (label, term, weight, data_id) become the columns.
relevant_terms_df = pd.DataFrame(df_data)

In [30]:
# Show the first rows of the resulting table.
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,nikon_d3200_dslr_camera,0.874403,0
1,ENTITY#44,nikon_d3200,0.880964,1
2,ENTITY#44,nikon_d3200,0.88718,2
3,ENTITY#44,nikon_d3200_digital_dslr_camera,0.783001,3
4,ENTITY#44,24,0.068562,3


---

## Save retrieved terms to a file

In [31]:
# Output location for the CSV: <root_dir>/<data_dir>/<terms_dir>/<filename>.
terms_dir = 'terms'
filename = 'relevant_terms_alaska_nn_tfidf.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)

In [32]:
# Ensure the output folder exists first: `to_csv` does not create missing
# parent directories, and failing here would waste the long LIME run above.
os.makedirs(os.path.dirname(filepath), exist_ok=True)

# Write the table as UTF-8 CSV without the DataFrame's row index.
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---

## Check for pending joblib processes

In [33]:
from multiprocessing import active_children

In [34]:
# List the live child processes of this kernel as seen by `multiprocessing`;
# an empty list means none are still running.
active_children()

[]

---