# Find cluster-related terms using LIME

## Load the data

In [1]:
# Path components that later cells combine with os.path.join.
# root_dir is a relative path, so it resolves against the kernel's working directory.
root_dir = '../..'
# Data directory, joined under root_dir.
data_dir = 'data'
# Corpus directory, joined under root_dir/data_dir.
corpus_dir = 'corpus'
# Source directory, joined under root_dir and appended to sys.path.
src_dir = 'src'

In [None]:
# Version tag interpolated into the corpus, chunks, model, word-index and output filenames.
version = 'v1'

In [2]:
import os 
import sys

In [3]:
# Put the project's source directory on the import path so that modules located
# there can be imported by later cells. The membership check keeps this cell
# idempotent: re-running it does not append the same entry again.
if os.path.join(root_dir, src_dir) not in sys.path:
    sys.path.append(os.path.join(root_dir, src_dir))

In [4]:
# Versioned corpus JSON file, located at <root_dir>/<data_dir>/<corpus_dir>/.
corpus_filename = f'wikidata_corpus_{version}.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [5]:
# Versioned chunks JSON file, stored in the same directory as the corpus file.
chunks_filename = f'wikidata_chunks_{version}.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [6]:
from training import TrainingCorpus

In [7]:
# Instantiate a TrainingCorpus, then load the corpus file and the chunks file into it.
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the model

In [8]:
from model import TensorflowModel

Using TensorFlow backend.


In [9]:
# Directory joined under root_dir/data_dir to locate the model and word-index files.
models_dir = 'models'

In [10]:
# Versioned saved model file (.h5), located at <root_dir>/<data_dir>/<models_dir>/.
model_filename = f'wikidata_nn_model_{version}.h5'
model_filepath = os.path.join(root_dir, data_dir, models_dir, model_filename)

In [11]:
# Versioned word-index CSV file, stored in the same directory as the model file.
word_index_filename = f'wikidata_nn_word_index_{version}.csv'
word_index_filepath = os.path.join(root_dir, data_dir, models_dir, word_index_filename)

In [12]:
# Build the model wrapper from the saved model file and the word-index file.
nn_model = TensorflowModel(model_filepath, word_index_filepath)



---

## Find relevant terms for each cluster label using LIME

### Define utility function for multicore processing

In [13]:
from termfinder import LimeTermFinder

In [14]:
def get_relevant_terms_mp(model, corpus, input_data):
    """Collect the relevant terms for a batch of (label, document) index pairs.

    A single ``LimeTermFinder`` is created for the whole batch and queried once
    per pair. Pairs for which the finder returns nothing (``None`` or an empty
    mapping) contribute no rows.

    Parameters
    ----------
    model
        Model object handed to ``LimeTermFinder``.
    corpus
        Corpus object handed to ``LimeTermFinder``; its ``labels`` and ``docs``
        sequences are indexed to fill the ``'label'`` and ``'data_id'`` fields.
    input_data
        Iterable of ``(label_idx, data_idx)`` pairs.

    Returns
    -------
    list of dict
        One dict per (pair, term) with the keys ``'label'``, ``'term'``,
        ``'weight'`` and ``'data_id'``, in input order.
    """
    finder = LimeTermFinder(model, corpus)
    rows = []

    for label_idx, data_idx in input_data:
        # Note the argument order: the finder takes the data index first.
        weight_by_term = finder.get_relevant_terms(data_idx, label_idx)
        if not weight_by_term:
            continue

        rows.extend(
            {'label': corpus.labels[label_idx],
             'term': term,
             'weight': weight,
             'data_id': corpus.docs[data_idx]}
            for term, weight in weight_by_term.items()
        )

    return rows

### Split the data into multiple batches

In [15]:
import numpy as np

In [16]:
# Mapping whose items are (label index, iterable of data indices); flattened into pairs in the next cell.
label_to_data_idx_dict = nn_model.label_to_data_idx(corpus)

In [17]:
# Flatten the mapping into one (label_idx, data_idx) row per labelled item,
# so the rows can be split evenly across workers.
input_data = np.array([
    (label, member)
    for label, members in label_to_data_idx_dict.items()
    for member in members
])

Get the number of available physical CPU cores

In [18]:
import psutil

In [19]:
psutil.cpu_count(logical=False)

22

Set the number of parallel jobs

In [20]:
# Number of parallel LIME workers: at most 12, but never more than the cores
# available on this machine, so smaller machines are not oversubscribed.
# psutil.cpu_count may return None when the count cannot be determined, hence
# the fallbacks to the logical count and finally to a single worker.
lime_jobs = min(12, psutil.cpu_count(logical=False) or psutil.cpu_count() or 1)

Compute batches

In [21]:
# One slice per worker; array_split tolerates a row count that is not a
# multiple of lime_jobs. The last expression displays the size of the first slice.
input_slices = np.array_split(input_data, indices_or_sections=lime_jobs)
input_slices[0].shape[0]

10

Finally, find relevant terms using `LIME`

In [22]:
from joblib import Parallel, delayed

In [23]:
# Run one delayed call of get_relevant_terms_mp per input slice.
# The result is a list with one sub-list of row dicts per slice.
terms_list_tmp = Parallel(n_jobs=lime_jobs, verbose=10, batch_size=1)(
    delayed(get_relevant_terms_mp)(nn_model, corpus, slice_rows)
    for slice_rows in input_slices
)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  1.9min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:  2.0min remaining:  5.9min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed:  2.3min remaining:  3.2min
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed:  2.5min remaining:  1.8min
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed:  3.0min remaining:   59.7s
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed:  3.3min finished


Build a DataFrame out of `terms_list_tmp`

In [24]:
# Concatenate the per-slice result lists into a single flat list of row dicts.
df_data = []
for sublist in terms_list_tmp:
    df_data.extend(sublist)

In [25]:
df_data[:3]

[{'label': 'Q64',
  'term': 'germany',
  'weight': 0.26230446460146156,
  'data_id': 0},
 {'label': 'Q64',
  'term': 'berlin',
  'weight': 0.16725196567664263,
  'data_id': 0},
 {'label': 'Q64',
  'term': 'largest_city',
  'weight': 0.16549557059539513,
  'data_id': 0}]

In [26]:
import pandas as pd

In [27]:
# Pin the column set and order explicitly: the frame (and the CSV written from
# it) then has the same schema even when no relevant terms were found and
# df_data is empty.
relevant_terms_df = pd.DataFrame(df_data, columns=['label', 'term', 'weight', 'data_id'])

In [28]:
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q64,germany,0.262304,0
1,Q64,berlin,0.167252,0
2,Q64,largest_city,0.165496,0
3,Q64,capital,0.155119,0
4,Q64,convention_venues,0.027901,1


---

## Save retrieved terms to a file

In [29]:
# Output CSV for the retrieved terms:
# <root_dir>/<data_dir>/terms/relevant_terms_wikidata_nn_<version>.csv
terms_dir = 'terms'
filename = f'relevant_terms_wikidata_nn_{version}.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)

In [34]:
filepath

'../../data/terms/relevant_terms_wikidata_nn.csv'

In [30]:
# to_csv does not create missing directories, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(filepath), exist_ok=True)
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---

## Check for pending joblib processes

In [31]:
from multiprocessing import active_children

In [33]:
active_children()

[]

---