# Find cluster-related terms using LIME

## Load the data

In [1]:
# Path components that later cells combine with os.path.join to locate the
# project sources and the data files.
root_dir = '../..'
data_dir = 'data'
corpus_dir = 'corpus'
src_dir = 'src'

In [2]:
# Version tag interpolated into the corpus, model and output file names below.
version = 'v2'

In [3]:
import os 
import sys

In [4]:
# Make the project's source directory importable. The membership check keeps
# this cell idempotent: re-running it no longer adds duplicate sys.path entries.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [5]:
# Path of the versioned corpus JSON file that is passed to corpus.load() below.
corpus_filename = f'wikidata_corpus_{version}.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [6]:
# Path of the versioned chunks JSON file that is passed to corpus.load_chunks() below.
chunks_filename = f'wikidata_chunks_{version}.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [7]:
from training import TrainingCorpus

In [8]:
# Instantiate TrainingCorpus, then call load() with the corpus file and
# load_chunks() with the chunks file.
# NOTE(review): presumably load() must run before load_chunks() — confirm in training.py.
corpus = TrainingCorpus()
corpus.load(corpus_filepath)
corpus.load_chunks(chunks_filepath)

---

## Load the model

In [9]:
from model import TensorflowModel

Using TensorFlow backend.


In [10]:
# Sub-directory of data_dir used for the model file and the word index file.
models_dir = 'models'

In [11]:
# Path of the versioned .h5 model file handed to TensorflowModel below.
model_filename = f'wikidata_nn_model_{version}.h5'
model_filepath = os.path.join(root_dir, data_dir, models_dir, model_filename)

In [12]:
# Path of the versioned word index CSV handed to TensorflowModel below.
word_index_filename = f'wikidata_nn_word_index_{version}.csv'
word_index_filepath = os.path.join(root_dir, data_dir, models_dir, word_index_filename)

In [13]:
# Build the model wrapper from the model file path and the word index file path.
nn_model = TensorflowModel(model_filepath, word_index_filepath)



---

## Find relevant terms for each cluster label using LIME

### Define utility function for multicore processing

In [14]:
from termfinder import LimeTermFinder

In [15]:
def get_relevant_terms_mp(model, corpus, input_data):
    """Collect LIME term weights for one batch of (label, document) index pairs.

    A single ``LimeTermFinder`` is created from ``model`` and ``corpus`` and
    reused for every pair in the batch.

    Args:
        model: Model object passed through to ``LimeTermFinder``.
        corpus: Corpus object passed through to ``LimeTermFinder``; its
            ``labels`` and ``docs`` sequences are indexed to fill the output
            records.
        input_data: Iterable of ``(label_idx, data_idx)`` pairs.

    Returns:
        A flat list of dicts with the keys ``'label'``, ``'term'``,
        ``'weight'`` and ``'data_id'``, one per term reported by the finder.
        Pairs for which the finder returns a falsy result add nothing.
    """
    finder = LimeTermFinder(model, corpus)
    records = []

    for label_idx, data_idx in input_data:
        term_weights = finder.get_relevant_terms(data_idx, label_idx)

        # Nothing was found for this pair (None or an empty mapping).
        if not term_weights:
            continue

        records.extend(
            {'label': corpus.labels[label_idx],
             'term': term,
             'weight': weight,
             'data_id': corpus.docs[data_idx]}
            for term, weight in term_weights.items()
        )

    return records

### Split the data into multiple batches

In [16]:
import numpy as np

In [17]:
# Mapping that the next cell iterates as label_idx -> collection of data indices.
label_to_data_idx_dict = nn_model.label_to_data_idx(corpus)

In [18]:
# Flatten the mapping into one (label_idx, data_idx) pair per entry.
input_data = np.array([
    (label, doc)
    for label, doc_idxs in label_to_data_idx_dict.items()
    for doc in doc_idxs
])

Get the number of available physical CPU cores (logical/hyper-threaded cores are excluded)

In [19]:
import psutil

In [20]:
# logical=False asks psutil for physical cores only.
psutil.cpu_count(logical=False)

22

Set the number of parallel jobs

In [21]:
# Used both as the joblib worker count and as the number of input batches.
lime_jobs = 12

Compute batches

In [22]:
# One slice per job; array_split accepts lengths that do not divide evenly.
input_slices = np.array_split(input_data, indices_or_sections=lime_jobs)
# Number of (label_idx, data_idx) pairs in the first batch.
input_slices[0].shape[0]

273

Finally, find relevant terms using `LIME`

In [23]:
from joblib import Parallel, delayed

In [24]:
# Run one get_relevant_terms_mp call per batch; the result holds one list of
# records per batch, in the order of input_slices.
terms_list_tmp = Parallel(n_jobs=lime_jobs, verbose=10, batch_size=1)(
    delayed(get_relevant_terms_mp)(nn_model, corpus, batch)
    for batch in input_slices
)

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed: 33.5min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed: 65.7min remaining: 197.0min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed: 65.9min remaining: 92.3min
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed: 66.0min remaining: 47.2min
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed: 66.1min remaining: 22.0min
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed: 66.3min finished


Build a DataFrame out of `terms_list_tmp`

In [25]:
# Concatenate the per-batch record lists into a single flat list.
df_data = [record for batch_records in terms_list_tmp for record in batch_records]

In [26]:
# Inspect the first three records.
df_data[:3]

[{'label': 'Q1754',
  'term': 'stockholm',
  'weight': 0.5558592607920941,
  'data_id': 0},
 {'label': 'Q1754',
  'term': 'sweden',
  'weight': 0.4064425521969092,
  'data_id': 0},
 {'label': 'Q1754',
  'term': 'city',
  'weight': 0.1387460025264278,
  'data_id': 0}]

In [27]:
import pandas as pd

In [28]:
# One row per record dict; the dict keys become the columns.
relevant_terms_df = pd.DataFrame(df_data)

In [29]:
# Preview the first rows of the frame.
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,Q1754,stockholm,0.555859,0
1,Q1754,sweden,0.406443,0
2,Q1754,city,0.138746,0
3,Q1754,capital,0.130178,0
4,Q1754,stockholm,0.395502,1


---

## Save retrieved terms to a file

In [30]:
# Versioned output CSV path inside the terms sub-directory of data_dir.
terms_dir = 'terms'
filename = f'relevant_terms_wikidata_nn_{version}.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)

In [31]:
# Show the resolved output path before writing to it.
filepath

'../../data/terms/relevant_terms_wikidata_nn_v2.csv'

In [32]:
# Create the target directory first: to_csv does not create missing folders,
# and failing here would throw away the long-running LIME computation above.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---

## Check for pending joblib processes

In [33]:
from multiprocessing import active_children

In [34]:
# Lists the live child processes of the current process; [] means none remain.
# NOTE(review): confirm that joblib's loky workers show up here at all.
active_children()

[]

---