# Find cluster-related terms using LIME

## Load the data

In [1]:
# Path components that the cells below join into the source, corpus, model and output locations.
root_dir = '../..'  # relative base path that every other location is joined onto
data_dir = 'data'  # joined under root_dir for the corpus, model and terms files
corpus_dir = 'corpus'  # joined under data_dir for the corpus and chunks JSON files
src_dir = 'src'  # joined under root_dir and appended to sys.path

In [2]:
import os 
import sys

In [3]:
# Make <root_dir>/<src_dir> importable for the project modules imported in later cells.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
src_path = os.path.join(root_dir, src_dir)
if src_path not in sys.path:
    sys.path.append(src_path)

In [4]:
# Identifiers interpolated into the input and output file names below.
dataset_name = 'alaska'  # prefix of the corpus, chunks, model and word-index file names
test_name = 'sampling_terms_test1'  # suffix of the output CSV file name

In [5]:
# Corpus file: <root_dir>/<data_dir>/<corpus_dir>/<dataset_name>_corpus.json
corpus_filename = f'{dataset_name}_corpus.json'
corpus_filepath = os.path.join(root_dir, data_dir, corpus_dir, corpus_filename)

In [6]:
# Chunks file, stored next to the corpus file: <root_dir>/<data_dir>/<corpus_dir>/<dataset_name>_chunks.json
chunks_filename = f'{dataset_name}_chunks.json'
chunks_filepath = os.path.join(root_dir, data_dir, corpus_dir, chunks_filename)

In [7]:
from training import TrainingCorpus

In [8]:
# Create a TrainingCorpus, then call load() with the corpus file and
# load_chunks() with the chunks file, in that order.
# NOTE(review): presumably both calls fill the object in place - confirm in training.py.
alaska_corpus = TrainingCorpus()
alaska_corpus.load(corpus_filepath)
alaska_corpus.load_chunks(chunks_filepath)

---

## Load the model

In [9]:
from model import TensorflowModel

Using TensorFlow backend.


In [10]:
# Sub-directory of data_dir from which the model file and its word index are read.
models_dir = 'models'

In [11]:
# Model file: <root_dir>/<data_dir>/<models_dir>/<dataset_name>_nn_model.h5
model_filename = f'{dataset_name}_nn_model.h5'
model_filepath = os.path.join(root_dir, data_dir, models_dir, model_filename)

In [12]:
# Word-index file: <root_dir>/<data_dir>/<models_dir>/<dataset_name>_nn_word_index.csv
word_index_filename = f'{dataset_name}_nn_word_index.csv'
word_index_filepath = os.path.join(root_dir, data_dir, models_dir, word_index_filename)

In [13]:
# Build the model wrapper from the .h5 model path and the word-index CSV path.
nn_model = TensorflowModel(model_filepath, word_index_filepath)



---

## Find relevant terms for each cluster label using LIME

### Define a utility function for multicore processing

In [14]:
from termfinder import LimeTermFinder

In [15]:
def get_relevant_terms_mp(model, corpus, input_data, min_fts=15, max_fts=30):
    """Collect the relevant terms for one batch of (label, document) pairs.

    Intended to run as a single parallel task: it builds its own
    ``LimeTermFinder`` and walks the batch sequentially.

    Parameters
    ----------
    model
        Model handed to ``LimeTermFinder``.
    corpus
        Corpus handed to ``LimeTermFinder``. It must also expose ``labels``
        and ``docs``, which are indexed with ``label_idx`` and ``data_idx``.
    input_data
        Iterable of ``(label_idx, data_idx)`` pairs.
    min_fts, max_fts
        Forwarded unchanged to ``LimeTermFinder``. The defaults (15 and 30)
        are the values that were previously hard-coded in this function.

    Returns
    -------
    list of dict
        One record per retrieved term, with the keys ``label``, ``term``,
        ``weight`` and ``data_id``. Pairs for which the finder returns an
        empty or falsy result contribute no records.
    """
    term_finder = LimeTermFinder(model, corpus, min_fts=min_fts, max_fts=max_fts)

    records = []
    for label_idx, data_idx in input_data:
        relevant_terms = term_finder.get_relevant_terms(data_idx, label_idx)
        if not relevant_terms:
            continue

        # Both lookups are constant for the pair, so resolve them once
        # instead of once per term.
        label = corpus.labels[label_idx]
        data_id = corpus.docs[data_idx]
        records.extend(
            {'label': label, 'term': term, 'weight': weight, 'data_id': data_id}
            for term, weight in relevant_terms.items()
        )

    return records

### Split the data into multiple batches

In [16]:
import numpy as np

In [17]:
# Mapping consumed in the next cell as {label_idx: iterable of data_idx}.
# NOTE(review): presumably each label index maps to the corpus documents assigned to it - confirm in model.py.
label_to_data_idx_dict = nn_model.label_to_data_idx(alaska_corpus)

In [18]:
# Flatten the {label_idx: data_idxs} mapping into one (label_idx, data_idx)
# row per entry, so the rows can be split into batches.
input_data = np.array(
    [
        (lbl, doc_idx)
        for lbl, doc_idxs in label_to_data_idx_dict.items()
        for doc_idx in doc_idxs
    ]
)

Get the number of available physical CPU cores

In [19]:
import psutil

In [20]:
# logical=False: count physical cores only, per the psutil documentation.
psutil.cpu_count(logical=False)

22

Set the number of parallel jobs

In [21]:
# Number of joblib workers; also used as the number of batches the input is split into.
lime_jobs = 12

Compute batches

In [22]:
# One batch per worker. np.array_split accepts a row count that is not a
# multiple of lime_jobs, in which case batch sizes differ by at most one row.
input_slices = np.array_split(input_data, lime_jobs)

Finally, find relevant terms using `LIME`

In [23]:
from joblib import Parallel, delayed

In [24]:
import time
import datetime

# Submit one get_relevant_terms_mp task per input batch to joblib.
# The duration is measured with perf_counter(), a monotonic clock, so system
# clock adjustments during the run cannot distort it (time.time() can jump).
start_time = time.perf_counter()
terms_list_tmp = Parallel(n_jobs=lime_jobs, verbose=10, batch_size=1)(
    delayed(get_relevant_terms_mp)(nn_model, alaska_corpus, input_batch)
    for input_batch in input_slices
)

end_time = time.perf_counter()

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   1 tasks      | elapsed:  9.1min
[Parallel(n_jobs=12)]: Done   3 out of  12 | elapsed:  9.2min remaining: 27.5min
[Parallel(n_jobs=12)]: Done   5 out of  12 | elapsed:  9.3min remaining: 13.0min
[Parallel(n_jobs=12)]: Done   7 out of  12 | elapsed:  9.3min remaining:  6.6min
[Parallel(n_jobs=12)]: Done   9 out of  12 | elapsed: 10.8min remaining:  3.6min
[Parallel(n_jobs=12)]: Done  12 out of  12 | elapsed: 14.0min finished


In [25]:
# Convert the elapsed seconds to a timedelta so that it prints as H:MM:SS.ffffff.
delta_t = end_time - start_time
elapsed_time = str(datetime.timedelta(seconds=delta_t))
print(f'Elapsed time: {elapsed_time}')

Elapsed time: 0:13:58.276370


Build a DataFrame out of `terms_list_tmp`

In [26]:
# Concatenate the per-batch result lists into a single flat list of records.
df_data = [record for batch_records in terms_list_tmp for record in batch_records]

In [27]:
# Peek at the first three records.
df_data[:3]

[{'label': 'ENTITY#44',
  'term': 'nikon_d3200_dslr_camera',
  'weight': 0.5635722869060087,
  'data_id': 0},
 {'label': 'ENTITY#44',
  'term': '18_55mm',
  'weight': 0.265820261228533,
  'data_id': 0},
 {'label': 'ENTITY#44',
  'term': '55_200mm_lenses',
  'weight': 0.1758571540790074,
  'data_id': 0}]

In [28]:
import pandas as pd

In [29]:
# One row per record; the dict keys (label, term, weight, data_id) become the columns.
relevant_terms_df = pd.DataFrame(df_data)

In [30]:
# Preview the first rows of the DataFrame.
relevant_terms_df.head()

Unnamed: 0,label,term,weight,data_id
0,ENTITY#44,nikon_d3200_dslr_camera,0.563572,0
1,ENTITY#44,18_55mm,0.26582,0
2,ENTITY#44,55_200mm_lenses,0.175857,0
3,ENTITY#44,black_ebay,0.12929,0
4,ENTITY#44,nikon_d3200,0.812544,1


---

## Save retrieved terms to a file

In [31]:
# Output CSV: <root_dir>/<data_dir>/<terms_dir>/relevant_terms_<dataset_name>_nn_<test_name>.csv
terms_dir = 'terms'
filename = f'relevant_terms_{dataset_name}_nn_{test_name}.csv'
filepath = os.path.join(root_dir, data_dir, terms_dir, filename)
filepath  # display the resolved path

'../../data/terms/relevant_terms_alaska_nn_sampling_terms_test1.csv'

In [32]:
# Create the output directory first if it is missing, so that the result of
# the long LIME run is not lost to a failed save.
out_dir = os.path.dirname(filepath)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# index=False: the positional row index is not written to the file.
relevant_terms_df.to_csv(filepath, encoding='utf-8', index=False)

---

## Check for pending joblib processes

In [33]:
from multiprocessing import active_children

In [36]:
# Lists the live child processes of the current process (per the
# multiprocessing documentation); the empty list shown below means none were found.
active_children()

[]

---