## Install Required Libraries

In [None]:
!pip install transformers
!pip install shap
!pip install torchmetrics
!pip install seaborn
!pip install matplotlib
!pip install pandas
!pip install numpy 
!pip install tqdm

## Import libraries

In [None]:
import pandas as pd 
import scipy as sp
from torch import nn
import torch
import numpy as np
from transformers import BertTokenizer
from tqdm import tqdm
from torchmetrics import AUROC
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager
import seaborn as sns
sns.set_style("whitegrid")

In [None]:
from shap_utils import *
from data_utils import *
from models import TextClassifierModel, Dataset, train_model, evaluate_model

In [None]:
# Register the bundled fonts with matplotlib so that Chinese text can be rendered in plots.
font_dirs = ['../fonts/']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
for font_file in font_files:
    font_manager.fontManager.addfont(font_file)

# Configure seaborn in ONE call. Every sns.set() invocation resets the whole
# theme to its defaults, so separate calls discard each other's settings:
# previously font_scale=2 was reset to 1 by the following sns.set(font=...),
# and the "whitegrid" style chosen at import time was replaced by the default.
sns.set(style="whitegrid", font="SimHei", font_scale=2)

# Set the matplotlib font family after the theme so the theme reset cannot override it.
plt.rcParams['font.family'] = 'SimHei'

## Load Dataset

In [None]:
# Directory and file name of the raw data file.
input_path = '../data/'
data_name = 'task_output_5313.txt'
# clean_data is star-imported above (presumably from data_utils); its result is
# used as a pandas DataFrame in the following cells.
df = clean_data(input_path, data_name)

In [None]:
# For each user keep only the 50 most recent records (most users have < 5).
# Rows are sorted by event_time ascending, so the newest rows of every user_no
# group are at the end: take the tail. (head(50) kept the 50 OLDEST records,
# contradicting the intent stated here.)
# NOTE(review): assumes event_time sorts chronologically — confirm its dtype/format.
sort_df = df.sort_values(['event_time'], ascending=True).groupby('user_no')
df = sort_df.tail(50).reset_index()
df.shape

In [None]:
# df = df[:500]

## Define the input for text-classifier

In [None]:
# Model input text: cleaned title, one space, then cleaned abstract.
title_col, abstract_col = df['clean_title'], df['clean_abstract']
df['input_info'] = title_col.add(' ').add(abstract_col)
df['input_info'].head(2)

In [None]:
# Split the data into training (80%), validation (10%) and testing (10%) sets.
# The rows are shuffled once with a fixed random_state and then cut by
# position. Positional slicing replaces np.split because np.split on a
# DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
np.random.seed(112)  # seeds NumPy's global RNG; the shuffle itself uses random_state=42
shuffled_df = df.sample(frac=1, random_state=42)
train_end, val_end = int(.8*len(df)), int(.9*len(df))
df_train = shuffled_df.iloc[:train_end]
df_val = shuffled_df.iloc[train_end:val_end]
df_test = shuffled_df.iloc[val_end:]
print(len(df_train),len(df_val), len(df_test))


In [None]:
# Use the average input length, capped at 200, as the tokenizer max length.
# The builtin min keeps `length` a plain Python int (np.min over a list
# produced a NumPy integer that was then passed on as max length).
# NOTE(review): len() counts characters, not tokens — confirm this proxy is intended.
length = min(200, int(df['input_info'].apply(len).mean()))
print('length', length)

In [None]:
# Load the tokenizer for the 'bert-base-multilingual-cased' checkpoint.
# NOTE(review): do_lower_case=True is combined with a *cased* checkpoint —
# confirm that lower-casing the input is intended for this vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case = True)

In [None]:
# Wrap each split in the project's Dataset, sharing the same max length and tokenizer.
train_dataset = Dataset(df_train, length, tokenizer)
val_dataset = Dataset(df_val, length, tokenizer)
test_dataset = Dataset(df_test, length, tokenizer)

In [None]:
# Instantiate the text classifier defined in models.py.
# bert_freeze=False presumably leaves the BERT weights trainable — confirm in models.py.
model = TextClassifierModel(bert_freeze=False)

## Import tokenizer and BERT 

In [None]:
# Hyperparameters passed to train_model (and BATCH_SIZE also to evaluate_model) below.
EPOCHS = 10  # number of training epochs
LR = 1e-5  # learning rate
BATCH_SIZE = 16  # batch size

In [None]:
# Classification metric: ROC-AUC over the two classes.
# torchmetrics >= 0.11 requires an explicit `task` argument, while older
# releases reject that keyword, so fall back to the legacy signature when the
# new one is not accepted.
# NOTE(review): "multiclass" assumes train_model feeds per-class scores of
# shape (N, 2); switch to task="binary" if it passes a single positive-class score.
try:
    auroc = AUROC(task="multiclass", num_classes=2)
except (TypeError, ValueError):
    auroc = AUROC(num_classes=2)

## Train model

In [None]:
# Train the text-classifier model: train_model (from models.py) receives the
# model, the training and validation datasets, the AUROC metric, and the
# learning rate, epoch count and batch size defined above.
train_model(model, train_dataset, val_dataset, auroc, LR, EPOCHS, BATCH_SIZE)

## Evaluate Model

In [None]:
# Create a new ROC-AUC metric instance for evaluating the text-classifier model.
# torchmetrics >= 0.11 requires an explicit `task` argument, while older
# releases reject that keyword, so fall back to the legacy signature when the
# new one is not accepted.
# NOTE(review): "multiclass" assumes evaluate_model feeds per-class scores of
# shape (N, 2); switch to task="binary" if it passes a single positive-class score.
try:
    auroc = AUROC(task="multiclass", num_classes=2)
except (TypeError, ValueError):
    auroc = AUROC(num_classes=2)

In [None]:
# Run evaluate_model (from models.py) on the test dataset with the AUROC
# metric and the same batch size used for training.
evaluate_model(model, test_dataset, auroc, BATCH_SIZE)

In [None]:
# # torch.save(model, '../saved_models/bert_model.pt')
# # model = torch.load('../saved_models/bert_model.pt')
# # device = torch.device('cuda')
# # model.to(device)

## Build SHAP explainer

In [None]:
import shap

In [None]:
# define the classifier model which will be used in SHAP
def classifier_model(x, max_length=200):
    """Return the class-1 probability for every input text.

    This is the prediction function handed to ``shap.Explainer``. It reads the
    notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        x: Sequence of raw text strings (e.g. a NumPy array or pandas Series).
        max_length: Length every text is padded or truncated to. The default
            of 200 preserves the previous hard-coded value.
            NOTE(review): the datasets above are built with ``length``; pass
            that value here if the two are meant to agree.

    Returns:
        1-D NumPy array with, for each text, the softmax probability of the
        class at index 1 of the model output.
    """
    texts = [str(v) for v in x]
    if not texts:
        # Nothing to score; avoid calling the model on an empty batch.
        return np.zeros(0, dtype=np.float32)

    # Run on whichever device the model lives on instead of requiring CUDA.
    # Assumes `model` is a torch.nn.Module with at least one parameter.
    device = next(model.parameters()).device

    # Tokenize the whole batch in one call and get tensors back directly.
    encoded = tokenizer(texts, padding='max_length', max_length=max_length,
                        truncation=True, return_tensors='pt')
    input_id = encoded['input_ids'].to(device)
    # The model is called with a mask carrying an extra axis at position 1.
    mask = encoded['attention_mask'].unsqueeze(1).to(device)

    # Inference only: skip building the autograd graph.
    # NOTE(review): the model's train/eval mode is left untouched — confirm it
    # is in eval mode here so dropout does not make the explanations noisy.
    with torch.no_grad():
        logits = model(input_id, mask)

    # torch.softmax is numerically stable, unlike exp(logits) / sum(exp(logits)).
    scores = torch.softmax(logits, dim=-1)
    return scores[:, 1].cpu().numpy()

In [None]:
# # test classifier_model to make sure it works as expected
# df_try = df[['input_info']][:10]
# classifier_model(df_try['input_info'])

In [None]:
# Construct the SHAP explainer: combine SHAP with the classifier prediction
# function, passing the tokenizer as the second argument.
explainer = shap.Explainer(classifier_model, tokenizer)

## Run SHAP explainer 

In [None]:
# Inputs for the SHAP explainer: the first 10 texts whose key_label equals 1 (positive rows).
is_positive = df['key_label'] == 1
shap_input = df.loc[is_positive, 'input_info'].head(10).values

In [None]:
# Calculate the Shapley values for the selected input texts.
shap_values = explainer(shap_input)

## Show Shapley values of tokens in a single input text

In [None]:
# Render the SHAP text plot for the explained input at index 3.
shap.plots.text(shap_values[3], grouping_threshold=0.01)

In [None]:
# shap.plots.text(shap_values[3], grouping_threshold=5)

In [None]:
# shap.plots.text(shap_values[3], grouping_threshold=10)

## Show Bar Plots

In [None]:
# Show the top 10 tokens with grouping_threshold=0.01 (helper presumably from shap_utils).
show_top_n_tokens(shap_values, top_n=10, grouping_threshold=0.01)

In [None]:
# Show the top 10 tokens with grouping_threshold=1 (helper presumably from shap_utils).
show_top_n_tokens(shap_values, top_n=10, grouping_threshold=1)

In [None]:
# Show the top 10 tokens with grouping_threshold=5 (helper presumably from shap_utils).
show_top_n_tokens(shap_values, top_n=10, grouping_threshold=5)

In [None]:
# from scipy.cluster.hierarchy import dendrogram
# values, clustering = unpack_shap_explanation_contents(shap_values[0])
# fig = plt.figure(figsize=(12, 6))
# dn = dendrogram(clustering)
# plt.show()

## Find Shapley values of tags (industry and concept)

In [None]:
!pip install fuzzywuzzy

In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [None]:
# Split each space-separated tag string into a list of tags.
tags = df['tags'].map(lambda tag_string: tag_string.split(' '))

In [None]:
# Flatten the per-row tag lists into a single list, then deduplicate it.
all_tags = [single_tag for tag_list in tags for single_tag in tag_list]

unique_tags = np.unique(all_tags)

In [None]:
# Gather the tokens and their Shapley values across all explained inputs.
all_tokens = []
all_values = []
for explanation in shap_values:
    raw_values, clustering = unpack_shap_explanation_contents(explanation)
    # 0.1 and '' are passed positionally to the project helper (presumably a
    # grouping threshold and a separator — confirm in shap_utils).
    tokens, token_values, _ = process_shap_values(
        explanation.data, raw_values, 0.1, '', clustering
    )
    all_tokens.extend(tokens)
    all_values.extend(token_values)

In [None]:
# Find the most similar token for each tag and record that token's Shapley value.

# Map every token to the position of its first occurrence once, instead of
# rebuilding a NumPy array from all_tokens and scanning it for every tag.
first_index = {}
for position, token in enumerate(all_tokens):
    first_index.setdefault(token, position)

tag_values = []
for tag in unique_tags:
    # Only the best match is used, so request a single result.
    matched_token = process.extract(tag, all_tokens, limit=1)[0][0]
    # NOTE(review): if the matched token occurs several times, only the value
    # of its first occurrence is used — confirm no aggregation is wanted.
    tag_values.append(all_values[first_index[matched_token]])

In [None]:
# Rank the tags by Shapley value, highest first, and put them in a table.
descending_order = np.argsort(tag_values)[::-1]
ranked_tags = list(np.array(unique_tags)[descending_order])
ranked_values = np.array(tag_values)[descending_order]
score_df = pd.DataFrame({'tags': ranked_tags, 'shaply-value': ranked_values})

In [None]:
# Bar plot of the first rows of score_df (the highest-valued tags, since
# score_df is ordered by descending value).
top_n = 20  # number of tags shown; single source for both the slice and the title
sns.barplot(data=score_df[:top_n], x="shaply-value", y="tags", color='c')
plt.ylabel('')  # the tag names label the bars, so no axis label is needed
plt.title('top %s tags'%(top_n), fontsize=12)
plt.tight_layout()
# plt.savefig('../data/'+'tags_values'+'.png', dpi=100)
plt.show()