# Related Question Analysis

In [None]:
import torch
import torch.nn as nn

import re
from tqdm import tqdm, trange
import pickle
import numpy as np
from multiprocessing import Pool

import seaborn as sn
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt


# File-system locations used by this notebook (pickled BERT data to read,
# file the analysis results are written to).
loadpath = "processed_data_bert_expand"
bert_data_path = "bert_expand.pkl"
analysis_output_path = "related_question_analysis"

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Report the chosen device, followed by one empty line.
print('Using device:', device, end='\n\n')

Read the dataset together with its BERT sentence representations.

In [None]:
# Load the pickled bundle produced earlier.
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.
with open(bert_data_path, "rb") as f:
    bert_data = pickle.load(f)

# Unpack the four entries stored in the bundle.
clean_data = bert_data["clean_data"]
reduced_data = bert_data["reduced_data"]
token_data = bert_data["token_data"]
bert_output = bert_data["bert_data"]
print("Type: ", type(bert_output), bert_output.shape)

# Move the sentence vectors onto the selected device.
bert_tensor = torch.from_numpy(bert_output).to(device)
print(bert_tensor.size())

# Scale every row to unit L2 norm, so that a dot product between two rows
# equals their cosine similarity.
row_norms = bert_tensor.norm(dim=1, keepdim=True)
bert_norm = bert_tensor / row_norms

## Testing
這邊模擬 testing 的情境，當有一個新的 query sentence，要先把句子預處理完後再丟進 `predict()` function。這邊我偷懶就直接拿之前已經預處理好的句子丟進去。

要注意要執行 `predict()` function 前還是要在 shell 用 `bert-serving-start` 把 model run 起來。

`predict()` function 會先取得 query sentence 的 sentence representation，接著再與先前從 dataset 取得的 BERT sentence representation 計算 cosine similarity，數值越高代表與這個 query sentence 越相似。（本 notebook 下方實際定義的是 `predict_index()`，它以資料集中既有句子的 index 直接取用預先算好的向量。）

In [None]:
from sklearn import preprocessing
import pandas as pd
import os

# Read the question sheet; when a question occurs more than once only its
# last row is kept.
path = os.path.join("..","data","./newdata_clean.xlsx")
df = pd.read_excel(path)
df = df.drop_duplicates(subset="question", keep='last')

# Fit a label encoder on the category names and list the id -> name mapping.
le = preprocessing.LabelEncoder()
le.fit(df['catName'].unique())
class_list = list(le.classes_)
num_classes = len(class_list)

for i, c in enumerate(class_list):
    print(i, c, sep=": ")

print("number of classes:",num_classes)

# Replace the category names by their integer ids.
df.loc[:, 'catName'] = le.transform(df['catName'])

# Lookup table: question text (index) -> category id.
data = df[['question', 'catName']].set_index('question')
print(data.head())
data.head()

In [None]:


def predict_index(index, num_related=3, verbal=False):
    """Return the sentences most similar to the dataset sentence at *index*.

    Similarity is the dot product between the row ``bert_norm[index]`` and
    every row of the module-level ``bert_norm`` matrix. The query sentence
    itself is never part of the result.

    Args:
        index: Position of the query sentence in ``bert_norm`` /
            ``reduced_data`` / ``clean_data``.
        num_related: Maximum number of related sentences to return. Fewer are
            returned when the dataset does not hold that many other sentences.
        verbal: When true, print the query and every returned sentence
            (HTML tags stripped) together with its similarity score.

    Returns:
        A list of entries of ``reduced_data``, most similar first.
    """
    if verbal:
        print("Query: {}".format(clean_data[index]))

    # Column vector of similarities, one row per dataset sentence.
    similarity = torch.matmul(bert_norm, bert_norm[index].view(-1, 1))
    total = similarity.size(0)
    wanted = max(num_related, 0)

    # Take one candidate more than requested: the query normally ranks first
    # and is dropped below. Slicing keeps this safe for tiny datasets.
    order = torch.argsort(similarity, dim=0, descending=True).view(-1)
    candidates = order[:wanted + 1].tolist()

    # Exclude the query by position instead of assuming it sits at rank 0;
    # duplicated sentences tie with it and may be sorted in front of it.
    query_pos = int(index) % total
    neighbours = [pos for pos in candidates if pos != query_pos][:wanted]

    ret = []
    for pos in neighbours:
        ret.append(reduced_data[pos])
        if verbal:
            print("\n" + "=" * 10 + "Similarity: {}".format(similarity[pos]) + "=" * 10)
            print(re.sub(r'<[^<]*?/?>', '', reduced_data[pos]))  # strip HTML tags from the printed sentence
    return ret

In [None]:
# For every sentence collect its own category id (ground truth) and the
# category ids of the sentences that predict_index() returns for it.
gt_list = []
predict_list = []

for index in trange(len(clean_data)):
    ret = predict_index(index, verbal=False)

    # Category id of the query sentence, looked up by its text.
    gt = data.loc[reduced_data[index]]['catName']
    gt_list.append(gt)

    # Category ids of the returned sentences, in the order they were returned.
    predict_class_index = [data.loc[s]['catName'] for s in ret]
    predict_list.append(predict_class_index)

# All three lengths are printed so they can be compared by eye.
print("clean_data:", len(clean_data))
print("gt_list:", len(gt_list))
print("predict_list:", len(predict_list))

In [None]:
# Bundle the ground-truth ids and the predicted ids and pickle them to disk
# so they can be reloaded later.
analysis_data = dict(gt=gt_list, predict=predict_list)
with open(analysis_output_path, "wb") as f:
    pickle.dump(analysis_data, f)

In [None]:
# Reload the pickled analysis results and unpack both lists.
# NOTE(review): pickle.load can execute arbitrary code - only open files you trust.
with open(analysis_output_path, "rb") as f:
    analysis_data = pickle.load(f)
gt_list, predict_list = analysis_data["gt"], analysis_data["predict"]

In [None]:
def _plot_confusion_heatmap(matrix, labels, title, filename):
    """Draw *matrix* as an annotated heatmap titled *title* and save it to *filename*."""
    frame = pd.DataFrame(matrix, columns=labels, index=labels)

    plt.figure(figsize=(35, 15))
    plt.title(title, y=1.03, fontsize=25)
    heatmap = sn.heatmap(frame, annot=True, annot_kws={"size": 16})
    plt.ylabel('Ground Truth', fontsize=20)
    plt.xlabel('Prediction', fontsize=20)
    heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=45, horizontalalignment="right")
    plt.savefig(filename, bbox_inches="tight")


def analysis(predicts, gt, labels):
    """Plot and save the raw and the row-normalised confusion matrix.

    Two figures are written to the working directory:
    ``ConfusionMatrix_newdata.jpg`` and
    ``Normalized_ConfusionMatrix_newdata.jpg``.

    Args:
        predicts: Predicted label for every sample.
        gt: Ground-truth label for every sample, aligned with *predicts*.
        labels: Labels to include, in the order used for rows and columns.

    Returns:
        None.
    """
    matrix = confusion_matrix(gt, predicts, labels=labels)
    _plot_confusion_heatmap(matrix, labels, 'Confusion Matrix', 'ConfusionMatrix_newdata.jpg')

    # Divide every row by its total. Rows without any ground-truth sample stay
    # at zero instead of producing NaN (and a divide-by-zero warning).
    row_totals = matrix.sum(axis=1)[:, np.newaxis]
    normalized_matrix = np.divide(
        matrix.astype('float'),
        row_totals,
        out=np.zeros(matrix.shape, dtype=float),
        where=row_totals != 0,
    ).round(2)
    _plot_confusion_heatmap(
        normalized_matrix,
        labels,
        'Normalized Confusion Matrix',
        'Normalized_ConfusionMatrix_newdata.jpg',
    )

In [None]:

# Only the first (top-ranked) predicted category of every query is evaluated.
predict_list_first = np.array(predict_list)[:, 0]

# Turn the integer ids back into category names for readable axis labels.
gt_str = list(le.inverse_transform(gt_list))
predict_str = list(le.inverse_transform(predict_list_first))

analysis(predicts=predict_str, gt=gt_str, labels=class_list)

## Future Work
從最後輸出的結果來看其實還不錯，可是在經過一些調查後發現 BERT 並不適合這樣直接當作 sentence encoder，目前想到的解決方法如下。

1. 先在一些 task 上 fine-tune，像是最一開始做的 supervised classification，之後再拿 `[CLS]` 的 output 作為 sentence representation。
2. 使用 [Universal Sentence Encoder](https://arxiv.org/pdf/1803.11175.pdf)。