# Customer Service Data Clustering

In [1]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import os
import re

import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from MulticoreTSNE import MulticoreTSNE as TSNE

import pickle

from tqdm import tqdm

from sklearn.cluster import DBSCAN

### Configuration

In [2]:
# Input spreadsheet, resolved relative to the notebook's working directory.
path = os.path.join("..","data","./newdata_clean.xlsx")
# Worker count: sizes the preprocessing pool and is passed as n_jobs/workers
# to DBSCAN, t-SNE, Doc2Vec and LDA further down.
n_cpu = 15
# Mini-batch size handed to MiniBatchKMeans.
batch_size = 10000
# Largest cluster count tried by find_optimal_clusters (even values 2..max_k).
max_k = 40
max_features = 256  # TF-IDF vocabulary cap (top terms by corpus frequency); also reused as the Doc2Vec vector size and, halved, as the PCA width before t-SNE.
# Pickle file that caches the preprocessed corpus.
loadpath = "processed_data_not_rmsw"

### Read Data

In [3]:
# Load the spreadsheet and discard every row that contains a missing value.
df = pd.read_excel(path).dropna()

# Learn one integer id per distinct category name.
le = preprocessing.LabelEncoder().fit(df['catName'].unique())
class_list = list(le.classes_)
num_classes = len(class_list)
print("number of classes:",num_classes)

# Replace the category names by their integer ids; the question text is the corpus.
df.loc[:,'catName'] = le.transform(df.loc[:,'catName'])
data = df['question']

number of classes: 64


### Preprocess Data

In [4]:
import nltk
# Resources needed by the helpers in this cell:
#   punkt                       -> word_tokenize
#   stopwords                   -> stopwords.words('english')
#   averaged_perceptron_tagger  -> pos_tag
#   wordnet                     -> WordNetLemmatizer
# The last two were not fetched before, so a fresh environment failed with
# LookupError as soon as preprocessing started.
# NOTE(review): recent NLTK releases may ask for differently named packages
# (e.g. 'punkt_tab'); follow the LookupError message if one is raised.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

import langid

from nltk.corpus import stopwords

# English stop words plus product/brand terms that appear in this corpus.
stw = stopwords.words('english') + ['nbsp', 'powerdirector', 'cyberlink', 'powerdvd', 'power', 'director', 'ba']
#print(stw)
print("Stopwords length: {}".format(len(stw)))

def _filter(ori_x):
    x = re.sub('<[^<]*?/?>', ' ', ori_x)        # remove all html tag
    x = re.sub('https?:\/\/[^ ]*', ' ', x)  # remove all url
    x = re.sub('\S*@\S*\s?', ' ', x)        # remove all email address
    x = re.sub('\S*\.\S*\s?', ' ', x, flags=re.IGNORECASE)        # remove all filename
    x = re.sub('[^a-z A-Z]', ' ', x)        # remove all non-english alphabat
    return x
'''
def _correct_word(text1):
    pattern = re.compile(r"(.)\1{2,}")
    text2 = pattern.sub(r"\1\1", text1) # reduce lengthening
    #if text1 != text2:
    #    print(text1, text2)
    text3 = spell(text2).lower() # spell correction
    #if text2 != text3:
    #    print(text2, text3)
    return text3
'''
def _get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

def _lemmatization(tokens):
    """Lemmatize each token using its POS tag.

    Tokens whose tag has no WordNet equivalent are lemmatized as nouns.
    Returns a new list with one lemma per input token.
    """
    return [
        wnl.lemmatize(word, pos=_get_wordnet_pos(pos) or wordnet.NOUN)
        for word, pos in pos_tag(tokens)
    ]

def _remove_stopword(tokens):
    """Return the tokens that are longer than two characters and not in ``stw``."""
    return [word for word in tokens if word not in stw and len(word) > 2]

def preprocess(sentence):
    """Clean, tokenize and lemmatize one message.

    Returns:
        (text, tokens): the lemmas joined by single spaces, and the lemma list.
    """
    cleaned = _filter(sentence.lower())
    # Spell correction and stop-word removal (_remove_stopword) are currently
    # disabled; only lemmatization is applied to the tokens.
    tokens = _lemmatization(nltk.word_tokenize(cleaned))
    return " ".join(tokens), tokens

def process_batch(batch):
    """Preprocess a batch of raw messages, dropping unusable ones.

    A message is skipped when langid labels it non-English with a score below
    -100, or when it yields three tokens or fewer after preprocessing (those
    are printed together with their processed form).

    Returns:
        (clean_batch, reduced_batch, token_batch): parallel lists holding the
        processed text, the original text and the token list of each kept message.
    """
    clean_batch, reduced_batch, token_batch = [], [], []
    for s in tqdm(batch):
        lang, score = langid.classify(s)
        if lang != "en" and score < -100:
            # not English according to langid
            continue

        processed, tokens = preprocess(s)
        if len(tokens) <= 3:
            # too short to be useful
            print(s,processed)
            continue

        reduced_batch.append(s)
        clean_batch.append(processed)
        token_batch.append(tokens)
    return clean_batch, reduced_batch, token_batch

Stopwords length: 186


[nltk_data] Downloading package punkt to
[nltk_data]     /home/student/05/b05505004/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/05/b05505004/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
from multiprocessing import Pool

clean_data, reduced_data, token_data = [], [], []

# Split the corpus into n_workers contiguous slices; the last slice also
# absorbs the remainder of the integer division.
n_workers = n_cpu
n_data = len(data)
ret = [None] * n_workers
with Pool(processes=n_workers) as pool:
    for i in range(n_workers):
        batch_start = (n_data // n_workers) * i
        batch_end = n_data if i == n_workers - 1 else (n_data // n_workers) * (i + 1)
        batch = data[batch_start:batch_end]
        ret[i] = pool.apply_async(process_batch, [batch])
    # No further tasks will be submitted; wait for every worker to finish.
    pool.close()
    pool.join()

# Concatenate the per-worker results in slice order.
for result in ret:
    clean_batch, reduced_batch, token_batch = result.get()
    clean_data.extend(clean_batch)
    reduced_data.extend(reduced_batch)
    token_data.extend(token_batch)
print("done")

  0%|          | 29/7098 [00:05<1:44:26,  1.13it/s]

help<br><br>Attach File : <a href='http://csupload.cyberlink.com/upload-file/support/cs/2018-09-23/CS001936718/CL_PHOTODIRECTOR_DATA.zip' target=_blank>CL_PHOTODIRECTOR_DATA.zip</a> help attach file


  0%|          | 1/7106 [00:06<13:08:42,  6.66s/it]

Dudihovkcv<br><br>Attach File : <a href='http://csupload.cyberlink.com/upload-file/support/cs/2018-11-12/CS001953678/feedback_data.zip' target=_blank>feedback_data.zip</a> dudihovkcv attach file


  1%|▏         | 91/7098 [00:08<05:56, 19.65it/s]t]

hi<br><br>Attach File : <a href='http://csupload.cyberlink.com/upload-file/support/cs/2018-02-05/CS001854032/CL_PHOTODIRECTOR_DATA.zip' target=_blank>CL_PHOTODIRECTOR_DATA.zip</a> hi attach file


  0%|          | 16/7098 [00:08<1:51:55,  1.05it/s]

﻿<html><div dir="auto"> <br> <br> <div data-smartmail="gmail_signature"> A</div> </div>   </html><br/>﻿ a


  1%|▏         | 103/7098 [00:08<06:21, 18.36it/s]]

I Like you i like you


  0%|          | 7/7098 [00:09<5:59:20,  3.04s/it]]

great<br><br>Attach File : <a href='http://csupload.cyberlink.com/upload-file/support/cs/2018-09-18/CS001935208/CL_PHOTODIRECTOR_DATA.zip' target=_blank>CL_PHOTODIRECTOR_DATA.zip</a> great attach file


  2%|▏         | 140/7098 [00:10<06:28, 17.92it/s]]Process ForkPoolWorker-9:
Process ForkPoolWorker-4:
Process ForkPoolWorker-10:
Process ForkPoolWorker-2:
Process ForkPoolWorker-8:
Process ForkPoolWorker-13:


KeyboardInterrupt: 

In [None]:
def find_optimal_clusters(data, max_k, batch_size):
    """Plot the K-means SSE (inertia) for k = 2, 4, ..., max_k.

    Args:
        data: feature matrix to cluster.
        max_k: largest cluster count to try (inclusive when even).
        batch_size: mini-batch size for MiniBatchKMeans.
    """
    iters = range(2, max_k+1, 2)

    sse = []
    progress = tqdm(iters)
    for n_clusters in progress:
        km = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, random_state=20)
        km.fit(data)
        sse.append(km.inertia_)
        progress.set_description('Fit {} clusters'.format(n_clusters))

    fig, axis = plt.subplots(1, 1)
    axis.plot(iters, sse, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(iters)
    axis.set_xticklabels(iters)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')

In [None]:
def cluster_and_plot(data, cluster_algo, n_clusters, eps, calculate_portion, draw_portion):
    """Cluster ``data`` and scatter-plot a random subsample under four 2-D projections.

    Args:
        data: feature matrix with one row per document (dense array or
            scipy sparse matrix).
        cluster_algo: "kmeans" (MiniBatchKMeans fitted on every row) or
            "dbscan" (DBSCAN fitted on the random subsample only).
        n_clusters: number of K-means clusters; unused for DBSCAN.
        eps: DBSCAN neighbourhood radius; unused for K-means.
        calculate_portion: the projections are computed on
            ``n_rows // calculate_portion`` randomly chosen rows.
        draw_portion: of those rows, ``n_sample // draw_portion`` are drawn.

    Returns:
        ``(data, labels)`` for K-means (one label per input row), or
        ``(subsample, labels)`` for DBSCAN (one label per subsampled row).

    Raises:
        ValueError: if ``cluster_algo`` is neither "kmeans" nor "dbscan".
    """
    if cluster_algo not in ("kmeans", "dbscan"):
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("cluster_algo must be 'kmeans' or 'dbscan', got {!r}".format(cluster_algo))

    n_data = data.shape[0]
    print("Input Data shape:", data.shape)

    # random sample (n_sample) points
    np.random.seed(5)
    n_sample = n_data // calculate_portion
    print("Only calculate {} data points".format(n_sample))
    sample_items = np.random.choice(range(n_data), size=n_sample, replace=False)

    if cluster_algo == "kmeans":
        labels = MiniBatchKMeans(n_clusters=n_clusters, batch_size=batch_size, random_state=20).fit_predict(data)
        npdata = data[sample_items,:]
        label_subset = labels[sample_items]
    else:
        npdata = data[sample_items]
        labels = DBSCAN(eps=eps, n_jobs=n_cpu).fit_predict(npdata)
        label_subset = labels
        print("Number of noise data: {}".format(len(label_subset[label_subset == -1])))

    # number of clusters
    max_label = max(labels)
    print("Number of clusters: {}".format(max_label + 1))

    # The projections need a dense array: the TF-IDF matrix is sparse.
    # ``npdata`` itself is left untouched because it is part of the return value.
    dense = npdata.toarray() if hasattr(npdata, "toarray") else np.asarray(npdata)

    pca = PCA(n_components=2, whiten=True).fit_transform(dense)
    print("PCA done")
    # PCA cannot keep more components than the subsample has rows or columns.
    n_pca = min(max_features // 2, dense.shape[0], dense.shape[1])
    tsne_pca = TSNE(n_jobs=n_cpu, n_iter=5000).fit_transform(PCA(n_components=n_pca).fit_transform(dense))
    print("tsne_pca done")
    tsne = TSNE(n_jobs=n_cpu, n_iter=5000).fit_transform(dense)
    print("tsne30 done")
    tsne_per50 = TSNE(n_jobs=n_cpu, perplexity=50, n_iter=5000).fit_transform(dense)
    print("tsne50 done")

    # draw only (n_draw) points
    np.random.seed(5)
    n_draw = n_sample // draw_portion
    print("Only draw {} data points".format(n_draw))
    idx = np.random.choice(range(pca.shape[0]), size=n_draw, replace=False)

    # DBSCAN marks noise with label -1, so that group is drawn as well.
    start_idx = 0 if cluster_algo == "kmeans" else -1

    f, ax = plt.subplots(2, 2, figsize=(13, 10))
    ax[0,0].set_title('PCA Cluster Plot')
    ax[1,0].set_title('t-SNE & PCA Cluster Plot')
    ax[0,1].set_title('t-SNE Cluster Plot')
    ax[1,1].set_title('t-SNE Cluster Plot (Perplexity 50)')
    for i in range(start_idx, max_label + 1):
        sub_idx = idx[label_subset[idx] == i]
        # One scatter call per group so matplotlib cycles the colour; the old
        # unused colour array divided by (max_label + 1), which is zero when
        # DBSCAN labels every point as noise.
        ax[0,0].scatter(pca[sub_idx, 0], pca[sub_idx, 1])
        # Only this panel carries labels, so the figure legend lists each group once.
        ax[1,0].scatter(tsne_pca[sub_idx, 0], tsne_pca[sub_idx, 1], label="Group {} | {}".format(i,len(sub_idx)))
        ax[0,1].scatter(tsne[sub_idx, 0], tsne[sub_idx, 1])
        ax[1,1].scatter(tsne_per50[sub_idx, 0], tsne_per50[sub_idx, 1])
    f.legend() # plot only one legend

    if cluster_algo == "kmeans":
        return data, labels
    return npdata, labels

In [None]:
# Cache the three parallel corpus lists so preprocessing need not be rerun.
output = dict(clean_data=clean_data, reduced_data=reduced_data, token_data=token_data)
with open(loadpath, "wb") as f:
    pickle.dump(output, f)

In [None]:
# Restore the cached corpus lists.
# NOTE: unpickling can execute arbitrary code; only load cache files you created yourself.
with open(loadpath, "rb") as f:
    output = pickle.load(f)
clean_data, reduced_data, token_data = (
    output[key] for key in ("clean_data", "reduced_data", "token_data")
)

## TF-IDF Clustering

In [None]:
print("max_feature",max_features)
# Terms in fewer than 0.1% or more than 95% of the documents are ignored,
# and the vocabulary is capped at max_features terms.
tfidf = TfidfVectorizer(min_df=0.001, max_df=0.95, max_features=max_features, stop_words='english')
# Learn the vocabulary/IDF weights and vectorize the corpus in one step.
text = tfidf.fit_transform(clean_data)

In [None]:
# Plot K-means SSE against the cluster count for the TF-IDF matrix.
find_optimal_clusters(text, max_k, batch_size)

### Clustering by Kmeans

In [None]:
# K-means with 10 clusters on the TF-IDF matrix; the projections use 1/500 of
# the rows and 1/10 of those are drawn. The returned matrix and labels are
# reused by the get_top_keywords call later in the notebook.
sample_text, sample_text_labels = cluster_and_plot(text, "kmeans", 10, None, 500, 10)

### Clustering by DBSCAN

In [None]:
# DBSCAN (eps=0.999) on a random 1/50 subsample of the TF-IDF rows; 1/10 of
# the subsample is drawn. The return value is not captured, so the cell also
# displays the returned (subsample, labels) tuple.
cluster_and_plot(text, "dbscan", None, 0.999, 50, 10)

### Get keywords of each TF-IDF cluster
作法是將每個 cluster 的前幾高 tfidf 字取出來，作為這個 cluster 的代表字。但是效果不好，且很多 cluster 都有相同常見的字(ex: download)

In [None]:
def get_top_keywords(data, text, clusters, labels, n_terms):
    """Print the highest-weighted terms and two example documents per cluster.

    Args:
        data: raw documents, one per row of ``text``.
        text: document-term matrix (scipy sparse or dense array).
        clusters: cluster id of each document.
        labels: term name of each column of ``text``.
        n_terms: number of top terms printed per cluster.

    For every cluster the mean feature vector is computed; the ``n_terms``
    columns with the largest mean are printed (ascending), followed by two
    documents with their Euclidean distance to that mean.
    """
    data = np.array(data)
    clusters = np.array(clusters)
    labels = np.array(labels)
    # Densify once (previously both toarray() and todense() were called) and
    # accept input that is already dense.
    text_feature = text.toarray() if hasattr(text, "toarray") else np.asarray(text)
    centroids = pd.DataFrame(text_feature).groupby(clusters).mean()  # [(clusters) rows x (feature) columns]

    for i, r in centroids.iterrows():
        in_cluster = clusters == i
        sub_text_list = data[in_cluster]
        centroid = np.asarray(r)  # plain ndarray so argsort yields positional indices

        dist = np.linalg.norm(text_feature[in_cluster] - centroid, axis=1)

        print('\nCluster {}'.format(i))
        print(', '.join([labels[t] for t in np.argsort(centroid)[-n_terms:]]))
        # NOTE(review): [-2:] selects the two documents FARTHEST from the
        # cluster mean; if representative examples are wanted this should
        # probably be [:2] — confirm the intent before changing.
        print('\n'.join([re.sub(' +', ' ',_filter(sub_text_list[t])) + " | " + str(dist[t]) for t in np.argsort(dist)[-2:]]))
            
# TfidfVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when present and fall back on older releases.
_get_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
get_top_keywords(reduced_data, sample_text, sample_text_labels, _get_names(), 10)

## Doc2vec Clustering

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# One TaggedDocument per preprocessed message, tagged with its index as a string.
tagged_data = [TaggedDocument(words=tokens, tags=[str(i)]) for i, tokens in enumerate(token_data)]

max_epochs = 100
alpha = 0.025

# Let gensim own the whole training schedule: a single train() call runs
# max_epochs passes and decays the learning rate from alpha to min_alpha.
# The previous manual loop called train() max_epochs times, each call running
# model.epochs passes of its own at a hand-edited fixed learning rate.
model = Doc2Vec(vector_size=max_features,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                workers=n_cpu,
                dm=1,
                epochs=max_epochs)

model.build_vocab(tagged_data)
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=model.epochs)

model.save("d2v.model")
print("Model Saved")

In [None]:
# The locally trained model is not used here; a pretrained model is loaded instead.
#model= Doc2Vec.load("d2v.model")
model= Doc2Vec.load("enwiki_dbow/doc2vec.bin")
'''
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)


# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)


# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])
'''

# Infer one vector per kept message from its token list (rather than looking
# up trained document vectors by tag) and stack them into a 2-D array.
n_reduced_data = len(reduced_data)
docvec = np.array([model.infer_vector(token_data[i]) for i in tqdm(range(n_reduced_data))])
print("Number of data: {}".format(n_reduced_data))

In [None]:
# Plot K-means SSE against the cluster count for the inferred document vectors.
find_optimal_clusters(docvec, max_k, batch_size)

### Clustering by Kmeans

In [None]:
# K-means with 10 clusters on the document vectors; the projections use 1/50
# of the rows and 1/10 of those are drawn. The return value is discarded.
_ = cluster_and_plot(docvec, "kmeans", 10, None, 50, 10)

### Clustering by DBSCAN

In [None]:
# DBSCAN (eps=2.1) on a random 1/50 subsample of the document vectors; 1/10 of
# the subsample is drawn. The return value is discarded.
_ = cluster_and_plot(docvec, "dbscan", None, 2.1, 50, 10)

### Reference
* [Clustering documents with TFIDF and KMeans](https://www.kaggle.com/jbencina/clustering-documents-with-tfidf-and-kmeans)
* [Analyzing tf-idf results in scikit-learn](https://buhrmann.github.io/tfidf-analysis.html)

## LDA Topic Model
如果不移除 stopword 的話效果很差，主題的字都會是 of, for, it...

In [None]:
import gensim
from gensim.models import LdaMulticore

# Vocabulary over the token lists: drop tokens seen in fewer than 15 documents
# or in more than half of them, and keep at most 100000 tokens.
dictionary = gensim.corpora.Dictionary(token_data)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

bow_corpus = [dictionary.doc2bow(doc) for doc in token_data]

from gensim import corpora, models
# Bound to its own name: this cell used to rebind `tfidf`, which overwrote the
# scikit-learn TfidfVectorizer that the TF-IDF clustering cells rely on.
tfidf_model = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf_model[bow_corpus]

### LDA using BOW

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus (2 passes over the data)
# and print each topic returned by print_topics(-1).
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=n_cpu)
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

### LDA using TF-IDF

In [None]:
# Same LDA configuration as the bag-of-words run, but fitted on the
# TF-IDF-weighted corpus.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=n_cpu)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

## Bert Sentence Encoding Clustering

1. 要先把 bert server 開起來
    `gpu0 bert-serving-start -model_dir uncased_L-12_H-768_A-12 -num_worker 4  -port 1355 -max_seq_len 40 -device_map 0`
2. 直到出現 `all set, ready to serve request!` 才可以 run client 的 command

**Note:**

1. 如果是 run 大的 model 會 load 不進去 GPU 中，跑出 OOM
2. 如果系統記憶體不夠則不會出現 all set, ready to serve request!，CPU 版來說，最多只能 -num_worker 2

In [None]:
from bert_serving.client import BertClient
# Connect to the bert-serving server on port 1355; the server must already be
# running (see the start-up command in the notes above this cell).
bc = BertClient(port=1355)
print("Start predicting")
# Encode all cleaned sentences.
bert_output = bc.encode(clean_data)
# Display the first ten encodings as the cell output.
bert_output[:10]

In [None]:
# Pickle file that caches the corpus lists together with their BERT encodings.
bert_data_path = "bert_base.pkl"

In [None]:
# Cache the corpus lists together with their BERT encodings.
bert_data = dict(
    clean_data=clean_data,
    reduced_data=reduced_data,
    token_data=token_data,
    bert_data=bert_output,
)
with open(bert_data_path, "wb") as f:
    pickle.dump(bert_data, f)

In [None]:
# Restore the cached corpus lists and BERT encodings.
# NOTE: unpickling can execute arbitrary code; only load cache files you created yourself.
with open(bert_data_path, "rb") as f:
    bert_data = pickle.load(f)
clean_data, reduced_data, token_data, bert_output = (
    bert_data[key] for key in ("clean_data", "reduced_data", "token_data", "bert_data")
)

In [None]:
# Plot K-means SSE against the cluster count for the BERT sentence encodings.
find_optimal_clusters(bert_output, max_k, batch_size)

### Clustering by Kmeans

In [None]:
# K-means with 10 clusters on the BERT encodings; the projections use 1/50 of
# the rows and 1/10 of those are drawn. The return value is discarded.
_ = cluster_and_plot(bert_output, "kmeans", 10, None, 50, 10)

### Clustering by DBSCAN

In [None]:
# DBSCAN (eps=5.95) on a random 1/50 subsample of the BERT encodings; 1/10 of
# the subsample is drawn. The return value is discarded.
_ = cluster_and_plot(bert_output, "dbscan", None, 5.95, 50, 10)