In [1]:
import json
import numpy as np

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data_file_path = "data/sample_conversations.json"

with open(data_file_path) as data_file:
    raw_data = json.load(data_file)
    
messages = [message['Text'] for datum in raw_data['Issues'] for message in datum['Messages']]

In [3]:
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english',
                             max_df=0.01, min_df=0.001)
X = vectorizer.fit_transform(messages)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 22264, n_features: 853


In [4]:
n_components = 500

svd = TruncatedSVD(n_components)
lsa = make_pipeline(svd, Normalizer(copy=False))

X = lsa.fit_transform(X)
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 91%


In [5]:
n_clusters = 30

km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: 100 10 active users 10 minutes 6pm 20 discount order able help 703 afternoon apartment need 50
Cluster 1: 15 assist cell assist cell number charged twice 1p cellphone number check appointment march 100 evening
Cluster 2: additional assist order address phone address serviceable available appointment march book appears authorized apologize inconvenience awesome great
Cluster 3: having trouble apartment need update acme apartment number days assist account number happy help happy assist appointment march area serviceable
Cluster 4: 10 active invoice john smith know enjoy rest installed installation inbox issues isn error
Cluster 5: 20 discount airline 20 address phone ahead apologize active airport apt apartment need
Cluster 6: assist cell assist cell number appointment march changed charged confirmation number assist new address concert april apt
Cluster 7: appointment march declined apt april deliver days ago book like appointment area serviceable date

In [6]:
message_predictions = km.predict(X)

test_sentence = vectorizer.transform(['wifi is free'])
test_sentence = lsa.transform(test_sentence)

test_sentence_prediction = km.predict(test_sentence)

similar_messages = [message for i, message in enumerate(messages)
                    if message_predictions[i] == test_sentence_prediction]
similar_messages

["Hi! I placed an order on your website and I can't find the tracking number. Can you help me find out where my package is?",
 'I think I used my email address to log in.',
 'What should I do!',
 "I'm interested in upgrading my plan.",
 'Can you tell me a bit about Prime?',
 'Hello',
 'Hello Werner how may I help you today?',
 'I have recently moved to a new apartment and would like to update my contact details',
 'Sure I can help you with that? Could you please provide me with your new address?',
 'OK Wernzio, I have updated your address to the system',
 'Great, thank. I also need to place a order for a new installation as the place I live currently does not have the required wiring',
 'Ok let me go ahead and request a work order for a new installation. Give me a moment...',
 'OK a installation order has been places. Seems the earilest we will be able to help you  is from the 20th February onwards',
 'Yes. It soes',
 'No that is it right now.',
 'Hey Acme, I am having problems with my

In [None]:
#labels = km.labels_
#metrics.silhouette_score(X, labels, metric='euclidean')