In [1]:
import json
import numpy as np

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data_file_path = "data/sample_conversations.json"

with open(data_file_path) as data_file:
    raw_data = json.load(data_file)
    
messages = [message['Text'] for datum in raw_data['Issues'] for message in datum['Messages']]

In [3]:
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), stop_words='english',
                             max_df=0.008, min_df=0.001)
X = vectorizer.fit_transform(messages)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 22264, n_features: 822


In [5]:
n_components = 500

svd = TruncatedSVD(n_components)
lsa = make_pipeline(svd, Normalizer(copy=False))

X = lsa.fit_transform(X)
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 92%


In [6]:
n_clusters = 30

km = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=1,
                     init_size=1000, batch_size=1000)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: 20 15 2794 conaway street 100 10 active users 10a 12 4pm 2p 9am
Cluster 1: 10 minutes 6p 24 hours 1st declined 20 discount 9am 2794 conaway 12th 10 active 4p
Cluster 2: active users ave active available appointment march 9am 9a noon authorized refund arrive account hacked begin
Cluster 3: cell phone address phone apartment actually anytime cell number channels bought confirmation number care
Cluster 4: accounts apologize 9am booster 9a noon 20 9a 4pm active apologize inconvenience
Cluster 5: couldn departure city days ago deliver confirmation available flights credit card delivery address confirm come
Cluster 6: check inbox change changed code 20 coffee choice checked charged com charged twice
Cluster 7: 20 discount 20 15 bought 24 bought tickets 5pm 12th 2794 conaway 703
Cluster 8: cable box care cable channels available appointment march guess available flights great thank authorized refund awesome thanks bloomington
Cluster 9: let pull account let t

In [7]:
message_predictions = km.predict(X)

test_sentence = vectorizer.transform(['wifi help free'])
test_sentence = lsa.transform(test_sentence)

test_sentence_prediction = km.predict(test_sentence)

similar_messages = [message for i, message in enumerate(messages)
                    if message_predictions[i] == test_sentence_prediction]
similar_messages

["Hi! I placed an order on your website and I can't find the tracking number. Can you help me find out where my package is?",
 'I think I used my email address to log in.',
 'My battery exploded!',
 "It's on fire, it's melting the carpet!",
 'What should I do!',
 "I'm interested in upgrading my plan.",
 'Can you tell me a bit about Prime?',
 'My friend has it, and it seems like a great deal',
 'Hello',
 'Hello Werner how may I help you today?',
 'I have recently moved to a new apartment and would like to update my contact details',
 'Sure I can help you with that? Could you please provide me with your new address?',
 'Ok, 5 Seaman Ave, Apt 3F, New York, New York, 10034',
 'Let me update that information on our system',
 'OK Wernzio, I have updated your address to the system',
 'Great, thank. I also need to place a order for a new installation as the place I live currently does not have the required wiring',
 'Ok let me go ahead and request a work order for a new installation. Give me a

In [None]:
#labels = km.labels_
#metrics.silhouette_score(X, labels, metric='euclidean')