In [14]:
import json
import numpy as np

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Location of the conversation dump, relative to the notebook's working directory.
data_file_path = "data/sample_conversations.json"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the corpus contains non-ASCII text, which a locale-dependent
# default (e.g. cp1252) would mis-decode or reject.
with open(data_file_path, encoding="utf-8") as data_file:
    raw_data = json.load(data_file)

# Flatten the nested structure into one list of message texts: every entry of
# raw_data['Issues'] holds a 'Messages' list whose items carry a 'Text' field.
messages = [message['Text'] for datum in raw_data['Issues'] for message in datum['Messages']]

In [20]:
# Word-level TF-IDF over unigrams, bigrams and trigrams with English stop
# words removed. min_df / max_df are given as floats, i.e. document-frequency
# proportions: terms in fewer than 0.1% or more than 0.8% of the messages are
# dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    min_df=0.001,
    max_df=0.008,
)

# Learn the vocabulary and build the message-by-term matrix in one pass.
X = vectorizer.fit_transform(messages)

n_samples, n_features = X.shape
print("n_samples: %d, n_features: %d" % (n_samples, n_features))

n_samples: 22264, n_features: 822


In [None]:
# Optional LSA step (TruncatedSVD followed by a Normalizer), currently
# disabled: every statement in this cell is commented out, so X stays the raw
# TF-IDF matrix.
# NOTE(review): n_components = 5000 is larger than the 822 features reported
# in the vectorizer cell's saved output; TruncatedSVD cannot extract more
# components than there are features, so lower it before re-enabling — confirm.
# NOTE(review): if this is re-enabled, the commented-out
# `lsa.transform(test_sentence)` lines in the query cells need to be
# re-enabled as well so queries are projected into the same space.
#n_components = 5000

#svd = TruncatedSVD(n_components)
#lsa = make_pipeline(svd, Normalizer(copy=False))

#X = lsa.fit_transform(X)
#explained_variance = svd.explained_variance_ratio_.sum()
#print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [21]:
n_clusters = 300

# MiniBatchKMeans initialises centroids and samples mini-batches randomly.
# Without a fixed seed every run produces a different clustering, so the
# clusters printed here and the query results below cannot be reproduced.
random_state = 42

km = MiniBatchKMeans(n_clusters=n_clusters, n_init=1, init_size=10000, batch_size=5000,
                     random_state=random_state)

km.fit(X)

print("Top terms per cluster:")
# For each centroid, feature indices sorted by descending centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() is deprecated and removed in newer scikit-learn
# releases; use its replacement when available and fall back otherwise so the
# cell runs on both old and new versions.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# Only the first 20 clusters are listed to keep the output readable.
for i in range(min(n_clusters, 20)):
    print("Cluster %d:" % i, end='')
    # Ten highest-weighted terms of this centroid.
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: internet modem wasn deliver recently ordered arrived purchased required signature make sure recently purchased
Cluster 1: excellent anytime missed flight service area wanted info john smith news box
Cluster 2: file card file 50 running credit card error date credit 2794 conaway ahead
Cluster 3: left door door isn ordered new left new modem got home package week fine
Cluster 4: confirm don okay thank check com smith fee flight leaving free book free
Cluster 5: oh wow guess wasn peter mistake pull account sorry hear let hear let let help
Cluster 6: ok thank bye flight flight number free charge free book free food ordering account food ordering food flights
Cluster 7: restarting unable zip code free food ordering account food ordering food flights flight number departure flight number
Cluster 8: great thanks wanted really let help fast works wow thanks help oh process
Cluster 9: ordered oak high oak bookshelf correct oak bookshelf ok thank overnight oak s

In [22]:
# Cluster label of every message, in the same order as `messages`.
message_predictions = km.predict(X)

# Project the query into the fitted TF-IDF space (transform only, no refit).
test_sentence = vectorizer.transform(['wifi help'])
# Checked on the sparse TF-IDF row, before any further projection: a query
# whose terms are all outside the vocabulary becomes the all-zero vector.
query_has_known_terms = test_sentence.nnz > 0
#test_sentence = lsa.transform(test_sentence)

# predict() returns one label per input row, i.e. a 1-element array here.
test_sentence_prediction = km.predict(test_sentence)

if query_has_known_terms:
    # Compare against the scalar label rather than the 1-element array, so the
    # filter does not depend on the implicit truth value of an array.
    query_cluster = test_sentence_prediction[0]
    similar_messages = [message for message, label in zip(messages, message_predictions)
                        if label == query_cluster]
else:
    # The zero vector's nearest centroid is arbitrary, so listing that
    # cluster's messages as "similar" would be misleading.
    print("Query has no in-vocabulary terms; nearest cluster is not meaningful.")
    similar_messages = []
similar_messages

['From like a hour ago my able service just stopped workig',
 'Seems since moving into the new house things stopped working',
 'Hello, my name is josé. I hace a problem with my cellphone. It stopped working and i have a busy day today',
 "Hi, I've completely lost cellphone service on a day when I really need it for work",
 "Hi, I don't know what happened but my cellphone service is not working. Luckily the wifi on my phone does. Can you help me.",
 'Hello. My cellphone service is not working at all, and I only have wifi access. I have a lot of work to do, is there anyway this situation can be resolved quickly?',
 'There was no notice. It just stopped',
 'My cell phone service just stopped working',
 'Cellphone stopped working',
 "My phone service is out I'll do I still have Wi-Fi I have a lot of work to do today I need to get this problem solved help",
 'I have no cellphone reception',
 'My cellphone service stopped working all of a sudden even though I already paid the charge for it. 

In [7]:
# NOTE(review): this cell repeats the other 'wifi help' query cell verbatim,
# and its saved execution count (In [7]) is lower than that of the clustering
# cell (In [21]), so its saved output presumably comes from an earlier model
# fit — confirm whether this cell is still needed.

# Cluster label of every message, in the same order as `messages`.
message_predictions = km.predict(X)

# Project the query into the fitted TF-IDF space (transform only, no refit).
test_sentence = vectorizer.transform(['wifi help'])
# Checked on the sparse TF-IDF row, before any further projection: a query
# whose terms are all outside the vocabulary becomes the all-zero vector.
query_has_known_terms = test_sentence.nnz > 0
#test_sentence = lsa.transform(test_sentence)

# predict() returns one label per input row, i.e. a 1-element array here.
test_sentence_prediction = km.predict(test_sentence)

if query_has_known_terms:
    # Compare against the scalar label rather than the 1-element array, so the
    # filter does not depend on the implicit truth value of an array.
    query_cluster = test_sentence_prediction[0]
    similar_messages = [message for message, label in zip(messages, message_predictions)
                        if label == query_cluster]
else:
    # The zero vector's nearest centroid is arbitrary, so listing that
    # cluster's messages as "similar" would be misleading.
    print("Query has no in-vocabulary terms; nearest cluster is not meaningful.")
    similar_messages = []
similar_messages

["Hi! I placed an order on your website and I can't find the tracking number. Can you help me find out where my package is?",
 'I think I used my email address to log in.',
 'Can you tell me a bit about Prime?',
 'Hello Werner how may I help you today?',
 'I have recently moved to a new apartment and would like to update my contact details',
 'Great, thank. I also need to place a order for a new installation as the place I live currently does not have the required wiring',
 'Ok let me go ahead and request a work order for a new installation. Give me a moment...',
 'OK a installation order has been places. Seems the earilest we will be able to help you  is from the 20th February onwards',
 'does that suite you?',
 'good, I have scheduled. A operator will be contacted you one day prior to lock down a time slot.',
 'No that is it right now.',
 'Hey Acme, I am having problems with my cable tv...',
 'From like a hour ago my able service just stopped workig',
 'Interesting, let me quickly ha

In [None]:
# Disabled clustering-quality check: silhouette score of the fitted cluster
# labels under Euclidean distance. Both statements are commented out, so this
# cell currently does nothing.
#labels = km.labels_
#metrics.silhouette_score(X, labels, metric='euclidean')