In [1]:
import json
import numpy as np

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Location of the raw conversation export, relative to the notebook's working directory.
data_file_path = "data/sample_conversations.json"

# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of relying on the
# platform's locale default, which differs between operating systems.
with open(data_file_path, encoding="utf-8") as data_file:
    raw_data = json.load(data_file)

# Flatten the nested structure: one entry per message 'Text', taken from every
# item in 'Messages' of every item in 'Issues', in file order.
messages = [message['Text'] for datum in raw_data['Issues'] for message in datum['Messages']]

In [23]:
# TF-IDF settings, kept in one place so they are easy to tune:
#   - word-level unigrams through trigrams, English stop words removed
#   - max_df / min_df are floats, i.e. document-frequency proportions used to
#     discard terms that are too common or too rare
tfidf_options = dict(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.008,
    min_df=0.0009,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the messages and build the document-term matrix in one pass.
X = vectorizer.fit_transform(messages)

# X.shape is (number of messages, number of retained terms).
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 22264, n_features: 916


In [24]:
#n_components = 5000

#svd = TruncatedSVD(n_components)
#lsa = make_pipeline(svd, Normalizer(copy=False))

#X = lsa.fit_transform(X)
#explained_variance = svd.explained_variance_ratio_.sum()
#print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [25]:
n_clusters = 1000

# A fixed random_state makes the centroid initialisation and mini-batch sampling
# repeatable, so the cluster ids and top terms printed below are stable across
# kernel restarts.
km = MiniBatchKMeans(n_clusters=n_clusters, n_init=1, init_size=10000, batch_size=5000,
                     random_state=42)

km.fit(X)

print("Top terms per cluster:")
# For each centroid: feature indices ordered from largest to smallest centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer the replacement when it exists and fall back for older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Show at most the first 20 clusters, with the 10 highest-weighted terms of each.
for i in range(min(n_clusters, 20)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: channels working issues having right channels tickets live performance live purchase double
Cluster 1: discount hi cable associated ahead sort couldn town make sure associated account confirmed march
Cluster 2: created ticket set created 2pm enjoy install takes hours later place
Cluster 3: perfect oh sounds great sounds actually yeah zip code free book free fraudulent
Cluster 4: appreciate asap need new new modem really like cancel customer years years cancel guess
Cluster 5: welcome great welcome great day loyal zip code flight number friday free charge free book free fraudulent
Cluster 6: departure flight number departure number departure flight number fine hear missed flight hear missed sorry hear missed missed flight flight flight number
Cluster 7: thx book like sounds great sounds paid assist order number assist order helps funds nancy
Cluster 8: free book choice free book choice like send deal having thats book like like offer
Cluster 9: great ne

In [26]:
# Cluster id assigned to every message in the corpus (same order as `messages`).
message_predictions = km.predict(X)

# Project the query text into the same TF-IDF space that the model was fitted on.
test_sentence = vectorizer.transform(['wifi help'])
#test_sentence = lsa.transform(test_sentence)

# predict() returns one label per input row; there is a single query row here.
test_sentence_prediction = km.predict(test_sentence)
query_cluster = test_sentence_prediction[0]

# Keep every corpus message that landed in the same cluster as the query.
similar_messages = [
    text
    for text, cluster_id in zip(messages, message_predictions)
    if cluster_id == query_cluster
]
similar_messages

['I do have my wifi connection up',
 'My cellphone seems to not be dialing out at all right now but I do have wifi. Help!',
 'Whenever i try to connect, I do make sure that both my chromecast and my nexus are housed on the same wifi network which i made sure is relayed through a stronger buffalo router. But everytime i play a movie over the network attached storage, it just disconnects',
 'Which system? There are 4 systems at work here. The chromecast, the nexus, the wifi router, the television',
 'It seems your modem must have been insecure and people have access to it. When last did you change you wifi password?',
 'It seems your modem must have been insecure and people have access to it. When last did you change you wifi password?',
 'Still can connect to wifi',
 'Only wifi works',
 'Hello, I just noticed that my 3G service is no longer working on my phone. However, wifi is working fine. Can you help me with this issue?',
 'Wifi working',
 'Hello, I just noticed that the service for

In [None]:
#labels = km.labels_
#metrics.silhouette_score(X, labels, metric='euclidean')