In [1]:
import json
import numpy as np

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the sample conversations and flatten them into a single list holding
# the 'Text' of every message of every issue, in file order.
data_file_path = "data/sample_conversations.json"

# JSON is UTF-8 by specification; state the encoding explicitly so that the
# load does not depend on the platform's default locale encoding.
with open(data_file_path, encoding="utf-8") as data_file:
    raw_data = json.load(data_file)

messages = [
    message['Text']
    for datum in raw_data['Issues']
    for message in datum['Messages']
]

In [3]:
# TF-IDF over word uni-, bi- and tri-grams with English stop words removed.
# max_df / min_df are given as fractions of the corpus: terms whose document
# frequency is above 0.8% or below 0.01% of the messages are ignored.
tfidf_options = dict(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.008,
    min_df=0.0001,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the (sparse) document-term matrix in one pass.
X = vectorizer.fit_transform(messages)

print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 22264, n_features: 8480


In [4]:
# Optional LSA step (currently disabled): reduce the TF-IDF matrix with
# TruncatedSVD and re-normalise the rows before clustering.
# NOTE(review): if this is re-enabled, X is overwritten with the reduced
# matrix, so the cluster centres computed afterwards would live in SVD space
# and would presumably need svd.inverse_transform(...) before their columns
# can be mapped to vocabulary terms; the commented-out lsa.transform(...)
# line in the query cell would have to be enabled as well -- confirm before use.
#n_components = 5000

#svd = TruncatedSVD(n_components)
#lsa = make_pipeline(svd, Normalizer(copy=False))

#X = lsa.fit_transform(X)
#explained_variance = svd.explained_variance_ratio_.sum()
#print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [5]:
# Cluster the TF-IDF vectors with mini-batch k-means, then print the
# highest-weighted vocabulary terms of the first few cluster centroids.
n_clusters = 3000
n_clusters_shown = 20   # how many clusters to list below
n_top_terms = 10        # how many terms to print per cluster

# A fixed random_state makes the (otherwise randomly initialised) clustering
# reproducible under Restart Kernel -> Run All.
km = MiniBatchKMeans(n_clusters=n_clusters, n_init=1, init_size=10000, batch_size=5000,
                     random_state=0)
km.fit(X)

print("Top terms per cluster:")
# For every centroid, the feature (column) indices sorted by descending weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
for i in range(min(n_clusters, n_clusters_shown)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :n_top_terms]:
        print(' %s' % terms[ind], end='')
    print()

Top terms per cluster:
Cluster 0: john sorry hear hi john sorry john sorry hi john john hear account hacked hear account sorry hear account account hacked hacked
Cluster 1: aspen truly 7163522001 course pretty court truly sorry 10p expensive help problem
Cluster 2: awesome awesome day great awesome day great awesome king ok great awesome awesome help stephen king stephen like great
Cluster 3: ve pulled account account payment march pulled account payment ve pulled account payment march 1st declined payment march 1st payment march 1st declined march 1st
Cluster 4: needed uou needed help thank help thank needed help thank help zipcode great let service great let quickly great let pull
Cluster 5: hope problem great problem great day fixing doesn happen finally soon ok hope hope help unacceptable
Cluster 6: computer log account leave remember log verify great news service great news provide great hear great ordered
Cluster 7: welcom idea andrew va yay yes home ll thanks 464 disregard 51859

In [9]:
# Find every message that falls into the same cluster as a free-text query.
message_predictions = km.predict(X)

# The query must go through the same fitted vectorizer as the corpus.
test_sentence = vectorizer.transform(['wifi help'])
#test_sentence = lsa.transform(test_sentence)

# predict() returns one label per input row; a single query was transformed,
# so take its scalar cluster id instead of comparing against the whole array
# (which only worked through the truth value of a one-element array).
test_sentence_prediction = km.predict(test_sentence)
test_cluster = test_sentence_prediction[0]

similar_messages = [message
                    for message, cluster in zip(messages, message_predictions)
                    if cluster == test_cluster]
similar_messages

['That is strange, it seems the signal does not connect passed the exchange.',
 'Whenever i try to connect, I do make sure that both my chromecast and my nexus are housed on the same wifi network which i made sure is relayed through a stronger buffalo router. But everytime i play a movie over the network attached storage, it just disconnects',
 'I am unable to consistently connect to the internet',
 'It seem the connect strenght has increased from our end',
 'OK I have your modems status in front of me. I see that you are syncing with the exchange at the expected speed. I do however notice you have 10 devices connect currently?',
 "Hello, I noticed that my phone service recently stopped. I'm able to connect to wifi just fine but whenever I use 3G anything that uses internet just hangs and fails to load. Can you help me with this problem?",
 'I can use wifi but phone service is dead',
 'Hi my cell reception is not picking up. I have wifi for now but have a lot to do and need to see what

In [7]:
# Optional clustering-quality check (currently disabled): silhouette score of
# the labels assigned during km.fit, using Euclidean distance on X.
# NOTE(review): the silhouette score is based on pairwise distances, which may
# be slow or memory-hungry at this sample count -- consider the sample_size
# argument of silhouette_score if this is re-enabled.
#labels = km.labels_
#metrics.silhouette_score(X, labels, metric='euclidean')