In [1]:
import json
import numpy as np

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Location of the conversation dump, relative to the notebook's working directory.
data_file_path = "data/sample_conversations.json"

# JSON text is UTF-8 by specification; state the encoding explicitly so that
# decoding does not depend on the platform's locale default.
with open(data_file_path, encoding="utf-8") as data_file:
    raw_data = json.load(data_file)

# Flatten the nested structure into one list holding the 'Text' of every
# message of every issue, in file order.
messages = [message['Text'] for datum in raw_data['Issues'] for message in datum['Messages']]

In [3]:
# Turn the messages into a sparse TF-IDF matrix with one row per message.
# Features are word-level 1- to 3-grams with English stop words removed.
# max_df / min_df are given as floats, i.e. document-frequency proportions:
# terms appearing in a larger (max_df) or smaller (min_df) share of the
# messages are dropped from the vocabulary.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.008,
    min_df=0.0001,
)
X = vectorizer.fit_transform(messages)

# X.shape is (number of messages, vocabulary size).
print("n_samples: %d, n_features: %d" % (X.shape[0], X.shape[1]))

n_samples: 22264, n_features: 8480


In [None]:
# Disabled experiment: LSA dimensionality reduction of the TF-IDF matrix.
# If uncommented, this cell would overwrite X with the output of a
# TruncatedSVD(n_components) -> Normalizer pipeline and print the summed
# explained-variance ratio of the SVD step. Queries passed to the clustering
# model would then need the same `lsa.transform` applied to them.
#n_components = 5000

#svd = TruncatedSVD(n_components)
#lsa = make_pipeline(svd, Normalizer(copy=False))

#X = lsa.fit_transform(X)
#explained_variance = svd.explained_variance_ratio_.sum()
#print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [17]:
# Number of k-means clusters to fit on the TF-IDF matrix.
n_clusters = 3000
# Fixed seed so that centroid initialisation and mini-batch sampling are
# repeatable; without it every run yields a different clustering.
RANDOM_SEED = 42

km = MiniBatchKMeans(n_clusters=n_clusters, n_init=1, init_size=10000, batch_size=5000,
                     random_state=RANDOM_SEED)
km.fit(X)

print("Top terms per cluster:")
# For each centroid, the feature indices ordered from heaviest to lightest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()
# Show the ten strongest terms for (at most) the first 20 clusters.
for i in range(min(n_clusters, 20)):
    top_terms = ''.join(' %s' % terms[ind] for ind in order_centroids[i, :10])
    print("Cluster %d:%s" % (i, top_terms))

Top terms per cluster:
Cluster 0: welcome great day welcome great 6pm 9pm 9pm 6pm available flight available flights flights great let quickly great moment
Cluster 1: 12a disappointed good time input valuable court 20th feb salazar hi salazar help
Cluster 2: wonderful day wonderful zipcode great let insert great let service great let quickly great let pull great let know great let booked great ll
Cluster 3: rest day rest good rest day good rest ok great enjoy great enjoy great enjoy rest enjoy enjoy rest day enjoy rest
Cluster 4: perfect thank yiu yiu ok perfect ok perfect scheduled perfect scheduled scheduled sounds perfect emailing tickets emailing
Cluster 5: afternoon earliest 6p return good afternoon schedule late busy good afternoon just afternoon just
Cluster 6: welcome great day welcome great zipcode great moment great like great let service great let quickly great let pull great let know great let insert
Cluster 7: thank help ok thank great let insert great ll great like great 

In [18]:
# Cluster label for every row of X; X was built from `messages`, so label i
# belongs to messages[i].
message_predictions = km.predict(X)

# Project the query text into the same TF-IDF space as the corpus.
test_sentence = vectorizer.transform(['wifi help free'])
#test_sentence = lsa.transform(test_sentence)

# predict() returns one label per input row. Pull out the single label
# explicitly instead of comparing each message label against the whole array,
# which only works by accident of the array having exactly one element.
test_sentence_prediction = km.predict(test_sentence)
test_cluster = test_sentence_prediction[0]

# Every corpus message that landed in the same cluster as the query.
similar_messages = [message for message, cluster in zip(messages, message_predictions)
                    if cluster == test_cluster]
similar_messages

['If it is not working within 35-50min feel free to contact me again',
 'Great, feel free to sumbit the HIt so long...',
 'Seeing that you are a book lover, could we offer you a free book to help fill the bookcase?',
 'Ok we have it. I will be sure to send that along with your free delivery. Is there anything else I can assist you with?',
 'Bakr, this is werner. I apologize for this. Please feel free to submit the Hit, and just leave a note that we did not respond in sufficient time.',
 "Feel free to contact us if the issue isn't resolved by then.",
 "Feel free to contact us again if you don't see it soon.",
 'I will send you a shelf for free',
 'Ok great! I am going to send you a free shelf',
 'If we cannot locate your modem or contact you within 24 hours  then please feel free to contact us back1',
 'It retails for $675 but I can offer to you for free']

In [None]:
# Disabled: silhouette score of the fitted clustering, computed from
# km.labels_ on X with Euclidean distance.
#labels = km.labels_
#metrics.silhouette_score(X, labels, metric='euclidean')