### Project Dependencies

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
import sys
from operator import add
from pyspark import SparkContext
import re
from wordfreq import word_frequency
import collections
from gensim.models import Word2Vec
import csv
import nltk
from nltk.collocations import *

import unidecode
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet as wn



### Load and Format Data

In [2]:
outfile = open('data.txt', 'w')

with open('satisfaction.csv') as csvDataFile:
    csvReader = csv.reader(csvDataFile)
    for row in csvReader:
        outfile.write("%s\n" % row[2])

tokenizer = RegexpTokenizer(r'\w+')

def read_words(words_file):
    return [word.lower() for line in open(words_file, 'r') for word in tokenizer.tokenize(line)]

words = read_words('data.txt')

stop_words = set(stopwords.words('english'))
stop_words.add('sanitized')
filtered_words = [w for w in words if not w in stop_words]

In [3]:
print(len(words))
print(len(filtered_words))

14167
7601


### Combine Frequent Bigrams

In [4]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(filtered_words, window_size=2)
finder.apply_freq_filter(1)
top_bigrams = finder.nbest(bigram_measures.likelihood_ratio, 20)
print(top_bigrams)

[('easy', 'use'), ('customer', 'service'), ('ease', 'use'), ('user', 'friendly'), ('tech', 'support'), ('question', 'types'), ('customer', 'support'), ('great', 'customer'), ('survey', 'tool'), ('great', 'tool'), ('easy', 'learn'), ('survey', 'monkey'), ('already', 'recommended'), ('row', 'per'), ('e', 'g'), ('top', 'notch'), ('quality', 'product'), ('panel', 'triggers'), ('much', 'easier'), ('reasonably', 'priced')]


In [5]:
combined_words = [w for w in words if not w in stop_words]
combined_words = filtered_words
numB = len(filtered_words) - 1
count = 0
for i in range(numB):
    bigram = (filtered_words[i], filtered_words[i+1])
    if bigram in top_bigrams:
        j = i - count
        k = (i + 1) - count
        combined_words[j:k] = ['_'.join(bigram)]
        count = count + 1

In [6]:
combined_words[:10]

['reasons',
 'score',
 'excellent',
 'platform',
 'highly',
 'responsive',
 'customer_service',
 'service',
 'variety',
 'questions']

### SIP Scores
An alternative method that is a bit faster for smaller datasets than using Spark. Note that dividing by "num_words" normalizes the SIP score, so that any word with SIP = 1 is used in the corpus just as often as in normal english language. A word with SIP = 5 is used 5 times as often in the corpus as it is in normal english.

In [7]:
num_words = len(combined_words)
text = nltk.Text(combined_words)
uniq_words = list(set(combined_words))
word_pool = []
word_count = []
word_SIP = []
for word in uniq_words:
    c = text.count(word)
    # only interested in words that occur more than 5 times
    if c < 6:
        continue
    word_pool.append(word)
    freq = word_frequency(word, 'en')
    if freq == 0:
        freq = float(.00001)
    # normalize the SIP scores
    word_SIP.append(c/num_words/freq)
    word_count.append(c)

word_pool_sorted = [x for _,x in sorted(zip(word_SIP, word_pool), reverse = True)]
word_count_sorted = [x for _,x in sorted(zip(word_SIP, word_count), reverse = True)]
word_SIP_sorted = word_SIP
word_SIP_sorted.sort(reverse = True)
SIPscoreslist = list(zip(word_pool_sorted, word_SIP_sorted))

In [9]:
SIPscoreslist[:20]

[('qualtrics', 1881.3314037626626),
 ('easy_use', 1499.8026575450597),
 ('customer_service', 1052.493093014077),
 ('surveys', 848.5417258661168),
 ('intuitive', 765.7775240465305),
 ('functionality', 731.609347717143),
 ('ease_use', 710.432837784502),
 ('user_friendly', 447.30956453098275),
 ('responsive', 393.74996348550434),
 ('customer', 365.2187140363602),
 ('ease', 352.59854662620455),
 ('tool', 333.7119639702683),
 ('survey', 318.09068592290186),
 ('triggers', 304.3489283636335),
 ('customer_support', 302.59176424154714),
 ('survey_tool', 236.81094592816734),
 ('great_tool', 236.81094592816734),
 ('robust', 225.46161822435596),
 ('capabilities', 219.45655741626007),
 ('tech_support', 184.18629127746348)]

### Word2Vec
Word2Vec feels very unstable to me, at least with this few responses. 

In [10]:
sentences = [combined_words]
model = Word2Vec(sentences, size=300, window=4, min_count=1, workers=4)

In [11]:
# most similar words according to Word2Vec
print(model.wv.most_similar(positive=['qualtrics'], topn=5))
print(model.wv.most_similar(positive=['easy_use'], topn=5))
print(model.wv.most_similar(positive=['tool'], topn=5))

[('support', 0.9393998980522156), ('service', 0.9318608641624451), ('survey', 0.9308834671974182), ('easy', 0.9308336973190308), ('use', 0.9290825724601746)]
[('customer_service', 0.924868106842041), ('service', 0.9244619607925415), ('support', 0.9243979454040527), ('qualtrics', 0.9235543608665466), ('use', 0.9222701191902161)]
[('use', 0.9071529507637024), ('qualtrics', 0.9061402082443237), ('customer_service', 0.9014229774475098), ('survey', 0.9013200998306274), ('easy', 0.8990861177444458)]


In [12]:
print(model.wv.similarity('customer_support', 'customer_service'))
print(model.wv.similarity('customer', 'support'))
print(model.wv.similarity('affordable', 'cost'))
print(model.wv.similarity('cost','price'))

0.860642877298
0.922574525181
0.290757756691
0.502701048523


### Synonymns

In [25]:
# this takes longer...so let's limit to top 30
NUM_TOPICS = 30
top_words = word_pool_sorted[:NUM_TOPICS]
synonyms = []
for word in top_words:
    word_synonyms = []
    for syn in wn.synsets(word):
        for l in syn.lemmas():
            word_synonyms.append(l.name())
    synonyms.append(list(set(word_synonyms[:3])))

In [9]:
synonyms

[[],
 [],
 [],
 ['study', 'sketch', 'survey'],
 ['intuitive', 'visceral', 'nonrational'],
 [],
 ['functionality'],
 [],
 ['antiphonal', 'responsive', 'reactive'],
 ['customer', 'client'],
 ['ease', 'simplicity', 'simpleness', 'easiness'],
 ['instrument', 'tool', 'creature'],
 ['study', 'sketch', 'survey'],
 ['gun_trigger', 'trigger'],
 [],
 [],
 [],
 ['capability', 'capacity', 'capableness'],
 ['rich', 'robust', 'racy', 'full-bodied'],
 [],
 [],
 ['elastic', 'flexile', 'flexible'],
 ['reportage', 'coverage', 'reporting', 'report'],
 ['flexibility', 'flexibleness'],
 ['user_interface', 'interface'],
 ['favorable', 'friendly'],
 ['stand_out', 'excel', 'surpass'],
 ['ware', 'product', 'merchandise'],
 ['political_platform', 'platform', 'political_program'],
 []]

### Base "Noun" Clusters

In [26]:
stemmer = SnowballStemmer('english')
clusters = []
blacklist = {}
for i, parent_word in enumerate(SIPscoreslist):
    if parent_word[0] not in blacklist:
        # this is trying to account for "survey" and "surveys"
        if stemmer.stem(parent_word[0]) not in blacklist:
            
            # begin the cluster with parent_word
            cluster = []
            blacklist[parent_word[0]] = 1
            cluster.append(parent_word[0])
            
            # add 6 similar words from Word2Vec
            top_sim = [p[0] for p in model.wv.most_similar(positive=parent_word[0], topn=20)] # top 20
            top_sim = [w for w in top_sim if w not in blacklist] # remove any in blacklist
            top_sim = [w for w in top_sim if stemmer.stem(w) not in blacklist]
            for word in top_sim[:6]: # take the top 6 not in blacklist
                cluster.append(word)
                blacklist[word] = 1
            
            # add 3 similar words from WordNet synonyms
            top_sim = synonyms[i]
            for word in top_sim:
                if word not in blacklist:
                    if stemmer.stem(word) not in blacklist:
                        cluster.append(word)
                        blacklist[word] = 1
            
            # finish the cluster
            clusters.append(cluster)
    # force it to stop if we run past NUM_TOPICS
    if i == NUM_TOPICS - 1:
        break

In [36]:
# print(model.wv.most_similar(positive=['qualtrics'], topn=10))
# print(model.wv.most_similar(positive=['user_friendly'], topn=10))
# print(model.wv.most_similar(positive=['great_tool'], topn=10))

In [27]:
print(len(clusters))
print(clusters[0])
print(clusters[1])
print(clusters[2])
print(clusters[3])
print(clusters[10])

20
['qualtrics', 'support', 'service', 'survey', 'easy', 'use', 'customer_service']
['easy_use', 'features', 'great', 'customer', 'product', 'like', 'software']
['intuitive', 'tool', 'time', 'good', 'ease_use', 'reporting', 'nonrational']
['functionality', 'need', 'work', 'get']
['robust', 'help', 'racy', 'full-bodied']


### Descriptive "Adjective" Clusters

In [28]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
descriptions = []

# this is a bigram filter to remove bigrams with words in the blacklist
def create_myfilter(parent):
    def bigram_filter(w1, w2):
        if w1 == parent:
            return w2 in blacklist
        if w2 == parent:
            return w1 in blacklist
    return bigram_filter

# for every main/noun cluster....
for i, cluster in enumerate(clusters):
    # make a copy of the text, and replace 
    # all cluster words with the parent_word of that cluster
    parent_word = cluster[0]
    similar_words = cluster[1:]
    words_copy = [parent_word if w in similar_words else w for w in combined_words]
    
    # find top 10 bigrams containing the parent_word
    # and NOT contains blacklist words
    finder = BigramCollocationFinder.from_words(words_copy, window_size=3)
    parent_filter = lambda *w: parent_word not in w   
    blacklist_filter = create_myfilter(parent_word)
    finder.apply_ngram_filter(parent_filter)      # bigram must contain parent_word
    finder.apply_ngram_filter(blacklist_filter)   # bigram does not contain blacklist words
    finder.apply_freq_filter(3)                   # bigram occurs at least 3 times
    best_bigrams = finder.nbest(bigram_measures.likelihood_ratio, 10)
    adj = []
    list(adj.extend(row) for row in best_bigrams)
    l = [w for w in adj if w != parent_word]
    descriptions.append(adj)

for i,cluster in enumerate(clusters):
    descriptions[i] = [w for w in descriptions[i] if w != cluster[0]]

In [29]:
print(descriptions[0])
print(descriptions[1])
print(descriptions[2])

['affordable', 'tech', 'always', 'learn', 'versatile', 'fairly', 'monkey', 'allows', 'let', 'staff']
['quality', 'adding', 'layout', 'documentation', 'centric', 'collaboration', 'efficient', 'overall', 'wish', 'pleased']
['functions', 'still', 'money', 'surveys', '360', 'outstanding', 'tools', 'lot', 'research', 'improvement']


### Cluster Titles

In [19]:
# def create_clustfilter(clust):
#     def bigram_filter(w1, w2):
#         return w2 not in clust and w1 not in clust
#     return bigram_filter

# topics = []
# for i in range(len(clusters)):
#     pool = clusters[i] + descriptions[i]
#     finder = BigramCollocationFinder.from_words(combined_words, window_size=2)
#     clust_filter = create_clustfilter(pool)  
#     topic_filter = lambda w1, w2: (w1, w2) in set(topics)
#     finder.apply_ngram_filter(clust_filter)
#     finder.apply_ngram_filter(topic_filter)
#     topics.append(finder.nbest(bigram_measures.likelihood_ratio, 1)[0])

### Save Topics

In [30]:
final_topics = []
for i,clust in enumerate(clusters):
    adj = descriptions[i]
    if len(adj) == 0:
        topicName = clust[0]
        queryParams = [topicName, clust, []]
    else:
        topicName = " ".join([clust[0], adj[0]])
        queryParams = [topicName, clust, adj]
    final_topics.append(queryParams)

In [31]:
final_topics[:3]

[['qualtrics affordable',
  ['qualtrics',
   'support',
   'service',
   'survey',
   'easy',
   'use',
   'customer_service'],
  ['affordable',
   'tech',
   'always',
   'learn',
   'versatile',
   'fairly',
   'monkey',
   'allows',
   'let',
   'staff']],
 ['easy_use quality',
  ['easy_use', 'features', 'great', 'customer', 'product', 'like', 'software'],
  ['quality',
   'adding',
   'layout',
   'documentation',
   'centric',
   'collaboration',
   'efficient',
   'overall',
   'wish',
   'pleased']],
 ['intuitive functions',
  ['intuitive',
   'tool',
   'time',
   'good',
   'ease_use',
   'reporting',
   'nonrational'],
  ['functions',
   'still',
   'money',
   'surveys',
   '360',
   'outstanding',
   'tools',
   'lot',
   'research',
   'improvement']]]

In [33]:
outfile = open('topicsKaite_var4.txt', 'w')

for topic in final_topics[:10]:
    outfile.write("%s\n" % topic[0])
    outfile.write("%s\n" % topic[1:])