### Project Dependencies

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import pos_tag
import sys
from operator import add
from pyspark import SparkContext
import re
from wordfreq import word_frequency
import collections
from gensim.models import Word2Vec
import csv
import nltk
from nltk.collocations import *

import unidecode
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet as wn



### Load and Format Data

In [2]:
outfile = open('data.txt', 'w')

with open('satisfaction.csv') as csvDataFile:
    csvReader = csv.reader(csvDataFile)
    for row in csvReader:
        outfile.write("%s\n" % row[2])

tokenizer = RegexpTokenizer(r'\w+')

def read_words(words_file):
    return [word.lower() for line in open(words_file, 'r') for word in tokenizer.tokenize(line)]

words = read_words('data.txt')

stop_words = set(stopwords.words('english'))
stop_words.add('sanitized')
filtered_words = [w for w in words if not w in stop_words]

In [3]:
print(len(words))
print(len(filtered_words))

14167
7601


### Combine Frequent Bigrams

In [3]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(filtered_words, window_size=2)
finder.apply_freq_filter(4)
top_bigrams = finder.nbest(bigram_measures.likelihood_ratio, 20)
print(top_bigrams)

[('easy', 'use'), ('customer', 'service'), ('ease', 'use'), ('user', 'friendly'), ('tech', 'support'), ('question', 'types'), ('customer', 'support'), ('great', 'customer'), ('survey', 'tool'), ('great', 'tool'), ('easy', 'learn'), ('survey', 'monkey'), ('already', 'recommended'), ('row', 'per'), ('quality', 'product'), ('panel', 'triggers'), ('much', 'easier'), ('one', 'row'), ('table', 'one'), ('great', 'product')]


In [4]:
combined_words = [w for w in words if not w in stop_words]
combined_words = filtered_words
numB = len(filtered_words) - 1
count = 0
for i in range(numB):
    bigram = (filtered_words[i], filtered_words[i+1])
    if bigram in top_bigrams:
        j = i - count
        k = (i + 1) - count
        combined_words[j:k] = ['_'.join(bigram)]
        count = count + 1

In [6]:
combined_words[:10]

['reasons',
 'score',
 'excellent',
 'platform',
 'highly',
 'responsive',
 'customer_service',
 'service',
 'variety',
 'questions']

### SIP Scores
An alternative method that is a bit faster for smaller datasets than using Spark. Note that dividing by "num_words" normalizes the SIP score, so that any word with SIP = 1 is used in the corpus just as often as in normal english language. A word with SIP = 5 is used 5 times as often in the corpus as it is in normal english.

In [5]:
num_words = len(combined_words)
text = nltk.Text(combined_words)
uniq_words = list(set(combined_words))
word_pool = []
word_count = []
word_SIP = []
for word in uniq_words:
    c = text.count(word)
    # only interested in words that occur more than 10 times
    if c < 11:
        continue
    word_pool.append(word)
    freq = word_frequency(word, 'en')
    if freq == 0:
        freq = float(.00001)
    # normalize the SIP scores
    word_SIP.append(c/num_words/freq)
    word_count.append(c)

word_pool_sorted = [x for _,x in sorted(zip(word_SIP, word_pool), reverse = True)]
word_count_sorted = [x for _,x in sorted(zip(word_SIP, word_count), reverse = True)]
word_SIP_sorted = word_SIP
word_SIP_sorted.sort(reverse = True)
SIPscoreslist = list(zip(word_pool_sorted, word_SIP_sorted))

In [10]:
SIPscoreslist[:15]

[('qualtrics', 1881.3314037626626),
 ('easy_use', 1499.8026575450597),
 ('customer_service', 1052.493093014077),
 ('surveys', 861.3984186822701),
 ('intuitive', 736.3245423524331),
 ('ease_use', 710.432837784502),
 ('functionality', 708.7465556009823),
 ('user_friendly', 447.30956453098275),
 ('responsive', 367.49996591980397),
 ('customer', 365.2187140363602),
 ('ease', 346.72190418243446),
 ('tool', 330.00405325948753),
 ('survey', 320.65593339002197),
 ('customer_support', 302.59176424154714),
 ('survey_tool', 236.81094592816734)]

### Word2Vec
Word2Vec feels very unstable to me, at least with this few responses. 

In [6]:
sentences = [combined_words]
model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=4)

In [23]:
# most similar words according to Word2Vec
print(model.wv.most_similar(positive=['qualtrics'], topn=5))
print(model.wv.most_similar(positive=['easy_use'], topn=5))
print(model.wv.most_similar(positive=['tool'], topn=5))

[('use', 0.9681816697120667), ('easy', 0.9637120962142944), ('easy_use', 0.9594351053237915), ('survey', 0.9586905241012573), ('service', 0.9581952691078186)]
[('qualtrics', 0.9594351053237915), ('use', 0.9579805135726929), ('easy', 0.9527338147163391), ('support', 0.9501928091049194), ('service', 0.9495858550071716)]
[('use', 0.9490972757339478), ('easy', 0.9471516013145447), ('qualtrics', 0.9438937306404114), ('great', 0.9418753385543823), ('support', 0.9384625554084778)]


In [24]:
print(model.wv.similarity('customer_support', 'customer_service'))
print(model.wv.similarity('customer', 'support'))
print(model.wv.similarity('affordable', 'cost'))
print(model.wv.similarity('cost','price'))

0.853489343449
0.930141041698
0.430844719892
0.587260808308


### Synonymns

In [7]:
# this takes longer...so let's limit to top 30
NUM_TOPICS = 30
top_words = word_pool_sorted[:NUM_TOPICS]
synonyms = []
for word in top_words:
    word_synonyms = []
    for syn in wn.synsets(word):
        for l in syn.lemmas():
            word_synonyms.append(l.name())
    synonyms.append(list(set(word_synonyms[:4])))

In [17]:
synonyms

[[],
 [],
 [],
 ['survey', 'study', 'sketch'],
 ['intuitive', 'visceral', 'nonrational'],
 [],
 ['functionality'],
 [],
 ['antiphonal', 'reactive', 'responsive'],
 ['customer', 'client'],
 ['easiness', 'simplicity', 'simpleness', 'ease'],
 ['instrument', 'tool', 'creature'],
 ['survey', 'study', 'sketch'],
 [],
 [],
 [],
 ['capability', 'capableness', 'capacity'],
 ['rich', 'racy', 'robust', 'full-bodied'],
 [],
 [],
 ['flexible', 'flexile', 'elastic'],
 ['coverage', 'report', 'reporting', 'reportage'],
 ['flexibleness', 'flexibility'],
 ['interface', 'user_interface'],
 ['favorable', 'friendly'],
 ['merchandise', 'ware', 'product'],
 ['political_platform', 'political_program', 'platform'],
 ['first-class', 'splendid', 'excellent', 'fantabulous'],
 ['easy', 'easygoing', 'leisurely'],
 ['user', 'drug_user', 'exploiter']]

### Base "Noun" Clusters

In [9]:
stemmer = SnowballStemmer('english')
clusters = []
blacklist = {}
for i, parent_word in enumerate(SIPscoreslist):
    if parent_word[0] not in blacklist:
        # this is trying to account for "survey" and "surveys"
        if stemmer.stem(parent_word[0]) not in blacklist:
            
            # begin the cluster with parent_word
            cluster = []
            blacklist[parent_word[0]] = 1
            cluster.append(parent_word[0])
            
            # add 5 similar words from Word2Vec
            top_sim = [p[0] for p in model.wv.most_similar(positive=parent_word[0], topn=20)] # top 20
            top_sim = [w for w in top_sim if w not in blacklist] # remove any in blacklist
            top_sim = [w for w in top_sim if stemmer.stem(w) not in blacklist]
            for word in top_sim[:5]: # take the top 5 not in blacklist
                cluster.append(word)
                blacklist[word] = 1
            
            # add 4 similar words from WordNet synonyms
            top_sim = synonyms[i]
            for word in top_sim:
                if word not in blacklist:
                    if stemmer.stem(word) not in blacklist:
                        cluster.append(word)
                        blacklist[word] = 1
            
            # finish the cluster
            clusters.append(cluster)
    # force it to stop if we run past NUM_TOPICS
    if i == NUM_TOPICS - 1:
        break

In [36]:
# print(model.wv.most_similar(positive=['qualtrics'], topn=10))
# print(model.wv.most_similar(positive=['user_friendly'], topn=10))
# print(model.wv.most_similar(positive=['great_tool'], topn=10))

In [27]:
print(len(clusters))
print(clusters[0])
print(clusters[1])
print(clusters[2])
print(clusters[3])
print(clusters[10])

18
['qualtrics', 'use', 'easy', 'easy_use', 'survey', 'service']
['customer_service', 'customer', 'support', 'good', 'great', 'product']
['intuitive', 'like', 'friendly', 'ease', 'tool', 'user', 'visceral', 'nonrational']
['ease_use', 'features', 'time', 'data']
['capabilities', 'much', 'capability', 'capableness', 'capacity']


### Descriptive "Adjective" Clusters

In [10]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
descriptions = []

# this is a bigram filter to remove bigrams with words in the blacklist
def create_myfilter(parent):
    def bigram_filter(w1, w2):
        if w1 == parent:
            return w2 in blacklist
        if w2 == parent:
            return w1 in blacklist
    return bigram_filter

# for every main/noun cluster....
for i, cluster in enumerate(clusters):
    # make a copy of the text, and replace 
    # all cluster words with the parent_word of that cluster
    parent_word = cluster[0]
    similar_words = cluster[1:]
    words_copy = [parent_word if w in similar_words else w for w in combined_words]
    
    # find top 10 bigrams containing the parent_word
    # and NOT contains blacklist words
    finder = BigramCollocationFinder.from_words(words_copy, window_size=5)
    parent_filter = lambda *w: parent_word not in w   
    blacklist_filter = create_myfilter(parent_word)
    finder.apply_ngram_filter(parent_filter)      # bigram must contain parent_word
    finder.apply_ngram_filter(blacklist_filter)   # bigram does not contain blacklist words
    finder.apply_freq_filter(3)                   # bigram occurs at least 3 times
    best_bigrams = finder.nbest(bigram_measures.likelihood_ratio, 10)
    adj = []
    list(adj.extend(row) for row in best_bigrams)
    l = [w for w in adj if w != parent_word]
    descriptions.append(adj)

for i,cluster in enumerate(clusters):
    descriptions[i] = [w for w in descriptions[i] if w != cluster[0]]

In [11]:
print(descriptions[0])
print(descriptions[1])
print(descriptions[2])

['lot', 'affordable', 'enables', 'versitile', 'versatile', 'fairly', 'wide', 'sophisticated', 'relatively', 'provides']
['accessibility', 'restrictive', 'monkey', 'row_per', 'package', 'sent', 'layout', 'purpose', 'improvements', 'centric']
['desk', 'organization', 'value', 'functional', 'bit', 'pretty', 'really', 'satisfied', 'quality', '8']


### Cluster Titles

In [51]:
# def create_clustfilter(clust):
#     def bigram_filter(w1, w2):
#         return w2 not in clust or w1 not in clust
#     return bigram_filter

# topics = []
# for i in range(len(clusters)):
#     pool = clusters[i] + descriptions[i]
#     finder = BigramCollocationFinder.from_words(combined_words, window_size=2)
#     clust_filter = create_clustfilter(pool)  
#     topic_filter = lambda w1, w2: (w1, w2) in set(topics)
#     finder.apply_ngram_filter(clust_filter)
#     finder.apply_ngram_filter(topic_filter)
#     topics.append(finder.nbest(bigram_measures.likelihood_ratio, 1)[0])

### Save Topics

In [12]:
final_topics = []
for i,clust in enumerate(clusters):
    adj = descriptions[i]
    if len(adj) == 0:
        topicName = clust[0]
        queryParams = [topicName, clust, []]
    else:
        topicName = " ".join([clust[0], adj[0]])
        queryParams = [topicName, clust, adj]
    final_topics.append(queryParams)

In [13]:
final_topics[:2]

[['qualtrics lot',
  ['qualtrics', 'use', 'easy_use', 'easy', 'support', 'service'],
  ['lot',
   'affordable',
   'enables',
   'versitile',
   'versatile',
   'fairly',
   'wide',
   'sophisticated',
   'relatively',
   'provides']],
 ['customer_service accessibility',
  ['customer_service', 'survey', 'customer', 'good', 'great', 'user'],
  ['accessibility',
   'restrictive',
   'monkey',
   'row_per',
   'package',
   'sent',
   'layout',
   'purpose',
   'improvements',
   'centric']]]

In [15]:
outfile = open('topicsKaite_var2.txt', 'w')

for topic in final_topics[:10]:
    outfile.write("%s\n" % topic[0])
    outfile.write("%s\n" % topic[1:])