In [1]:
import heapq
import pickle
from functools import reduce

import numpy as np
import pandas as pd
import spacy
from scipy.spatial.distance import cosine, mahalanobis
from sklearn.cluster import KMeans

# Load spaCy's English pipeline through the 'en' shortcut name; ret_noun() calls
# this object to tokenize and POS-tag review text.
nlp = spacy.load('en')

In [2]:
# spaCy's English stop-word collection; ret_noun() consults it when filtering tokens.
STOP_WORDS = spacy.lang.en.STOP_WORDS

## Subsetting Data

This initial test uses only the reviews of Vitamin D3 supplements:

In [3]:
# Read the pipe-delimited reviews file into a DataFrame.
data = pd.read_table('./amzn_reviews/reviews_data_clean', delimiter='|')

In [5]:
# Keep only the rows whose categories_clean value exactly equals the Vitamin D3 category path.
vit_d3 = data.query('categories_clean ==  "Health & Personal Care, Vitamins & Dietary Supplements, Vitamins, Vitamin D, D3"')

## Class-Level: Cluster on the Set of Nouns

In [6]:
def ret_noun(blob):
    '''
    input:  text blob
    output: list of spacy Token objects that were tagged as nouns
    notes:  the blob is lower-cased before it is parsed; only nouns whose text is at
            least 3 characters long and not in spacy stop words are returned
    '''
    blob = blob.lower()
    doc = nlp(blob)

    # helper function so i can keep stringing on conditions like length and no stop words
    def acceptable(token):
        # Compare the token's text, not the Token object: STOP_WORDS holds plain
        # strings, so testing a Token object against it never matches and the
        # stop-word filter would silently let everything through.
        word = token.text
        return len(word) > 2 and word not in STOP_WORDS

    return [token for token in doc if token.pos_ == 'NOUN' and acceptable(token)]

In [7]:
def load_glove(filename, dim=None):
    '''
    input:  path to glove file (plain text, one "<word> <v1> ... <vN>" entry per line,
            no header line)
            dim (optional): number of vector components per line; when omitted it is
            inferred from the first non-empty line
    output: dictionary mapping word to embedding (numpy float array)
    notes:  the file is read as UTF-8 regardless of the platform default encoding.
            the vector is taken from the END of each line, so a key that itself
            contains spaces (e.g. ". . .") is kept intact instead of making the
            float conversion fail.
    raises: ValueError if a line has too few fields for the expected dimension or a
            vector component is not numeric
    '''
    tmpDict = {}
    with open(filename, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            line_list = line.rstrip().split(" ")
            # blank lines (or a lone token with no vector) carry no embedding
            if len(line_list) < 2:
                continue
            if dim is None:
                # assumes the first entry's key has no embedded spaces
                dim = len(line_list) - 1
            if len(line_list) <= dim:
                raise ValueError(
                    'line %d of %s has %d fields, expected a key plus %d values'
                    % (line_no, filename, len(line_list), dim)
                )
            # everything before the last `dim` fields is the (possibly multi-token) key
            key = " ".join(line_list[:-dim])
            # convert list of values to numpy array and add to dictionary
            tmpDict[key] = np.array(line_list[-dim:], dtype='float')

    return tmpDict

In [8]:
# load glove: build the word -> embedding dictionary with load_glove()
glove_dict = load_glove('./emb/glove.840B.300d.txt')

In [10]:
# take nouns only
%time test = (vit_d3.reviewText.apply(lambda x : ret_noun(x)))

# take the set of all words
test_set = reduce(lambda x, y : x | y, test.apply(lambda x : set(x)))

# list of words
words_avail = list(glove_dict.keys())

# list of glove arrays from words
#test_glove = [glove_dict[w.string] for w in test_set if w.string in words_avail]

CPU times: user 14min 43s, sys: 17.7 s, total: 15min
Wall time: 2min 30s


In [None]:
# computing test_glove is slow, so persist it to disk as a pickle
with open('test_glove', 'wb') as out_file:
    pickle.dump(test_glove, out_file)

In [8]:
# to load the pickle
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# 'test_glove' file that was written by the dump cell of this notebook.
# the with-block closes the file handle, which the bare open() call never did
with open("test_glove", "rb") as fp:
    test_glove = pickle.load(fp)

In [None]:
# cluster the noun embeddings into 20 groups; random_state is pinned so that
# re-running the notebook reproduces the same cluster centers
kmeans = KMeans(n_clusters=20, random_state=0).fit(test_glove)

In [None]:
# cosine distance
# collects the 5 glove words closest to each cluster center to aid interpretation
# (each inner list is ordered nearest first)
cluster_term = []
for cluster in kmeans.cluster_centers_:
    # score every glove word against this center
    scored = ((cosine(v, cluster), k) for k, v in glove_dict.items())
    # drop NaN/inf distances (e.g. from an all-zero vector) so they cannot
    # corrupt the ordering used by nsmallest
    finite = (pair for pair in scored if np.isfinite(pair[0]))
    # nsmallest keeps the 5 smallest distances seen over the whole vocabulary;
    # tracking only a running minimum would instead record whichever words
    # happened to set a new minimum in dict order, not the 5 nearest
    nearest_k = [k for _, k in heapq.nsmallest(5, finite)]
    cluster_term.append(nearest_k)

print(cluster_term)

In [None]:
# euclidean distance: worthless
# collects the 5 glove words closest to each cluster center
# (each inner list is ordered nearest first)
cluster_term = []
for cluster in kmeans.cluster_centers_:
    # score every glove word against this center
    scored = ((np.sqrt(np.sum((v - cluster)**2)), k) for k, v in glove_dict.items())
    # drop NaN/inf distances so they cannot corrupt the ordering used by nsmallest
    finite = (pair for pair in scored if np.isfinite(pair[0]))
    # nsmallest keeps the 5 smallest distances seen over the whole vocabulary;
    # tracking only a running minimum would instead record whichever words
    # happened to set a new minimum in dict order, not the 5 nearest
    nearest_k = [k for _, k in heapq.nsmallest(5, finite)]
    cluster_term.append(nearest_k)

print(cluster_term)