In [1]:
import nltk
from nltk.tokenize import wordpunct_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import numpy as np
from numpy.linalg import svd, matrix_rank, norm
import time
import copy

import sklearn
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering, MeanShift
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

from nltk.cluster import KMeansClusterer, euclidean_distance, cosine_distance
# Reaching this line means every import above resolved without raising.
print("Imported all libraries succesfully")
# Optional: uncomment to seed NumPy's global random generator so that code
# drawing from np.random later in the notebook becomes repeatable.
#np.random.seed(9)

Imported all libraries succesfully


In [2]:
# -- Step 1 -- Cleaning and pre-processing all reviews
# Read raw text from the review files
sentences = list()
path = "product_reviews/"
products = ["Canon_PowerShot_SD500", "Canon_S100", "Diaper_Champ", "Hitachi_router", "ipod", "Linksys_Router", "MicroMP3", "Nokia_6600", "norton"]

# Split raw text in sentences
for p in products:
    # The context manager closes each review file as soon as it has been read,
    # instead of leaving one open handle per product behind.
    with open(path+p+'.txt', 'r') as review_file:
        for line in review_file:
            stripped = line.strip()
            # "[t]" lines are markers, not review sentences, so they are dropped.
            if stripped == "[t]":
                continue
            # Keep only the text after the first "##"; lines that have no
            # "##" at all are skipped.
            _, separator, text = stripped.partition("##")
            if separator:
                sentences.append(text)

# Accumulators filled by the tokenizing pass below
token_sentences = list()
fdist = FreqDist()
lem = WordNetLemmatizer()
# English stop words plus 'u', which many reviewers write instead of 'you'
stop_words = set(stopwords.words('english')) | {'u'}

# Tokenizing raw text in the sentences
for raw_sentence in sentences:
    kept_tokens = []
    for token in wordpunct_tokenize(raw_sentence):
        lowered = token.lower()
        # Drop stop words first, then anything that is not purely
        # alphanumeric or that is purely numeric.
        if lowered in stop_words:
            continue
        if not lowered.isalnum() or lowered.isnumeric():
            continue
        lemma = lem.lemmatize(lowered)
        # Single-character leftovers are discarded after lemmatizing.
        if len(lemma) > 1:
            kept_tokens.append(lemma)

    token_sentences.append(kept_tokens)
    fdist.update(kept_tokens)

# Getting the target and pseudo words
# Targets are the 50 most frequent tokens; each pseudo word is its target spelled backwards.
target_words = [word for word, _ in fdist.most_common(50)]
pseudo_words = [word[::-1] for word in target_words]

# Create position index for terms
# terms lists every target word first, followed by the pseudo words in the same order.
terms = np.array(target_words+pseudo_words, dtype=object)
position_index_init = {k:[] for k in terms}
# One pass over the corpus with a set lookup replaces a full rescan per target
# word. Each list still holds (sentence index, token index) pairs in reading
# order, and the pseudo-word entries stay empty at this stage.
target_lookup = set(target_words)
for i, s in enumerate(token_sentences):
    for j, word in enumerate(s):
        if word in target_lookup:
            position_index_init[word].append((i, j))

print("Succesfully read reviews for: " + str(products))
print("Succesfully selected top words and created their pseudo words")
print("Top 50 words: " + str(target_words))

Succesfully read reviews for: ['Canon_PowerShot_SD500', 'Canon_S100', 'Diaper_Champ', 'Hitachi_router', 'ipod', 'Linksys_Router', 'MicroMP3', 'Nokia_6600', 'norton']
Succesfully selected top words and created their pseudo words
Top 50 words: ['one', 'ipod', 'use', 'phone', 'get', 'router', 'camera', 'player', 'like', 'great', 'time', 'battery', 'work', 'problem', 'good', 'diaper', 'product', 'would', 'zen', 'also', 'computer', 'well', 'really', 'feature', 'quality', 'take', 'easy', 'even', 'thing', 'micro', 'first', 'need', 'used', 'want', 'much', 'better', 'creative', 'software', 'go', 'picture', 'little', 'bag', 'music', 'sound', 'buy', 'still', 'mp3', 'make', 'song', 'review']


In [3]:
def construct_feature_vector(terms, vocab, documents, position_index, window):
    """Count the words that occur within `window` tokens of every indexed term.

    Parameters
    ----------
    terms : sequence of str
        Row labels; row ``i`` of the matrix belongs to ``terms[i]``.
    vocab : sequence of str
        Column labels before pruning. Every word that falls inside a context
        window must be listed here, otherwise a KeyError is raised.
    documents : indexable collection of token sequences
        ``documents[s][t]`` is the token at position ``t`` of sentence ``s``.
    position_index : dict
        Maps a term to a list of ``(sentence index, token index)`` occurrences.
        Every key that has occurrences must be present in ``terms``.
    window : int
        Number of tokens considered on each side of an occurrence.

    Returns
    -------
    term_context_final : numpy.ndarray
        Float matrix with ``len(terms)`` rows. Columns that received no count
        at all are removed; the remaining columns keep their ``vocab`` order.
    context_labels : dict
        Term -> row index.
    context_vocab : dict
        Word -> column index in the matrix *before* the empty columns were
        removed, so it does not index ``term_context_final`` directly.
    """
    context_labels = {term: row for row, term in enumerate(terms)}
    context_vocab = {word: col for col, word in enumerate(vocab)}

    # Size the matrix from the inputs rather than from the de-duplicated
    # dicts: the ids above come from enumerate(), so a repeated term or word
    # would otherwise map to a row/column beyond the allocated matrix.
    term_context = np.zeros((len(terms), len(vocab)))

    for key_word, locations in position_index.items():
        if not locations:
            continue
        row = context_labels[key_word]
        for sent_idx, tok_idx in locations:
            sentence = documents[sent_idx]
            # Clip the window to the sentence boundaries.
            first = max(0, tok_idx - window)
            last = min(len(sentence), tok_idx + window + 1)
            for j in range(first, last):
                # The occurrence itself is not part of its own context.
                if j != tok_idx:
                    term_context[row, context_vocab[sentence[j]]] += 1

    # Keep only the columns that were counted at least once.
    used_columns = term_context.any(axis=0)
    term_context_final = term_context[:, used_columns]
    return term_context_final, context_labels, context_vocab

def run(testing_number_, window_, verbose_, seed_=None):
    """Repeat the pseudo-word clustering experiment and score every repetition.

    Each repetition renames a random half of every target word's occurrences
    to its pseudo word (the word reversed), builds the term/context count
    matrix, reduces it with truncated SVD followed by normalisation, clusters
    the rows with k-means and records the fraction of target/pseudo pairs that
    ended up in the same cluster.

    Reads the notebook-level names ``terms``, ``token_sentences``,
    ``position_index_init``, ``fdist`` and ``pseudo_words`` and calls
    ``construct_feature_vector``; none of them is modified.

    Parameters
    ----------
    testing_number_ : int
        Number of repetitions.
    window_ : int
        Context window size handed to ``construct_feature_vector``.
    verbose_ : bool
        When truthy, print the cluster labels and the score of every repetition.
    seed_ : int or None, optional
        ``None`` (default) keeps the previous behaviour: sampling draws from
        NumPy's global generator and the estimators use their own defaults.
        An integer creates one private generator that is used for the
        sampling, the SVD and k-means, which makes the whole call repeatable.

    Returns
    -------
    performance : list of float
        One score per repetition.
    timer : list of float
        Wall-clock seconds spent in each repetition.
    """
    # -- Step 6 -- Repeating the process testing_number times
    testing_number = testing_number_
    window = window_
    verbose = verbose_

    # terms holds the target words followed by their pseudo words, so the
    # label of pair i sits at positions i and n_pairs + i.
    n_pairs = len(terms) // 2

    rng = np.random if seed_ is None else np.random.RandomState(seed_)
    estimator_state = None if seed_ is None else rng

    # Adding all words in the documents including the pseudo_words to the vocabulary.
    # The vocabulary does not change between repetitions, so it is built once.
    vocab = list(dict.fromkeys(list(fdist.keys()) + pseudo_words))

    performance = []
    timer = []
    for testing in range(testing_number):
        loop_time = time.time()
        # -- Step 2 -- Sampling half of the occurrences in the corpus with the pseudo words --
        # Fresh working copies for this repetition. Tokens and (sentence, token)
        # tuples are immutable, so copying the lists themselves is enough.
        documents = [list(sentence) for sentence in token_sentences]
        position_index = {k: list(v) for k, v in position_index_init.items()}

        # The loop stops at the first key whose reverse is the very first key,
        # i.e. as soon as it reaches the pseudo word of the first target.
        first_key = next(iter(position_index), None)
        for k in list(position_index):
            pseudo = k[::-1]
            if pseudo == first_key:
                break
            occurrences = position_index[k]
            # Uniformly sample half of the occurrences, without replacement
            pseudo_sample = rng.choice(range(0, len(occurrences)), size=int(len(occurrences)/2), replace=False)
            sampled_idx = pseudo_sample.tolist()
            sampled_set = set(sampled_idx)
            moved = [occurrences[i] for i in sampled_idx]
            for sent_idx, tok_idx in moved:
                documents[sent_idx][tok_idx] = pseudo
            # The target keeps the occurrences that were not sampled; the
            # pseudo word receives the sampled ones.
            position_index[k] = [loc for i, loc in enumerate(occurrences) if i not in sampled_set]
            position_index[pseudo].extend(moved)

        # -- Step 3 -- Constructing the feature context matrix --
        # Creating the feature context matrix based on the vocabulary and the indexed terms (labels)
        term_context, _, _ = construct_feature_vector(terms, vocab, documents, position_index, window)

        # Apply LSI technique, with truncated SVD calculations and a normalizer
        svd_step = TruncatedSVD(n_components=100, random_state=estimator_state)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd_step, normalizer)
        transformed = lsa.fit_transform(term_context)

        # -- Step 4 -- Applying clustering on the feature context matrix --
        km = KMeans(n_clusters=50, random_state=estimator_state).fit(transformed)
        labels = km.labels_

        # -- Step 5 -- Checking clusters --
        target_labels = labels[:n_pairs]
        pseudo_labels = labels[n_pairs:2 * n_pairs]
        # Score: share of pairs whose target and pseudo word share a cluster.
        performance.append(np.sum(target_labels == pseudo_labels) / n_pairs)
        timer.append(time.time() - loop_time)
        if verbose:
            print("---- Labels for run " + str(testing+1) +" ----")
            print("Target: " + str(target_labels))
            print("Pseudo: " + str(pseudo_labels))
            print("Performance: " + str(performance[testing]))
    return performance, timer


In [15]:
start_time = time.time()
# Experiment settings: number of repetitions, context window size, per-run logging
testing_number, window, verbose = 5, 1, True
performance, timer = run(testing_number, window, verbose)

# Summary of the settings, the scores and the timings
print("Info and hyperparameters:")
print("| Ran {} times".format(len(performance)))
print("| Window size {}".format(window))
print("Performance:")
print("| " + str(np.mean(performance)) + "  mean")
print("| " + str(np.std(performance)) + "  std")
print("Time:")
print("| " + str(time.time() - start_time) + "  total seconds")
print("| " + str(np.mean(timer)) + "  mean seconds for loops")

---- Labels for run 1 ----
Target: [22  5 24 31  1 20 14  9 45  3 39 15  4 10 47 19 40 38  8 42  0 25 11 11
  2 46 16 28 33  7 34 36 29 32 12 30  7 35 21 13 41 26  6 13 43 11 23 27
 17 18]
Pseudo: [22  5 24 31  1 20 14  9 45  3 39 15  4 10 47 19 48 38  8 44  0 25 33 11
  2 46 16 28 33  7 34 36 29 37 12 30  7 35 21 13 23 26  6 13 43 49 23 27
 17 18]
Performance: 0.88
---- Labels for run 2 ----
Target: [23  5  9 21 48 37 24 15 14 22 49 10 17 11 22  8 32 18  4 46 26 12 45  0
 13 36 16 42 25  3 29 21 43 40 28  2  3  1 38 41 19 33 44 41 31 27  7 47
 35 20]
Pseudo: [23  5  9 21 48 37 24 15 14 22 49 10 17 11 22  8 32 18  4 39 26 12 45  0
 13 36 16 42 25  3 29 48 30 40 28  2  3  1 38  6 19 33 44 41 31 34  7 47
 35 20]
Performance: 0.9
---- Labels for run 3 ----
Target: [ 1  4 13 28  3 24 14  6  8 11 38 25 15 27 16 18 48 30 12 43 20 45 35  2
 23 31 10 33 35  0 21 47 37 41 22 29  0 34 26  9 32 19  5  9 42  1  7 36
 40 17]
Pseudo: [ 1  4 13 28  3 24 14  6  8 11 38 25 15 27 16 18 48 30 12 10 20 45