In [5]:
import nltk
from nltk.tokenize import wordpunct_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.probability import FreqDist

import numpy as np
from numpy.linalg import svd, matrix_rank, norm
import time
import copy

import sklearn
from sklearn.cluster import KMeans, MiniBatchKMeans, SpectralClustering, MeanShift
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

from nltk.cluster import KMeansClusterer, euclidean_distance, cosine_distance
print("Imported all libraries succesfully")
#np.random.seed(0)

Imported all libraries succesfully


In [6]:
# -- Step 1 -- Cleaning and pre-processing all reviews
# Read raw text from the review files
sentences = list()
path = "product_reviews/"
products = ["Canon_PowerShot_SD500", "Canon_S100", "Diaper_Champ", "Hitachi_router", "ipod", "Linksys_Router", "MicroMP3", "Nokia_6600", "norton"]

# Split raw text in sentences
for p in products:
    # The context manager closes every review file, even when reading fails half-way
    with open(path+p+'.txt', 'r') as f:
        for line in f:
            # Keep only the text after the first "##". Lines without "##"
            # (this includes the bare "[t]" marker lines) hold no sentence and are skipped.
            annotation, marker, text = line.strip().partition("##")
            if marker:
                sentences.append(text)

fdist = FreqDist()
token_sentences = list()
stop_words = set(stopwords.words('english'))
stop_words.add('u') # Decided to also remove this as a lot of people in the reviews used it instead of 'you'
lem = WordNetLemmatizer()
stemmer = SnowballStemmer("english")

# Tokenizing raw text in the sentences
for sentence in sentences:
    tokens = wordpunct_tokenize(sentence)
    # Lower-case once, then drop stop words, punctuation and pure numbers before lemmatizing
    lowered = [t.lower() for t in tokens]
    tokens_filtered = [
        lem.lemmatize(t)
        for t in lowered
        if t not in stop_words and t.isalnum() and not t.isnumeric()
    ]
    # Single-character leftovers carry no useful context
    tokens_filtered = [t for t in tokens_filtered if len(t) > 1]

    token_sentences.append(tokens_filtered)
    fdist.update(tokens_filtered)

# Getting the target and pseudo words (a pseudo word is its target word reversed)
target_words = [x[0] for x in fdist.most_common(50)]
pseudo_words = [x[::-1] for x in target_words]

# Create position index for terms, first half with the target words, second half with the pseudo words
terms = np.array(target_words+pseudo_words, dtype=object)
position_index_init = {k:[] for k in terms}
# One pass over the corpus: record (sentence index, token index) for every target word occurrence.
# The pseudo-word entries stay empty here.
target_set = set(target_words)
for i,s in enumerate(token_sentences):
    for j,word in enumerate(s):
        if word in target_set:
            position_index_init[word].append((i, j))

print("Succesfully read reviews for: " + str(products))
print("Succesfully selected top words and created their pseudo words")
print("Top 50 words: " + str(target_words))

Succesfully read reviews for: ['Canon_PowerShot_SD500', 'Canon_S100', 'Diaper_Champ', 'Hitachi_router', 'ipod', 'Linksys_Router', 'MicroMP3', 'Nokia_6600', 'norton']
Succesfully selected top words and created their pseudo words
Top 50 words: ['one', 'ipod', 'use', 'phone', 'get', 'router', 'camera', 'player', 'like', 'great', 'time', 'battery', 'work', 'problem', 'good', 'diaper', 'product', 'would', 'zen', 'also', 'computer', 'well', 'really', 'feature', 'quality', 'take', 'easy', 'even', 'thing', 'micro', 'first', 'need', 'used', 'want', 'much', 'better', 'creative', 'software', 'go', 'picture', 'little', 'bag', 'music', 'sound', 'buy', 'still', 'mp3', 'make', 'song', 'review']


In [7]:
def construct_feature_vector(terms, vocab, documents, position_index, window_size=None):
    """
    Create a word (terms) - feature (vocab in document) co-occurrence matrix.

    For every indexed occurrence of a term, each token at most `window_size`
    positions to its left or right in the same sentence (the occurrence itself
    excluded) is stemmed with the module-level `stemmer` and counted in the
    term's row.

    Parameters
    ----------
    terms : sequence of str
        Row labels of the matrix.
    vocab : sequence of str
        Stemmed context features (candidate columns). The stem of every
        context token must be present, otherwise a KeyError is raised.
    documents : sequence of sequences of str
        Tokenized sentences, indexed by the sentence index of each location.
    position_index : dict
        Maps a term to its occurrences as (sentence index, token index) pairs.
    window_size : int, optional
        Number of tokens taken on each side of an occurrence. When omitted the
        module-level `window` is used, which is what this function always did
        before the parameter existed.

    Returns
    -------
    term_context_final : numpy.ndarray
        Count matrix with the columns that are zero for every term removed.
    context_labels : dict
        term -> row index.
    context_vocab : dict
        feature -> column index in the matrix BEFORE the all-zero columns were
        removed (so it does not index `term_context_final` directly).
    """
    if window_size is None:
        # Legacy fallback for callers that still rely on the global `window`
        window_size = window

    context_labels = {term: row for row, term in enumerate(terms)}
    context_vocab = {feature: col for col, feature in enumerate(vocab)}
    term_context = np.zeros((len(context_labels), len(context_vocab)))

    for key_word, locations in position_index.items():
        row = context_labels[key_word]
        for sent_idx, tok_idx in locations:
            sentence = documents[sent_idx]
            first = max(0, tok_idx - window_size)
            last = min(len(sentence), tok_idx + window_size + 1)
            for j in range(first, last):
                if j == tok_idx:
                    continue  # the occurrence is not its own context
                term_context[row, context_vocab[stemmer.stem(sentence[j])]] += 1

    # Drop features that never occur in any context window
    unused_columns = np.flatnonzero(~term_context.any(axis=0))
    term_context_final = np.delete(term_context, unused_columns, axis=1)
    return term_context_final, context_labels, context_vocab


def run(testing_number_, window_, verbose_, random_state=None):
    """
    Repeat the pseudo-word experiment and score every repetition (steps 2-6).

    Each repetition replaces a random half of every target word's occurrences
    with its pseudo word (the target reversed), builds the term-context matrix,
    clusters its rows with KMeans (one cluster per target word) and measures
    the fraction of target words that share a cluster with their pseudo word.

    Reads the module-level `terms`, `token_sentences`, `position_index_init`,
    `fdist`, `pseudo_words` and `stemmer`; none of them is modified.

    Parameters
    ----------
    testing_number_ : int
        Number of repetitions.
    window_ : int
        Context window size handed to `construct_feature_vector`.
    verbose_ : bool
        Print the cluster labels and the score of every repetition.
    random_state : int, optional
        Seed for the occurrence sampling and for KMeans. With None (default)
        sampling uses NumPy's global random state and KMeans is unseeded,
        exactly as before this parameter existed.

    Returns
    -------
    performance : list of float
        Score of every repetition.
    timer : list of float
        Wall-clock seconds of every repetition (the one-off vocabulary
        stemming is not included).
    """
    # The first half of `terms` holds the target words, the second half their pseudo words
    n_targets = len(terms) // 2
    target_terms = list(terms[:n_targets])

    # Without a seed keep drawing from the global RNG so np.random.seed(...) still applies
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # -- Step 3 (part 1) -- The vocabulary is identical for every repetition, so it is built once:
    # all words in the documents plus the pseudo words, stemmed and de-duplicated in order
    vocab = list(dict.fromkeys(stemmer.stem(t) for t in list(fdist.keys()) + pseudo_words))

    # -- Step 6 -- Repeating the process testing_number_ times
    performance = []
    timer = []
    for testing in range(testing_number_):
        loop_time = time.time()

        # -- Step 2 -- Sampling half of the occurrences in the corpus with the pseudo words --
        # Fresh copies per repetition so the shared corpus and index stay untouched
        documents = [list(sentence) for sentence in token_sentences]
        position_index = copy.deepcopy(position_index_init)

        for target in target_terms:
            pseudo = target[::-1]
            occurrences = position_index[target]
            # Uniformly pick half of the occurrences (rounded down) without replacement
            picked = rng.choice(len(occurrences), size=len(occurrences) // 2, replace=False)
            swapped = set(picked.tolist())
            kept = []
            for idx, (sent_idx, tok_idx) in enumerate(occurrences):
                if idx in swapped:
                    # Rewrite the token and move its location to the pseudo word
                    documents[sent_idx][tok_idx] = pseudo
                    position_index[pseudo].append((sent_idx, tok_idx))
                else:
                    kept.append((sent_idx, tok_idx))
            position_index[target] = kept

        # -- Step 3 (part 2) -- Constructing the feature context matrix --
        # The window is passed explicitly; it used to be read from a global, which ignored window_
        term_context, _, _ = construct_feature_vector(
            terms, vocab, documents, position_index, window_size=window_
        )

        # -- Step 4 -- Applying clustering on the feature context matrix --
        labels = KMeans(n_clusters=n_targets, random_state=random_state).fit(term_context).labels_

        # -- Step 5 -- Checking clusters --
        target_labels = labels[:n_targets]
        pseudo_labels = labels[n_targets:2 * n_targets]
        performance.append(np.sum(target_labels == pseudo_labels) / len(target_labels))
        timer.append(time.time() - loop_time)
        if verbose_:
            print("---- Labels for run " + str(testing+1) +" ----")
            print("Target: " + str(target_labels))
            print("Pseudo: " + str(pseudo_labels))
            print("Performance: " + str(performance[testing]))
    return performance, timer


In [11]:
# Experiment settings: number of repetitions, context window size, per-run logging
testing_number = 5
window = 1
verbose = True

# Time the whole experiment, including every repetition inside run()
start_time = time.time()
performance, timer = run(testing_number_=testing_number, window_=window, verbose_=verbose)

# Displaying results: collect the report lines first, then emit them in one go
summary_lines = [
    "Info and hyperparameters:",
    "| Ran " + str(len(performance)) + " times",
    "| Window size " + str(window),
    "Performance:",
    "| %s  mean" % (np.mean(performance)),
    "| %s  std" % (np.std(performance)),
    "Time:",
    "| %s  total seconds" % (time.time() - start_time),
    "| %s  mean seconds for loops" % (np.mean(timer)),
]
print("\n".join(summary_lines))

---- Labels for run 1 ----
Target: [42 10 36 22 23 13 11  9  6 35 30 14 19 43 25  8 25  5  7 25 45  0 25 25
 20 48 15 25  4  2 25  6 25 25 27 39 28 24 41 21  5 40 49 21 25 25 12 25
 25  1]
Pseudo: [16 38 17 26 31 34 37 32  6 35 30 33 44 43 46  8 25 47 29 25  3  0  4 25
 20 25 15 25 25 18 25 25 25 25 27 39 28 24 41 21 25 40 49 21 25 25 12 25
 25  1]
Performance: 0.6
---- Labels for run 2 ----
Target: [13  9 14 16 29 10 25 32  5 40 36  3 19 35 48  2 47  5  4 39 34 24  5  5
  6 42 21 49 39  1  5 20 39 39 44 37 41 38  0 30 39 31 18 30  5 39 11 39
 18 27]
Pseudo: [46  8 26 23 22 17 12  7 28 15 18 45 19 35 33  2 47 43  4  5 39 24 39  5
  6 39 21 49 39  1 39 20 20  5 44 37 41 38  0 30 39 31 18 30  5 39 11 39
 18 27]
Performance: 0.58
---- Labels for run 3 ----
Target: [24 12 27 16 35 13  9 46 29 31 33  7 40 34 41  5 44  0  1 22 37  3 22 22
 19 22 15 43 22  4 22 22 17 22 28 47 20 45 49 14 22 18 32 23 22 22  8 22
 32 30]
Pseudo: [21  2 11 39 38 10 26  6 42 48 36  7 25 34  0  5 44 22  1 22 22  3