In [1]:
# Load a pre-trained model
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
import numpy as np
from KaggleWord2VecUtility import KaggleWord2VecUtility



In [2]:
# Define a function to create bags of centroids
#
def create_bag_of_centroids(wordlist, word_centroid_map):
    #
    # The number of cluster is equal to the highest cluster index
    # in the word / centroid map
    num_centroids = max(word_centroid_map.values()) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros(num_centroids, dtype='float32')
    #
    # Loop over the words in the review. If the word is in the vocabulary,
    # find which cluster it belongs to, and increment that cluster count
    # by one
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1
    #
    # Return the 'bag of centroids'
    return bag_of_centroids

In [3]:
# Load the model that I created in 'Word2Vec_AverageVectors.ipynb'
model = Word2Vec.load('./300features_40minwords_10context')

# ****** Run k-means on the word vectors and print a few clusters
#

start = time.time() # Start time

# Set "k" (num_cluster) to be 1/5th of the vocabulary size, or an
# average of 5 words per cluster
word_vectors = model.wv.syn0
# syn0: the Word2Vec model loaded above, consists of a feature vector for
#       each word in the vocabulary
# word_vectors row: the number of words in the model's vocabulary
# word_vectors column: the size of the feature vector
print(model.wv.syn0.shape)
num_clusters = int(word_vectors.shape[0] / 5) 
# Initialize a k-means object and use it to extract centroids
print("Running K means with k = ", num_clusters)
kmeans_clustering = KMeans(n_clusters = num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

(16490, 300)
Running K means with k =  3298
Time taken for K Means clustering:  929.8754115104675 seconds.


In [4]:
print(len(model.wv.vocab)) # the number of words in the model's vocabulary
model.wv.vocab # example of the vocabulary words

16490


{'yul': <gensim.models.keyedvectors.Vocab at 0x22dbea71240>,
 'kinski': <gensim.models.keyedvectors.Vocab at 0x22dbeaf6518>,
 'cheerfully': <gensim.models.keyedvectors.Vocab at 0x22dbea51a58>,
 'opus': <gensim.models.keyedvectors.Vocab at 0x22dbed53e48>,
 'jane': <gensim.models.keyedvectors.Vocab at 0x22dbea7f6d8>,
 'imposing': <gensim.models.keyedvectors.Vocab at 0x22dbea47860>,
 'downer': <gensim.models.keyedvectors.Vocab at 0x22dbed8ee48>,
 'knightley': <gensim.models.keyedvectors.Vocab at 0x22dbed97710>,
 'lengthy': <gensim.models.keyedvectors.Vocab at 0x22dbea75cc0>,
 'wit': <gensim.models.keyedvectors.Vocab at 0x22dbec249b0>,
 'jerky': <gensim.models.keyedvectors.Vocab at 0x22dbeb88f60>,
 'customs': <gensim.models.keyedvectors.Vocab at 0x22dbec32400>,
 'convinced': <gensim.models.keyedvectors.Vocab at 0x22dbeaffc50>,
 'exiled': <gensim.models.keyedvectors.Vocab at 0x22dbea51c50>,
 'dribble': <gensim.models.keyedvectors.Vocab at 0x22dbeae8470>,
 'debra': <gensim.models.keyedvector

In [5]:
# print an example word vector
print(type(model.wv['flower']), model.wv['flower'].shape, model.wv['flower'])

<class 'numpy.ndarray'> (300,) [-0.10062144 -0.0373067  -0.01177098  0.01897836 -0.0582221   0.01059407
 -0.01903199 -0.03965223  0.01402162  0.06537773 -0.00234694  0.04390201
 -0.05563514 -0.08568203  0.0407625  -0.04255536  0.01772894 -0.1099337
 -0.07264499 -0.00234345  0.03807487 -0.00944625 -0.05043571  0.04681217
 -0.00365645 -0.0103762   0.04115774  0.00555492 -0.10632971  0.0169404
 -0.11786894  0.04571186 -0.05091904  0.02057571  0.03425485  0.03920021
 -0.04902321  0.00347328 -0.00153042 -0.01283417 -0.03548294  0.0088976
 -0.07267819  0.06515966  0.04340233  0.09843781  0.10642156 -0.05319658
 -0.00077657 -0.05363375 -0.0399363  -0.06495567 -0.00632878  0.05095981
 -0.06730764  0.09523525 -0.02798701  0.07545108  0.01979266  0.04176433
  0.1159274   0.00106818 -0.03873147  0.04909463 -0.01007216  0.00981472
  0.02324556 -0.0216225  -0.09122876  0.07475305  0.03391683  0.10518024
  0.03046319 -0.04791102 -0.01907292 -0.01433413  0.07916567  0.09057612
  0.00386166  0.0980695

In [6]:
# Create a Word / Index dictionary, mapping each vocabulary word to
# a cluster number
word_centroid_map = dict(zip(model.wv.index2word, idx))

In [7]:
# Print the first ten clusters
for cluster in range(0, 10):
    #
    # Print the cluster number
    print("\nCluster %d" % cluster)
    #
    # Find all of the words for that cluster number, and print them out
    words = []
    for i in range(0, len(word_centroid_map.values())):
        if list(word_centroid_map.values())[i] == cluster:
            words.append(list(word_centroid_map.keys())[i])
    print(words)


Cluster 0
['vertigo', 'alfred']

Cluster 1
['wounds', 'injuries']

Cluster 2
['jordan', 'dicaprio']

Cluster 3
['minorities', 'ethnic', 'traditionally', 'asians', 'intellectuals', 'regional']

Cluster 4
['scattered', 'sprinkled', 'interludes', 'inter', 'montages']

Cluster 5
['suit', 'wig', 'outfit', 'uniform', 'costume']

Cluster 6
['parrot', 'blind', 'mute', 'transvestite', 'deaf']

Cluster 7
['fix']

Cluster 8
['constant', 'nastiness', 'uncontrollable', 'outbursts', 'nausea']

Cluster 9
['seaside']


In [8]:
# Read data from files
train = pd.read_csv('./labeledTrainData.tsv', header=0, delimiter='\t', quoting=3)
test = pd.read_csv('./testData.tsv', header=0, delimiter='\t', quoting=3)

In [9]:
print("Cleaning training reviews")
clean_train_reviews = []
for review in train['review']:
    clean_train_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, \
                                remove_stopwords=True))

print("Cleaning test reviews")
clean_test_reviews = []
for review in test['review']:
    clean_test_reviews.append(KaggleWord2VecUtility.review_to_wordlist(review, \
                                remove_stopwords = True))
    
# ****** Create bags of centroids
#
# Pre-allocate an array for the training set bags of centroids (for speed)
train_centroids = np.zeros((train['review'].size, num_clusters), \
                          dtype='float32')

# Transform the training set reviews into bags of centroids
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids(review, \
        word_centroid_map)
    counter += 1
    
# Repeat for test reviews
test_centroids = np.zeros((test['review'].size, num_clusters), \
                         dtype = 'float32')

counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids(review, \
        word_centroid_map)
    counter += 1
    
# ****** Fit a random forest and extract predictions
#
forest = RandomForestClassifier(n_estimators = 100)

# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids, train['sentiment'])
result = forest.predict(test_centroids)

# Write the test results
output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
output.to_csv('BagOfCentroids.csv', index=False, quoting=3)
print("Wrote BagOfCentroids.csv")

Cleaning training reviews




 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


Cleaning test reviews
Fitting a random forest to labeled training data...
Wrote BagOfCentroids.csv
