In [1]:
import pickle
import re

import nltk.data
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from gensim.models import word2vec
from nltk.corpus import stopwords
# Load the English Punkt tokenizer pickle through NLTK's data loader.
# NOTE(review): assumes the 'punkt' resource is already present in the local
# NLTK data directory -- confirm before running on a fresh machine.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Read the labelled training set and the unlabelled test set.
train = pd.read_csv("data/train.csv", header=0)
test = pd.read_csv("data/test.csv", header=0)

# Verify the number of comments that were read
print("Read %d labeled train reviews and  %d unlabelled test reviews" % (len(train),len(test)))

# Word vectors are trained on the text of both sets together.
all_comments = train['comment_text'].tolist() + test['comment_text'].tolist()

# Everything that is not an ASCII letter (digits, punctuation, newlines, ...)
# becomes a space, so every comment ends up on exactly one line of the output.
non_letters = re.compile(r"[^a-zA-Z]")

with open("all_comments.csv", "w") as comments_file:
    # str() guards against non-string cells (e.g. missing values read as floats).
    comments_file.writelines(
        "%s\n" % non_letters.sub(" ", str(comment)) for comment in all_comments
    )

##### 1. Preprocess Text Reviews For Creating Word Vectors

The <b>sentences</b> iterable can be simply a list, but for larger corpora, consider a generator that streams the sentences directly from disk/network, without storing everything in RAM. See BrownCorpus, Text8Corpus or LineSentence in the gensim.models.word2vec module for such examples.

<b>min_count</b> ignores all words and bigrams with a total collected count lower than this.

<b>threshold</b> represents a score threshold for forming the phrases (higher means fewer phrases). A phrase of words a followed by b is accepted if the score of the phrase is greater than threshold. see the scoring setting.

<b>max_vocab_size</b> is the maximum size of the vocabulary. Used to control pruning of less common words, to keep memory under control. The default of 40M needs about 3.6GB of RAM; increase/decrease max_vocab_size depending on how much available memory you have.

<b>delimiter</b> is the glue character used to join collocation tokens, and should be a byte string (e.g. b’_’).

<b>scoring</b> specifies how potential phrases are scored for comparison to the threshold setting. scoring can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting scoring to a string:

‘default’: from “Efficient Estimation of Word Representations in Vector Space” by
Mikolov, et. al.: (count(worda followed by wordb) - min_count) * N / (count(worda) * count(wordb)) > threshold, where <b>N</b> is the total vocabulary size.
<b>npmi</b>: normalized pointwise mutual information, from “Normalized (Pointwise) Mutual
Information in Collocation Extraction” by Gerlof Bouma: ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / -ln(prop(worda followed by wordb)), where prop(n) is the count of n / the count of everything in the entire corpus.
‘npmi’ is more robust when dealing with common words that form part of common bigrams, and ranges from -1 to 1, but is slower to calculate than the default.

To use a custom scoring function, create a function with the following parameters and set the scoring parameter to the custom function. You must use all the parameters in your function call, even if the function does not require all the parameters.

<b>worda_count</b>: number of occurrences in sentences of the first token in the phrase being scored; <b>wordb_count</b>: number of occurrences in sentences of the second token in the phrase being scored; <b>bigram_count</b>: number of occurrences in sentences of the phrase being scored; <b>len_vocab</b>: the number of unique tokens in sentences; <b>min_count</b>: the min_count setting of the Phrases class; <b>corpus_word_count</b>: the total number of (non-unique) tokens in sentences.
A scoring function without any of these parameters (even if the parameters are not used) will raise a ValueError on initialization of the Phrases class. The scoring function must be picklable.

<b>common_terms</b> is an optional list of “stop words” that won’t affect frequency count of expressions containing them.

In [None]:
class FileToComments(object):
    """Re-iterable stream of tokenized comments read from a text file.

    Every iteration reopens ``filename`` and yields one list of tokens per
    line, so the object can be consumed several times without keeping the
    whole corpus in memory.

    Attributes:
        filename: path of the file holding one comment per line.
        stop: set of English stop words from NLTK. It is only stored here;
            stop words are NOT removed while iterating.
    """

    def __init__(self, filename):
        self.filename = filename
        self.stop = set(nltk.corpus.stopwords.words('english'))

    def __iter__(self):
        """Yield each line of the file as a list of lower-cased tokens.

        Tokens are obtained by splitting on whitespace, so a blank line
        yields an empty list.
        """
        # The context manager closes the handle once the file is exhausted
        # (or the generator is discarded) instead of leaking it.
        with open(self.filename, 'r') as comments_file:
            for line in comments_file:
                yield line.lower().split()
        
# Stream the cleaned comments from disk. Each iteration over `comments`
# reopens the file, so the same object can be passed to several consumers.
comments = FileToComments('all_comments.csv')
#model = gensim.models.Word2Vec(sentences=sentences, window=5, min_count=5, workers=4, hs=1)

In [None]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Learn bigram collocations from the streamed comments: words and bigrams seen
# fewer than 30 times are ignored, and a pair is accepted as a phrase only when
# its score exceeds 15 (a higher threshold means fewer phrases).
bigram = Phrases(comments, min_count=30, threshold=15)
# Wrap the trained Phrases model in a Phraser, which is what later cells index
# with a token list to obtain the phrase-merged tokens.
# NOTE(review): presumably a lighter, faster export of `bigram` -- see gensim docs.
bigram_phraser = Phraser(bigram) 

In [None]:
# Run every streamed comment through the bigram phraser first, then strip the
# English stop words from each phrased token list.
tokens = [bigram_phraser[comment] for comment in comments]
stops = set(stopwords.words("english"))
clean_tokens = [
    [word for word in phrased if word not in stops]
    for phrased in tokens
]

In [None]:
# #Pickle the tokens file for further use
# import pickle
# with open('tokenized_comments.pickle', 'wb') as filename:
#     pickle.dump(clean_tokens, filename, protocol=pickle.HIGHEST_PROTOCOL)

##### 2. Training and Saving Your Model

With the list of nicely parsed sentences, we're ready to train the model. There are a number of parameter choices that affect the run time and the quality of the final model that is produced. For details on the algorithms below, see the word2vec API documentation as well as the Google documentation. 

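 - Architecture: Architecture options are skip-gram or continuous bag of words (CBOW). Note that gensim's `Word2Vec` defaults to CBOW (`sg=0`); pass `sg=1` to select skip-gram. We found that skip-gram was very slightly slower but produced better results. 
 - Training algorithm: Hierarchical softmax or negative sampling. Note that gensim's `Word2Vec` defaults to negative sampling (`hs=0`, `negative=5`); pass `hs=1` to select hierarchical softmax. For us, the default worked well.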
 - Downsampling of frequent words: The Google documentation recommends values between .00001 and .001. For us, values closer to 0.001 seemed to improve the accuracy of the final model.
 - Word vector dimensionality: More features result in longer runtimes, and often, but not always, result in better models. Reasonable values can be in the tens to hundreds; we used 300. 
 - Context / window size: How many words of context should the training algorithm take into account? 10 seems to work well for hierarchical softmax (more is better, up to a point).
 - Worker threads: Number of parallel processes to run. This is computer-specific, but between 4 and 6 should work on most systems.
 - Minimum word count: This helps limit the size of the vocabulary to meaningful words. Any word that does not occur at least this many times across all documents is ignored. Reasonable values could be between 10 and 100. In this case, since each movie occurs 30 times, we set the minimum word count to 40, to avoid attaching too much importance to individual movie titles. This resulted in an overall vocabulary size of around 15,000 words. Higher values also help limit run time.

In [2]:
# Load the pre-saved tokenized comments (a pickle of token lists).
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load pickles that were produced by this notebook.
# `filename` is the open binary file handle here, not a path string.
with open('tokenized_comments.pickle', 'rb') as filename:
    comments = pickle.load(filename)
                           
# Surface gensim's INFO-level progress messages (vocabulary scan, training
# progress) with a timestamp in front of each line.
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)


# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 20   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time).
# Passing the corpus to the constructor builds the vocabulary and trains in
# one call. Architecture and training algorithm are left at library defaults.
# NOTE(review): `size=` is the keyword of older gensim releases -- confirm
# against the installed gensim version.
print("Training model...")
model = word2vec.Word2Vec(comments,
                          workers=num_workers,
                          size=num_features,
                          min_count = min_word_count,
                          window = context,
                          sample = downsampling
                         )

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
# replace=True means the model should not be trained further afterwards.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
# NOTE(review): the name says "40minwords" although min_word_count is 20 above.
# It is left unchanged because the model is reloaded under this exact name
# later in the notebook.
model_name = "300features_40minwords_10context"
model.save(model_name)

2017-12-28 08:43:51,274 : INFO : collecting all words and their counts
2017-12-28 08:43:51,279 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-12-28 08:43:51,455 : INFO : PROGRESS: at sentence #10000, processed 332303 words, keeping 35481 word types


Training model...


2017-12-28 08:43:51,653 : INFO : PROGRESS: at sentence #20000, processed 664751 words, keeping 52404 word types
2017-12-28 08:43:51,790 : INFO : PROGRESS: at sentence #30000, processed 998240 words, keeping 66195 word types
2017-12-28 08:43:51,922 : INFO : PROGRESS: at sentence #40000, processed 1342842 words, keeping 78285 word types
2017-12-28 08:43:52,046 : INFO : PROGRESS: at sentence #50000, processed 1672684 words, keeping 89015 word types
2017-12-28 08:43:52,176 : INFO : PROGRESS: at sentence #60000, processed 2007133 words, keeping 98715 word types
2017-12-28 08:43:52,299 : INFO : PROGRESS: at sentence #70000, processed 2341001 words, keeping 107515 word types
2017-12-28 08:43:52,425 : INFO : PROGRESS: at sentence #80000, processed 2684149 words, keeping 116238 word types
2017-12-28 08:43:52,544 : INFO : PROGRESS: at sentence #90000, processed 3020627 words, keeping 124056 word types
2017-12-28 08:43:52,673 : INFO : PROGRESS: at sentence #100000, processed 3398527 words, keepin

2017-12-28 08:44:35,817 : INFO : PROGRESS: at 30.07% examples, 419942 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:44:36,837 : INFO : PROGRESS: at 30.81% examples, 419563 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:44:37,844 : INFO : PROGRESS: at 31.46% examples, 417734 words/s, in_qsize 6, out_qsize 1
2017-12-28 08:44:38,905 : INFO : PROGRESS: at 32.21% examples, 417144 words/s, in_qsize 8, out_qsize 0
2017-12-28 08:44:39,906 : INFO : PROGRESS: at 32.82% examples, 415145 words/s, in_qsize 8, out_qsize 0
2017-12-28 08:44:40,913 : INFO : PROGRESS: at 33.60% examples, 415041 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:44:41,941 : INFO : PROGRESS: at 34.13% examples, 411798 words/s, in_qsize 8, out_qsize 0
2017-12-28 08:44:42,956 : INFO : PROGRESS: at 34.82% examples, 410837 words/s, in_qsize 6, out_qsize 1
2017-12-28 08:44:43,959 : INFO : PROGRESS: at 35.52% examples, 409811 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:44:45,004 : INFO : PROGRESS: at 36.15% examples, 408462 wor

2017-12-28 08:45:57,073 : INFO : PROGRESS: at 94.66% examples, 422235 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:45:58,082 : INFO : PROGRESS: at 95.75% examples, 423527 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:45:59,095 : INFO : PROGRESS: at 96.80% examples, 424836 words/s, in_qsize 8, out_qsize 0
2017-12-28 08:46:00,100 : INFO : PROGRESS: at 97.82% examples, 425983 words/s, in_qsize 8, out_qsize 0
2017-12-28 08:46:01,123 : INFO : PROGRESS: at 98.85% examples, 427189 words/s, in_qsize 6, out_qsize 1
2017-12-28 08:46:02,138 : INFO : PROGRESS: at 99.91% examples, 428434 words/s, in_qsize 7, out_qsize 0
2017-12-28 08:46:02,185 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-12-28 08:46:02,206 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-12-28 08:46:02,214 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-12-28 08:46:02,216 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-12-28 08:46:0

In [None]:
# Display the list of words that make up the trained model's vocabulary.
model.index2word

##### 3. Training A Model With Average Feature Vectors

In [None]:
# Reload the Word2Vec model from the file saved under this name by the
# training cell, so this section can be run without retraining.
from gensim.models import Word2Vec
import gensim
model = Word2Vec.load("300features_40minwords_10context")
# Alternative loader for the plain-text word2vec format, left disabled:
#model= gensim.models.KeyedVectors.load_word2vec_format("300features_40minwords_10context", binary=False)

In [None]:
# Display the model's vocabulary as a set (unordered, for membership tests).
set(model.index2word)

In [None]:
def makeFeatureVec(words, model, num_features, index2word_set=None):
    """Average the vectors of all in-vocabulary words of one comment.

    Args:
        words: iterable of tokens making up the comment.
        model: trained embedding exposing ``index2word`` (the vocabulary list)
            and ``model[word]`` vector lookup.
        num_features: length of each word vector.
        index2word_set: optional pre-built set of the vocabulary. When omitted
            it is built from ``model.index2word``; callers that process many
            comments should build it once and pass it in.

    Returns:
        A 1-D array of length ``num_features`` holding the mean word vector.
        When none of the words is in the vocabulary (or ``words`` is empty)
        the all-zero vector is returned instead of dividing by zero, which
        would fill the result with NaN.
    """
    if index2word_set is None:
        # Set membership is O(1); scanning the index2word list would be O(V).
        index2word_set = set(model.index2word)

    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])

    if nwords == 0:
        # Nothing to average: keep the zero vector rather than producing NaN.
        return featureVec
    return np.divide(featureVec, float(nwords))


def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the average feature vector of every review.

    Args:
        reviews: sized sequence of reviews, each one a list of words.
        model: trained embedding, see ``makeFeatureVec``.
        num_features: length of each word vector.

    Returns:
        A float32 array of shape ``(len(reviews), num_features)`` whose i-th
        row is the average vector of the i-th review.
    """
    total = len(reviews)
    # Preallocate the result so that rows are filled in place.
    reviewFeatureVecs = np.zeros((total, num_features), dtype="float32")
    # Build the vocabulary set once instead of once per review.
    index2word_set = set(model.index2word)

    # enumerate() yields integer row indices; NumPy rejects float indices.
    for counter, review in enumerate(reviews):
        # Print a status message every 1000th review
        if counter % 1000 == 0:
            print("Review %d of %d" % (counter, total))
        reviewFeatureVecs[counter] = makeFeatureVec(
            review, model, num_features, index2word_set)
    return reviewFeatureVecs

In [None]:
# ----------------------------------------------------------------
# Build the averaged word-vector features for the training set and then for
# the test set, with stop words removed from every review first.
# NOTE(review): `review_to_wordlist` and the "review" column are not defined
# anywhere in the visible part of this notebook (earlier cells read the
# 'comment_text' column) -- confirm both before running this cell.
num_features = 300    # Word vector dimensionality

clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]
trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]
testDataVecs = getAvgFeatureVecs(clean_test_reviews, model, num_features)