In [8]:
import pandas as pd

# Load the three tab-separated review files. quoting=3 is csv.QUOTE_NONE, so
# double quotes inside the review text are treated as ordinary characters.
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding="utf-8",
)
test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding="utf-8",
)
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
    encoding="utf-8",
)

# Sanity check: report how many reviews each file contributed.
print(
    "Read %d labeled train reviews, %d labeled test reviews, "
    "and %d unlabeled reviews\n"
    % (train["review"].size, test["review"].size, unlabeled_train["review"].size)
)

Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [10]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    """Turn one raw review (or sentence) into a list of lower-case words.

    The text is stripped of HTML markup, every character that is not an
    ASCII letter is replaced by a space, and the result is lower-cased and
    split on whitespace.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML.
    remove_stopwords : bool, optional
        When True, words found in NLTK's English stop-word list are dropped.
        Defaults to False.

    Returns
    -------
    list of str
        The cleaned words, in their original order.
    """
    # Strip markup first so tag names never end up as tokens.
    plain_text = BeautifulSoup(review, "html.parser").get_text()

    # Digits and punctuation become spaces, i.e. word separators.
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)

    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens

    # A set gives constant-time membership checks during filtering.
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

In [11]:
import nltk.data
# One-time setup: uncomment the next line to download the punkt data if it
# is not already installed on this machine.
#nltk.download('punkt')

# Load the English punkt sentence tokenizer from NLTK's data files.
# NOTE(review): the resource is a pickle file, so only load it from a trusted
# NLTK data directory.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [12]:
# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences, each returned as a list of words.

    Parameters
    ----------
    review : str
        Raw review text.
    tokenizer : object
        Sentence splitter exposing ``tokenize(text)`` and returning the raw
        sentences.
    remove_stopwords : bool, optional
        Passed through to ``review_to_wordlist``. Defaults to False.

    Returns
    -------
    list of list of str
        One word list per non-empty sentence, in document order.
    """
    # Empty raw sentences are skipped; every other sentence is cleaned and
    # tokenized by review_to_wordlist.
    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in tokenizer.tokenize(review.strip())
        if len(raw_sentence) > 0
    ]

In [114]:


# Accumulate the word lists of every sentence from both corpora; this is the
# training input for Word2Vec.
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer))



Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [115]:
# Sanity check: total number of sentences collected from both corpora.
print (len(sentences))

795538


In [116]:
# Show the first parsed sentence to confirm it is a list of lower-case words.
sentences[0]

['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again']

In [119]:
# Route gensim's progress messages through the standard logging module so
# that Word2Vec reports what it is doing while it trains.
import logging
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

# Training hyper-parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Build the vocabulary and train the model (this will take some time)
from gensim.models import word2vec
print( "Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# No further training is planned, so let init_sims make the model much more
# memory-efficient.
model.init_sims(replace=True)

# Persist the model under a name that records its main settings; it can be
# restored later with Word2Vec.load().
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-09-21 01:08:48,531 : INFO : 'pattern' package not found; tag filters are not available for English
2018-09-21 01:08:49,110 : INFO : collecting all words and their counts
2018-09-21 01:08:49,114 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types


Training model...


2018-09-21 01:09:48,722 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2018-09-21 01:09:53,117 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2018-09-21 01:09:55,246 : INFO : PROGRESS: at sentence #30000, processed 671314 words, keeping 30034 word types
2018-09-21 01:09:57,221 : INFO : PROGRESS: at sentence #40000, processed 897814 words, keeping 34348 word types
2018-09-21 01:09:58,743 : INFO : PROGRESS: at sentence #50000, processed 1116962 words, keeping 37761 word types
2018-09-21 01:09:59,965 : INFO : PROGRESS: at sentence #60000, processed 1338403 words, keeping 40723 word types
2018-09-21 01:10:01,053 : INFO : PROGRESS: at sentence #70000, processed 1561579 words, keeping 43333 word types
2018-09-21 01:10:02,157 : INFO : PROGRESS: at sentence #80000, processed 1780886 words, keeping 45714 word types
2018-09-21 01:10:03,137 : INFO : PROGRESS: at sentence #90000, processed 2004995 words, keeping 4813

2018-09-21 01:10:29,096 : INFO : PROGRESS: at sentence #730000, processed 16332045 words, keeping 118955 word types
2018-09-21 01:10:29,521 : INFO : PROGRESS: at sentence #740000, processed 16553078 words, keeping 119669 word types
2018-09-21 01:10:29,874 : INFO : PROGRESS: at sentence #750000, processed 16771405 words, keeping 120296 word types
2018-09-21 01:10:30,161 : INFO : PROGRESS: at sentence #760000, processed 16990809 words, keeping 120931 word types
2018-09-21 01:10:30,543 : INFO : PROGRESS: at sentence #770000, processed 17217946 words, keeping 121704 word types
2018-09-21 01:10:30,888 : INFO : PROGRESS: at sentence #780000, processed 17448092 words, keeping 122403 word types
2018-09-21 01:10:31,126 : INFO : PROGRESS: at sentence #790000, processed 17675168 words, keeping 123067 word types
2018-09-21 01:10:31,342 : INFO : collected 123505 word types from a corpus of 17798269 raw words and 795538 sentences
2018-09-21 01:10:31,345 : INFO : Loading a fresh vocabulary
2018-09-21

2018-09-21 01:11:38,164 : INFO : EPOCH 1 - PROGRESS: at 43.58% examples, 88590 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:39,206 : INFO : EPOCH 1 - PROGRESS: at 44.50% examples, 88937 words/s, in_qsize 8, out_qsize 1
2018-09-21 01:11:40,247 : INFO : EPOCH 1 - PROGRESS: at 45.41% examples, 89274 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:41,259 : INFO : EPOCH 1 - PROGRESS: at 46.27% examples, 89638 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:42,308 : INFO : EPOCH 1 - PROGRESS: at 47.22% examples, 90053 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:43,361 : INFO : EPOCH 1 - PROGRESS: at 48.06% examples, 90238 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:44,437 : INFO : EPOCH 1 - PROGRESS: at 48.87% examples, 90387 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:45,509 : INFO : EPOCH 1 - PROGRESS: at 49.61% examples, 90332 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:11:46,553 : INFO : EPOCH 1 - PROGRESS: at 50.28% examples, 90212 words/s, in_qsize 6, out_

2018-09-21 01:12:55,908 : INFO : EPOCH 1 - PROGRESS: at 98.29% examples, 89351 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:12:57,055 : INFO : EPOCH 1 - PROGRESS: at 98.89% examples, 89182 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:12:58,144 : INFO : EPOCH 1 - PROGRESS: at 99.54% examples, 89101 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:12:58,479 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-21 01:12:58,486 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-21 01:12:58,520 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-21 01:12:58,554 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-21 01:12:58,556 : INFO : EPOCH - 1 : training on 17798269 raw words (12749854 effective words) took 142.9s, 89240 effective words/s
2018-09-21 01:12:59,755 : INFO : EPOCH 2 - PROGRESS: at 0.67% examples, 83372 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:13:00,791 : INFO : EPOCH 2 - PROGRE

2018-09-21 01:14:10,313 : INFO : EPOCH 2 - PROGRESS: at 48.72% examples, 86500 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:14:11,340 : INFO : EPOCH 2 - PROGRESS: at 49.27% examples, 86263 words/s, in_qsize 6, out_qsize 0
2018-09-21 01:14:12,355 : INFO : EPOCH 2 - PROGRESS: at 49.66% examples, 85752 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:14:13,364 : INFO : EPOCH 2 - PROGRESS: at 50.11% examples, 85360 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:14:14,509 : INFO : EPOCH 2 - PROGRESS: at 50.45% examples, 84639 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:14:15,581 : INFO : EPOCH 2 - PROGRESS: at 50.79% examples, 84018 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:14:16,728 : INFO : EPOCH 2 - PROGRESS: at 51.24% examples, 83513 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:14:17,746 : INFO : EPOCH 2 - PROGRESS: at 51.64% examples, 83073 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:14:18,795 : INFO : EPOCH 2 - PROGRESS: at 52.10% examples, 82698 words/s, in_qsize 7, out_

2018-09-21 01:15:31,001 : INFO : EPOCH 2 - PROGRESS: at 90.36% examples, 75647 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:15:32,053 : INFO : EPOCH 2 - PROGRESS: at 90.88% examples, 75547 words/s, in_qsize 7, out_qsize 1
2018-09-21 01:15:33,098 : INFO : EPOCH 2 - PROGRESS: at 91.54% examples, 75592 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:15:34,212 : INFO : EPOCH 2 - PROGRESS: at 92.10% examples, 75511 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:15:35,426 : INFO : EPOCH 2 - PROGRESS: at 92.74% examples, 75427 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:15:36,511 : INFO : EPOCH 2 - PROGRESS: at 93.29% examples, 75361 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:15:37,588 : INFO : EPOCH 2 - PROGRESS: at 94.10% examples, 75482 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:15:38,632 : INFO : EPOCH 2 - PROGRESS: at 94.77% examples, 75526 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:15:39,912 : INFO : EPOCH 2 - PROGRESS: at 95.65% examples, 75592 words/s, in_qsize 7, out_

2018-09-21 01:16:46,603 : INFO : EPOCH 3 - PROGRESS: at 34.82% examples, 73766 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:16:47,656 : INFO : EPOCH 3 - PROGRESS: at 35.33% examples, 73547 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:16:48,759 : INFO : EPOCH 3 - PROGRESS: at 35.95% examples, 73505 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:16:49,869 : INFO : EPOCH 3 - PROGRESS: at 36.51% examples, 73343 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:16:50,902 : INFO : EPOCH 3 - PROGRESS: at 37.06% examples, 73280 words/s, in_qsize 8, out_qsize 1
2018-09-21 01:16:51,921 : INFO : EPOCH 3 - PROGRESS: at 37.73% examples, 73457 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:16:52,929 : INFO : EPOCH 3 - PROGRESS: at 38.46% examples, 73740 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:16:54,102 : INFO : EPOCH 3 - PROGRESS: at 39.25% examples, 73947 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:16:55,119 : INFO : EPOCH 3 - PROGRESS: at 40.07% examples, 74414 words/s, in_qsize 7, out_

2018-09-21 01:18:06,351 : INFO : EPOCH 3 - PROGRESS: at 83.28% examples, 76046 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:18:07,384 : INFO : EPOCH 3 - PROGRESS: at 83.89% examples, 76046 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:18:08,410 : INFO : EPOCH 3 - PROGRESS: at 84.40% examples, 75950 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:18:09,485 : INFO : EPOCH 3 - PROGRESS: at 85.05% examples, 75979 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:18:10,490 : INFO : EPOCH 3 - PROGRESS: at 85.68% examples, 75997 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:18:11,498 : INFO : EPOCH 3 - PROGRESS: at 86.37% examples, 76061 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:18:12,673 : INFO : EPOCH 3 - PROGRESS: at 86.86% examples, 75889 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:18:13,728 : INFO : EPOCH 3 - PROGRESS: at 87.47% examples, 75878 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:18:14,740 : INFO : EPOCH 3 - PROGRESS: at 87.97% examples, 75795 words/s, in_qsize 6, out_

2018-09-21 01:19:19,803 : INFO : EPOCH 4 - PROGRESS: at 39.18% examples, 100408 words/s, in_qsize 7, out_qsize 1
2018-09-21 01:19:20,810 : INFO : EPOCH 4 - PROGRESS: at 39.90% examples, 100246 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:19:21,841 : INFO : EPOCH 4 - PROGRESS: at 40.69% examples, 100191 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:19:22,924 : INFO : EPOCH 4 - PROGRESS: at 41.42% examples, 99892 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:19:23,983 : INFO : EPOCH 4 - PROGRESS: at 42.19% examples, 99786 words/s, in_qsize 8, out_qsize 1
2018-09-21 01:19:25,040 : INFO : EPOCH 4 - PROGRESS: at 42.97% examples, 99689 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:19:26,078 : INFO : EPOCH 4 - PROGRESS: at 43.76% examples, 99629 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:19:27,139 : INFO : EPOCH 4 - PROGRESS: at 44.74% examples, 99910 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:19:28,167 : INFO : EPOCH 4 - PROGRESS: at 45.60% examples, 100098 words/s, in_qsize 7, 

2018-09-21 01:20:37,835 : INFO : EPOCH 4 - PROGRESS: at 98.77% examples, 98746 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:20:39,044 : INFO : EPOCH 4 - PROGRESS: at 99.54% examples, 98594 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:20:39,409 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-21 01:20:39,463 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-21 01:20:39,524 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-21 01:20:39,560 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-21 01:20:39,564 : INFO : EPOCH - 4 : training on 17798269 raw words (12748185 effective words) took 129.2s, 98636 effective words/s
2018-09-21 01:20:40,863 : INFO : EPOCH 5 - PROGRESS: at 0.45% examples, 45128 words/s, in_qsize 8, out_qsize 2
2018-09-21 01:20:41,887 : INFO : EPOCH 5 - PROGRESS: at 1.01% examples, 56220 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:20:42,944 : INFO : EPOCH 5 - PROGRES

2018-09-21 01:21:52,338 : INFO : EPOCH 5 - PROGRESS: at 46.61% examples, 81386 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:21:53,466 : INFO : EPOCH 5 - PROGRESS: at 47.10% examples, 81016 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:21:54,557 : INFO : EPOCH 5 - PROGRESS: at 47.67% examples, 80790 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:21:55,670 : INFO : EPOCH 5 - PROGRESS: at 48.22% examples, 80546 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:21:56,839 : INFO : EPOCH 5 - PROGRESS: at 48.93% examples, 80533 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:21:57,881 : INFO : EPOCH 5 - PROGRESS: at 49.61% examples, 80556 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:21:58,961 : INFO : EPOCH 5 - PROGRESS: at 50.28% examples, 80542 words/s, in_qsize 8, out_qsize 0
2018-09-21 01:22:00,017 : INFO : EPOCH 5 - PROGRESS: at 51.08% examples, 80729 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:22:01,127 : INFO : EPOCH 5 - PROGRESS: at 51.70% examples, 80594 words/s, in_qsize 7, out_

2018-09-21 01:23:10,930 : INFO : EPOCH 5 - PROGRESS: at 97.40% examples, 82027 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:23:12,138 : INFO : EPOCH 5 - PROGRESS: at 97.90% examples, 81800 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:23:13,152 : INFO : EPOCH 5 - PROGRESS: at 98.45% examples, 81727 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:23:14,216 : INFO : EPOCH 5 - PROGRESS: at 99.00% examples, 81626 words/s, in_qsize 6, out_qsize 1
2018-09-21 01:23:15,224 : INFO : EPOCH 5 - PROGRESS: at 99.54% examples, 81557 words/s, in_qsize 7, out_qsize 0
2018-09-21 01:23:15,668 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-21 01:23:15,756 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-21 01:23:15,889 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-21 01:23:15,903 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-21 01:23:15,909 : INFO : EPOCH - 5 : training on 17798269 raw words 

In [120]:
# Ask which word fits least with the others. The query goes through the
# model's keyed vectors (model.wv), as the next cell does, instead of the
# deprecated model-level shortcut.
model.wv.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [121]:
# Same odd-one-out query on a second word list.
model.wv.doesnt_match("hockey cricket swimming horse".split())



'hockey'

In [122]:
 # Nearest neighbours of "man", queried through model.wv rather than the
 # deprecated model-level shortcut.
 model.wv.most_similar("man")

  """Entry point for launching an IPython kernel.


[('woman', 0.6176144480705261),
 ('lady', 0.588788628578186),
 ('lad', 0.5731593370437622),
 ('men', 0.5240305662155151),
 ('guy', 0.5164757966995239),
 ('millionaire', 0.5151826739311218),
 ('businessman', 0.5099620223045349),
 ('person', 0.5094382166862488),
 ('soldier', 0.509232759475708),
 ('monk', 0.5027958154678345)]

In [123]:
# Nearest neighbours of "awful", queried through model.wv rather than the
# deprecated model-level shortcut.
model.wv.most_similar("awful")

  """Entry point for launching an IPython kernel.


[('terrible', 0.7823050618171692),
 ('atrocious', 0.7463761568069458),
 ('horrible', 0.7442225217819214),
 ('dreadful', 0.7181977033615112),
 ('abysmal', 0.6965848207473755),
 ('horrid', 0.6755940914154053),
 ('horrendous', 0.6673485636711121),
 ('appalling', 0.6633576154708862),
 ('lousy', 0.662935733795166),
 ('amateurish', 0.6082025766372681)]

In [126]:
# Inspect the model's lock-factor array.
# NOTE(review): the truncated warning in this cell's output suggests that
# `syn0_lockf` is a deprecated alias in this gensim version - confirm the
# replacement attribute before upgrading gensim.
model.syn0_lockf

  """Entry point for launching an IPython kernel.


array([1., 1., 1., ..., 1., 1., 1.], dtype=float32)

In [14]:
 # Load the model that we created in Part 2 (saved above under this same name).
from gensim.models import Word2Vec
# NOTE(review): this rebinds `model`, replacing any model already in memory.
model = Word2Vec.load("300features_40minwords_10context")


In [2]:
# The model loaded from disk is bound to `model`; `model_loaded` is never
# defined in this notebook and would raise NameError on a fresh kernel.
model.syn0_lockf.shape

  """Entry point for launching an IPython kernel.


(16490,)

In [24]:
def makeFeatureVec(words, model, num_features):
    """Average the word vectors of all in-vocabulary words of one review.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single review.
    model : trained Word2Vec-style model
        Must expose ``model.wv`` supporting ``word in model.wv`` and
        ``model.wv[word]``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(num_features,)`` holding the mean vector.
        If none of the words is in the model's vocabulary the zero vector is
        returned, instead of the all-NaN vector a 0/0 division would produce.
    """
    # Pre-initialize the accumulator
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0

    # Query the keyed vectors directly: membership is a constant-time lookup,
    # so no vocabulary-sized set has to be rebuilt on every call, and the
    # deprecated model[word] shortcut is avoided.
    word_vectors = model.wv

    # Sum the vectors of every word that is in the model's vocabulary
    for word in words:
        if word in word_vectors:
            nwords += 1
            featureVec += word_vectors[word]

    # Guard against division by zero for reviews with no known words
    if nwords == 0:
        return featureVec

    # Divide the sum by the number of words to get the average
    return featureVec / nwords



In [25]:
def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the averaged word vector of every review.

    Parameters
    ----------
    reviews : sequence of list of str
        One word list per review; must support ``len``.
    model : trained word-vector model
        Passed through to ``makeFeatureVec``.
    num_features : int
        Dimensionality of the word vectors.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(reviews), num_features)``; row ``i``
        is the average feature vector of ``reviews[i]``.
    """
    total = len(reviews)

    # Preallocate the result matrix, one row per review
    averaged = np.zeros((total, num_features), dtype="float32")

    for row, tokens in enumerate(reviews):
        # Print a status message every 1000th review
        if row % 1000 == 0:
            print("Review %d of %d" % (row, total))

        averaged[row] = makeFeatureVec(tokens, model, num_features)

    return averaged



In [26]:
# ****************************************************************
# Build one averaged feature vector per labeled training review, using the
# functions defined above. Unlike the Word2Vec training pass, stop words
# are removed here.
num_features = 300
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]

trainDataVecs = getAvgFeatureVecs(clean_train_reviews, model, num_features)

Review 0 of 25000


  app.launch_new_instance()


Review 1000 of 25000
Review 2000 of 25000
Review 3000 of 25000
Review 4000 of 25000
Review 5000 of 25000
Review 6000 of 25000
Review 7000 of 25000
Review 8000 of 25000
Review 9000 of 25000
Review 10000 of 25000
Review 11000 of 25000
Review 12000 of 25000
Review 13000 of 25000
Review 14000 of 25000
Review 15000 of 25000
Review 16000 of 25000
Review 17000 of 25000
Review 18000 of 25000
Review 19000 of 25000
Review 20000 of 25000
Review 21000 of 25000
Review 22000 of 25000
Review 23000 of 25000
Review 24000 of 25000
