In [6]:
import pandas as pd
# Both files are tab-separated with a header row. quoting=3 is csv.QUOTE_NONE,
# so double quotes inside the review text are kept as ordinary characters.
train = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
unlabeled_train = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3)

# Only the labeled training set and the unlabeled set are loaded in this
# notebook, so the summary reports exactly those two counts.
print("Read %d labeled train reviews and %d unlabeled reviews\n"
      % (train["review"].size, unlabeled_train["review"].size))


Read 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews



In [16]:
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=False):
    """Convert one piece of raw review text into a list of lowercase words.

    Markup is stripped with BeautifulSoup (lxml parser), every character that
    is not an ASCII letter is replaced by a space, and the remaining text is
    lowercased and split on whitespace.

    Args:
        review: Text to clean; may contain HTML.
        remove_stopwords: When true, words present in NLTK's English stopword
            list are dropped from the result.

    Returns:
        The words of the cleaned text, in their original order.
    """
    plain_text = BeautifulSoup(review, "lxml").get_text()
    letters_only = re.sub("[^a-zA-Z]", " ", plain_text)
    tokens = letters_only.lower().split()

    if not remove_stopwords:
        return tokens

    # A set gives constant-time membership checks while filtering.
    english_stops = set(stopwords.words("english"))
    return [token for token in tokens if token not in english_stops]

In [17]:
import nltk.data
# Load NLTK's English Punkt tokenizer; it is used below to split each review
# into sentences. NOTE(review): presumably requires the "punkt" resource to be
# present in the local NLTK data directory - confirm on a fresh environment.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [18]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    """Split a review into sentences, each represented as a list of words.

    Args:
        review: Raw review text; surrounding whitespace is stripped before
            it is handed to the tokenizer.
        tokenizer: Object whose ``tokenize(text)`` method returns the
            sentence strings found in ``text``.
        remove_stopwords: Forwarded unchanged to ``review_to_wordlist``.

    Returns:
        A list containing one word list per non-empty sentence.
    """
    return [
        review_to_wordlist(sentence_text, remove_stopwords)
        for sentence_text in tokenizer.tokenize(review.strip())
        if len(sentence_text) > 0  # skip empty strings from the tokenizer
    ]

In [19]:
# Build the Word2Vec training corpus: a flat list of sentences, each sentence
# being a list of words. The labeled and unlabeled reviews are handled the
# same way, so both go through one loop.
corpora = (
    ("Parsing sentences from training set", train),
    ("Parsing sentences from unlabeled set", unlabeled_train),
)

sentences = []
for banner, frame in corpora:
    print(banner)
    for review in frame["review"]:
        # One review yields several sentences; extend keeps the list flat.
        sentences.extend(review_to_sentences(review, tokenizer))

Parsing sentences from training set


  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


Parsing sentences from unlabeled set


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [20]:
# Spot-check the first parsed sentence: it should be a list of lowercase words.
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [21]:
# Spot-check the second parsed sentence as well.
print(sentences[1])

['maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent']


In [22]:
import logging
# Show INFO-level log records with a timestamp and level prefix, so the
# progress messages emitted during model training are visible in the notebook.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [23]:
# Hyperparameters for the Word2Vec model; each one is passed by keyword to
# word2vec.Word2Vec in the training cell.

# Passed as size=: dimensionality of the learned word vectors.
num_features = 300 
# Passed as min_count=: words seen fewer times than this are dropped from the vocabulary.
min_word_count = 40 
# Passed as workers=: number of worker threads used during training.
num_workers = 4 
# Passed as window=: size of the context window around each word.
context = 10

# Passed as sample=: downsampling threshold for very frequent words.
downsampling = 1e-3 

In [38]:
from gensim.models import word2vec
print("Training Model ... ")

# Train on the parsed sentences using the hyperparameters defined in the
# configuration cell.
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling,
)

# Per gensim's documentation, replace=True keeps only the normalized vectors
# to save memory; the model should not be trained further after this call.
model.init_sims(replace=True)

# Save the model to disk under a name that records the settings used.
model_name = "300features_40minwords_10context"
model.save(model_name)

2018-03-09 14:19:52,823 : INFO : collecting all words and their counts
2018-03-09 14:19:52,826 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-03-09 14:19:52,938 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types


Training Model ... 


2018-03-09 14:19:53,050 : INFO : PROGRESS: at sentence #20000, processed 451867 words, keeping 24947 word types
2018-03-09 14:19:53,166 : INFO : PROGRESS: at sentence #30000, processed 671290 words, keeping 30033 word types
2018-03-09 14:19:53,284 : INFO : PROGRESS: at sentence #40000, processed 897790 words, keeping 34347 word types
2018-03-09 14:19:53,393 : INFO : PROGRESS: at sentence #50000, processed 1116929 words, keeping 37760 word types
2018-03-09 14:19:53,499 : INFO : PROGRESS: at sentence #60000, processed 1338370 words, keeping 40722 word types
2018-03-09 14:19:53,618 : INFO : PROGRESS: at sentence #70000, processed 1561505 words, keeping 43332 word types
2018-03-09 14:19:53,739 : INFO : PROGRESS: at sentence #80000, processed 1780812 words, keeping 45713 word types
2018-03-09 14:19:53,852 : INFO : PROGRESS: at sentence #90000, processed 2004905 words, keeping 48134 word types
2018-03-09 14:19:53,961 : INFO : PROGRESS: at sentence #100000, processed 2226863 words, keeping 50

2018-03-09 14:20:00,700 : INFO : PROGRESS: at sentence #740000, processed 16550913 words, keeping 119654 word types
2018-03-09 14:20:00,777 : INFO : PROGRESS: at sentence #750000, processed 16769240 words, keeping 120282 word types
2018-03-09 14:20:00,858 : INFO : PROGRESS: at sentence #760000, processed 16988632 words, keeping 120917 word types
2018-03-09 14:20:00,935 : INFO : PROGRESS: at sentence #770000, processed 17215761 words, keeping 121690 word types
2018-03-09 14:20:01,025 : INFO : PROGRESS: at sentence #780000, processed 17445902 words, keeping 122389 word types
2018-03-09 14:20:01,123 : INFO : PROGRESS: at sentence #790000, processed 17672895 words, keeping 123055 word types
2018-03-09 14:20:01,180 : INFO : collected 123493 word types from a corpus of 17795898 raw words and 795538 sentences
2018-03-09 14:20:01,181 : INFO : Loading a fresh vocabulary
2018-03-09 14:20:02,778 : INFO : min_count=40 retains 16490 unique words (13% of original 123493, drops 107003)
2018-03-09 14:

2018-03-09 14:20:59,873 : INFO : EPOCH 2 - PROGRESS: at 29.53% examples, 262165 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:21:00,908 : INFO : EPOCH 2 - PROGRESS: at 31.51% examples, 260321 words/s, in_qsize 6, out_qsize 1
2018-03-09 14:21:01,916 : INFO : EPOCH 2 - PROGRESS: at 33.32% examples, 258251 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:21:02,917 : INFO : EPOCH 2 - PROGRESS: at 35.11% examples, 256527 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:21:03,970 : INFO : EPOCH 2 - PROGRESS: at 36.85% examples, 253885 words/s, in_qsize 6, out_qsize 1
2018-03-09 14:21:05,009 : INFO : EPOCH 2 - PROGRESS: at 38.40% examples, 250598 words/s, in_qsize 8, out_qsize 0
2018-03-09 14:21:06,029 : INFO : EPOCH 2 - PROGRESS: at 39.96% examples, 247905 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:21:07,039 : INFO : EPOCH 2 - PROGRESS: at 41.76% examples, 246882 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:21:08,042 : INFO : EPOCH 2 - PROGRESS: at 43.52% examples, 246050 words/s, in_qsiz

2018-03-09 14:22:10,720 : INFO : EPOCH 3 - PROGRESS: at 96.04% examples, 335007 words/s, in_qsize 8, out_qsize 0
2018-03-09 14:22:11,728 : INFO : EPOCH 3 - PROGRESS: at 98.77% examples, 335530 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:22:12,082 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-03-09 14:22:12,108 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-03-09 14:22:12,121 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-03-09 14:22:12,135 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-03-09 14:22:12,137 : INFO : EPOCH - 3 : training on 17795898 raw words (12747548 effective words) took 37.9s, 336036 effective words/s
2018-03-09 14:22:13,181 : INFO : EPOCH 4 - PROGRESS: at 2.45% examples, 307696 words/s, in_qsize 6, out_qsize 1
2018-03-09 14:22:14,211 : INFO : EPOCH 4 - PROGRESS: at 4.59% examples, 285770 words/s, in_qsize 8, out_qsize 0
2018-03-09 14:22:15,216 : INFO : EPOCH 4 - PRO

2018-03-09 14:23:16,792 : INFO : EPOCH 5 - PROGRESS: at 79.58% examples, 371133 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:17,802 : INFO : EPOCH 5 - PROGRESS: at 81.82% examples, 367992 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:18,816 : INFO : EPOCH 5 - PROGRESS: at 84.18% examples, 365520 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:19,820 : INFO : EPOCH 5 - PROGRESS: at 86.93% examples, 364970 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:20,837 : INFO : EPOCH 5 - PROGRESS: at 89.54% examples, 363856 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:21,848 : INFO : EPOCH 5 - PROGRESS: at 91.99% examples, 362219 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:22,883 : INFO : EPOCH 5 - PROGRESS: at 94.55% examples, 360628 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:23,900 : INFO : EPOCH 5 - PROGRESS: at 96.97% examples, 358893 words/s, in_qsize 7, out_qsize 0
2018-03-09 14:23:24,908 : INFO : EPOCH 5 - PROGRESS: at 99.61% examples, 358368 words/s, in_qsiz

In [32]:
# Sanity check: three people words and one room; which word fits least?
model.wv.doesnt_match("man woman child kitchen".split())

'kitchen'

In [33]:
# Three countries and one city: does the model single out the city?
model.wv.doesnt_match("france england germany berlin".split())

'berlin'

In [34]:
# Three capital cities and one country.
# NOTE(review): the recorded output is 'paris' rather than 'austria', so the
# model does not separate cities from countries reliably.
model.wv.doesnt_match("paris berlin london austria".split())

'paris'

In [36]:
# List the words the model considers most similar to "man", with their scores.
model.wv.most_similar("man")

[('woman', 0.6042462587356567),
 ('lady', 0.5923141837120056),
 ('lad', 0.5652726292610168),
 ('monk', 0.5444037914276123),
 ('farmer', 0.5356318354606628),
 ('chap', 0.5209022760391235),
 ('businessman', 0.5208353996276855),
 ('guy', 0.514893651008606),
 ('boxer', 0.509599506855011),
 ('soldier', 0.5093779563903809)]

In [37]:
 # List the words the model considers most similar to "queen", with their scores.
 model.wv.most_similar("queen")

[('princess', 0.6770464181900024),
 ('belle', 0.6076541543006897),
 ('bride', 0.6059161424636841),
 ('victoria', 0.5874012112617493),
 ('maria', 0.5845478773117065),
 ('maid', 0.5812250375747681),
 ('starlet', 0.5796318650245667),
 ('mistress', 0.5777221322059631),
 ('duchess', 0.5757953524589539),
 ('eva', 0.5731122493743896)]