In [2]:
import pandas as pd

In [3]:
labeled_train = pd.read_csv( "labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
test = pd.read_csv( "testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

In [4]:
assert len(labeled_train) + len(test) + len(unlabeled_train) == 100000

In [5]:
# for dealing with HTML markup
from bs4 import BeautifulSoup

In [6]:
import re
from nltk.corpus import stopwords

In [7]:
def reviewWordList(rawReview, removeStopwords = False):
    
    # using the library BeautifulSoup for removing html markup from the raw review string
    withoutMarkup = BeautifulSoup(rawReview)
    withoutMarkup = withoutMarkup.get_text()

    # removing punctuation(which may take away smilies used in the review), and numbers for simplicity
    lettersOnly = re.sub('[^a-zA-Z]', ' ', withoutMarkup)
    # getting all words in lower case
    lettersOnly = lettersOnly.lower()
    words = lettersOnly.split()
    
    # Stop words are words which occur frequently in the language and don't carry much meaninig
    # by default, they will not be removed
    # set removeStopwords = True to remove them
    if removeStopwords:    
        # using sets to store stop words as they are faster for membership tests than lists
        stopWords = set(stopwords.words('english'))
        # removing the stopwords of english language from the words occured in the review
        words = [w for w in words if w not in stopWords]
    
    return words

In [8]:
import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
sampleReview = labeled_train['review'][0]

In [28]:
sampleReview

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [29]:
sampleReview.strip()

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

In [43]:
tokenizer.tokenize(sampleReview.strip())

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.',
 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.',
 'Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.',
 "Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring.",
 'Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit 

In [10]:
def reviewToSentences(review, tokenizer, removeStopwords=False):
    reviewSentences = tokenizer.tokenize(review.strip())
    sentences = []
    for sentence in reviewSentences:
        if len(sentence) > 0:
            wordList = reviewWordList(sentence, removeStopwords)
            sentences.append(wordList)
    return sentences

In [None]:
# for getting sentences in the form of word lists from the entire dataset
sentences = []

for review in labeled_train['review']:
    review = review.decode('utf8')
    sentencesInThisReview = reviewToSentences(review, tokenizer)
    sentences += sentencesInThisReview

for review in unlabeled_train['review']:
    review = review.decode('utf8')
    sentencesInThisReview = reviewToSentences(review, tokenizer)
    sentences += sentencesInThisReview



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))
  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [56]:
sentences[1:10]

[[u'maybe',
  u'i',
  u'just',
  u'want',
  u'to',
  u'get',
  u'a',
  u'certain',
  u'insight',
  u'into',
  u'this',
  u'guy',
  u'who',
  u'i',
  u'thought',
  u'was',
  u'really',
  u'cool',
  u'in',
  u'the',
  u'eighties',
  u'just',
  u'to',
  u'maybe',
  u'make',
  u'up',
  u'my',
  u'mind',
  u'whether',
  u'he',
  u'is',
  u'guilty',
  u'or',
  u'innocent'],
 [u'moonwalker',
  u'is',
  u'part',
  u'biography',
  u'part',
  u'feature',
  u'film',
  u'which',
  u'i',
  u'remember',
  u'going',
  u'to',
  u'see',
  u'at',
  u'the',
  u'cinema',
  u'when',
  u'it',
  u'was',
  u'originally',
  u'released'],
 [u'some',
  u'of',
  u'it',
  u'has',
  u'subtle',
  u'messages',
  u'about',
  u'mj',
  u's',
  u'feeling',
  u'towards',
  u'the',
  u'press',
  u'and',
  u'also',
  u'the',
  u'obvious',
  u'message',
  u'of',
  u'drugs',
  u'are',
  u'bad',
  u'm',
  u'kay',
  u'visually',
  u'impressive',
  u'but',
  u'of',
  u'course',
  u'this',
  u'is',
  u'all',
  u'about',
  u'micha

In [57]:
print len(sentences)

795538


In [None]:
# for better output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

In [None]:
num_features = 300
min_word_count = 40
num_workers = 4
context = 10
downsampling = 1e-3

In [61]:
from gensim.models import word2vec
print "Training model..."
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, window = context, sample = downsampling)

2016-11-08 13:38:02,626 : INFO : 'pattern' package not found; tag filters are not available for English
2016-11-08 13:38:02,714 : INFO : collecting all words and their counts
2016-11-08 13:38:02,715 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2016-11-08 13:38:02,798 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2016-11-08 13:38:02,866 : INFO : PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
2016-11-08 13:38:02,937 : INFO : PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types


Training model...


2016-11-08 13:38:03,014 : INFO : PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2016-11-08 13:38:03,087 : INFO : PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2016-11-08 13:38:03,159 : INFO : PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2016-11-08 13:38:03,262 : INFO : PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2016-11-08 13:38:03,338 : INFO : PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2016-11-08 13:38:03,410 : INFO : PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2016-11-08 13:38:03,486 : INFO : PROGRESS: at sentence #100000, processed 2226966 words, keeping 50207 word types
2016-11-08 13:38:03,557 : INFO : PROGRESS: at sentence #110000, processed 2446580 words, keeping 52081 word types
2016-11-08 13:38:03,633 : INFO : PROGRESS: at sentence #120000, processed 2668775 words, keepin

In [62]:
# for making the model much more memory efficient
model.init_sims(replace=True)
model_name = 'kaggleParameters'
model.save(model_name)

2016-11-08 13:43:03,470 : INFO : precomputing L2-norms of word weight vectors
2016-11-08 13:43:03,758 : INFO : saving Word2Vec object under kaggleParameters, separately None
2016-11-08 13:43:03,759 : INFO : not storing attribute syn0norm
2016-11-08 13:43:03,761 : INFO : not storing attribute cum_table
2016-11-08 13:43:04,797 : INFO : saved kaggleParameters


In [1]:
from gensim.models import word2vec
model = word2vec.Word2Vec.load('kaggleParameters')



In [7]:
type(model.syn0)

numpy.ndarray

In [8]:
model.syn0.shape

(16490, 300)

In [10]:
type(model.vocab)

dict

In [14]:
model.vocab

{u'raining': <gensim.models.word2vec.Vocab at 0x7fe24a3e6050>,
 u'writings': <gensim.models.word2vec.Vocab at 0x7fe24a3e6090>,
 u'hordes': <gensim.models.word2vec.Vocab at 0x7fe24a3e60d0>,
 u'galactica': <gensim.models.word2vec.Vocab at 0x7fe24a3e6110>,
 u'foul': <gensim.models.word2vec.Vocab at 0x7fe24a3e6150>,
 u'four': <gensim.models.word2vec.Vocab at 0x7fe24a3e6190>,
 u'gag': <gensim.models.word2vec.Vocab at 0x7fe2486cf650>,
 u'woods': <gensim.models.word2vec.Vocab at 0x7fe24a3e6210>,
 u'spiders': <gensim.models.word2vec.Vocab at 0x7fe24a3e6290>,
 u'hanging': <gensim.models.word2vec.Vocab at 0x7fe24a3e62d0>,
 u'woody': <gensim.models.word2vec.Vocab at 0x7fe24a3e6310>,
 u'comically': <gensim.models.word2vec.Vocab at 0x7fe24a3e6350>,
 u'gabrielle': <gensim.models.word2vec.Vocab at 0x7fe24a3e6390>,
 u'marching': <gensim.models.word2vec.Vocab at 0x7fe24a3e63d0>,
 u'increase': <gensim.models.word2vec.Vocab at 0x7fe248a69450>,
 u'electricity': <gensim.models.word2vec.Vocab at 0x7fe24a3e6

In [15]:
len(model.vocab)

16490

In [16]:
model.index2word

[u'the',
 u'and',
 u'a',
 u'of',
 u'to',
 u'is',
 u'it',
 u'in',
 u'i',
 u'this',
 u'that',
 u's',
 u'was',
 u'as',
 u'with',
 u'for',
 u'movie',
 u'but',
 u'film',
 u'you',
 u't',
 u'on',
 u'not',
 u'he',
 u'are',
 u'his',
 u'have',
 u'be',
 u'one',
 u'all',
 u'at',
 u'they',
 u'by',
 u'who',
 u'an',
 u'from',
 u'so',
 u'like',
 u'there',
 u'her',
 u'or',
 u'just',
 u'about',
 u'out',
 u'has',
 u'if',
 u'what',
 u'some',
 u'good',
 u'can',
 u'more',
 u'when',
 u'very',
 u'she',
 u'up',
 u'no',
 u'time',
 u'even',
 u'would',
 u'my',
 u'which',
 u'their',
 u'story',
 u'only',
 u'really',
 u'see',
 u'had',
 u'were',
 u'well',
 u'we',
 u'me',
 u'than',
 u'much',
 u'bad',
 u'get',
 u'been',
 u'people',
 u'also',
 u'into',
 u'do',
 u'great',
 u'other',
 u'will',
 u'first',
 u'because',
 u'him',
 u'how',
 u'most',
 u'don',
 u'them',
 u'made',
 u'its',
 u'make',
 u'then',
 u'way',
 u'could',
 u'too',
 u'movies',
 u'after',
 u'any',
 u'characters',
 u'character',
 u'think',
 u'films',
 u'two',

In [20]:
model.syn0[0] == model[model.index2word[0]]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [21]:
type(model['to'])

numpy.ndarray

In [5]:
import numpy as np

In [6]:
np.array_equal(model.syn0[0], model[model.index2word[0]])

True

In [None]:
model.similar_by_vector(model['king'] - model['man'] + model['woman'])

In [10]:
model.syn0

array([[ 0.02903191, -0.00973335,  0.03219427, ..., -0.08135615,
        -0.03701866, -0.03512355],
       [-0.01272452,  0.09487445,  0.07090939, ..., -0.00964031,
        -0.10211296,  0.0516564 ],
       [-0.13472755, -0.06105804,  0.09714838, ...,  0.02951038,
        -0.04869334, -0.09796863],
       ..., 
       [-0.03496105,  0.1049808 , -0.12801889, ..., -0.03543064,
        -0.0708217 ,  0.04881627],
       [ 0.03428168,  0.05503095, -0.03196469, ...,  0.19315711,
        -0.02666347,  0.04711537],
       [ 0.05423125,  0.00195914, -0.01626314, ...,  0.09409437,
        -0.07786203, -0.01682461]], dtype=float32)

In [11]:
model.syn0[5] in model.syn0

True

In [None]:
model.doesnt_match('man woman child kitchen'.split())

In [None]:
model.most_similar('man')