In [1]:
import itertools
from multiprocessing import cpu_count, Pool

import numpy as np
import pandas as pd

from torchtext import data
from nltk.tokenize import word_tokenize
from gensim.models.word2vec import Word2Vec

In [2]:
# Load only the review text column of the negative-review CSV.
df_neg = pd.read_csv(
    'data/amazon_negative.csv',
    usecols=['review/text'],
)

In [3]:
# Preview the first rows to check that the review text column loaded.
df_neg.head()

Unnamed: 0,review/text
0,I bought this book because I read some glowing...
1,"This is a self-published book, and if you want..."
2,A complete waste of time. Typographical errors...
3,I guess you have to be a romance novel lover f...
4,I feel I have to write to keep others from was...


In [4]:
def tokenize_corpus(corpus):
    """Tokenize each text in ``corpus`` and lowercase every token.

    Args:
        corpus: iterable of texts; each one is passed to ``word_tokenize``.

    Returns:
        A list with one entry per input text, each entry being the list of
        lowercased tokens produced for that text.
    """
    tokenized = []
    for text in corpus:
        # Lowercasing happens per token, after tokenization, so the
        # tokenizer sees the text with its original casing.
        tokens = word_tokenize(text)
        tokenized.append([tok.lower() for tok in tokens])
    return tokenized

In [5]:
# Pull the raw review texts out of the frame as a plain Python list.
sentences = df_neg['review/text'].tolist()

In [6]:
def split_data(sents, n_chunks=16):
    """Split ``sents`` into at most ``n_chunks`` contiguous slices.

    Args:
        sents: a sliceable sequence (e.g. a list) of items to partition.
        n_chunks: maximum number of slices to produce; defaults to 16.

    Returns:
        A list of slices of ``sents`` in their original order. Concatenating
        the slices reproduces ``sents``. An empty input yields ``[]``.

    Raises:
        ValueError: if ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError('n_chunks must be at least 1')

    numel = len(sents)
    if numel == 0:
        return []

    # Ceiling division: guarantees a chunk size of at least 1 (so short inputs
    # no longer produce a zero step for range) and never more than n_chunks
    # slices (so there is no extra remainder chunk).
    chunk_size = (numel + n_chunks - 1) // n_chunks
    return [sents[start: start + chunk_size] for start in range(0, numel, chunk_size)]

In [7]:
# Partition the raw review texts into chunks that can be tokenized independently.
chunks = split_data(sentences)

In [8]:
# Tokenize the chunks in parallel on 16 worker processes, then flatten the
# per-chunk results back into a single list of token lists.
with Pool(processes=16) as pool:
    tokenized_chunks = pool.map(tokenize_corpus, chunks)
sentences = list(itertools.chain.from_iterable(tokenized_chunks))

In [9]:
# train model
# NOTE(review): `size=` and `model.wv.vocab` look like the pre-4.0 gensim
# API — confirm the installed gensim version before upgrading.
model = Word2Vec(sentences=sentences, min_count=5, workers=16, window=5, size=100)
# summarize the freshly trained model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print('Vocab size: %s' % len(words))

# save model
model.save('wv_neg.bin')

# load the model back from disk to check that the saved file round-trips
new_model = Word2Vec.load('wv_neg.bin')
print(new_model)

Word2Vec(vocab=217544, size=100, alpha=0.025)
Vocab size: 217544
Word2Vec(vocab=217544, size=100, alpha=0.025)
