In [2]:
import itertools
from multiprocessing import cpu_count, Pool
import pickle

import numpy as np
import pandas as pd

from torchtext import data
from nltk.tokenize import word_tokenize
from gensim.models.word2vec import Word2Vec

In [5]:
# Load the positive reviews, keeping only the review text column.
df_pos = pd.read_csv(
    'data/amazon_positive.csv',
    usecols=['review/text'],
)

In [6]:
# Preview the first rows of the loaded frame.
df_pos.head()

Unnamed: 0,review/text
0,I own the Austin Reed dartmouth blazer in ever...
1,Got these last Christmas as a gag gift. They a...
2,This is only for Julie Strain fans. It's a col...
3,I hope a lot of people hear this cd. We need m...
4,My lovely Pat has one of the GREAT voices of h...


In [7]:
# Discard every row that has a missing value (arguments spelled out; these are
# the pandas defaults).
df_pos = df_pos.dropna(axis=0, how='any')

In [14]:
# Count the missing values in each column.
df_pos.isna().sum()

review/text    0
dtype: int64

In [15]:
def tokenize_corpus(corpus):
    """Tokenize each text in ``corpus`` and lowercase every token.

    Args:
        corpus: Iterable of strings; each one is passed to ``word_tokenize``.

    Returns:
        A list with one entry per input text, each entry being the list of
        lowercased tokens produced for that text.
    """
    tokenized = []
    for sent in corpus:
        lowered = [token.lower() for token in word_tokenize(sent)]
        tokenized.append(lowered)
    return tokenized

In [16]:
# Pull the review texts out of the frame as a plain Python list.
sentences = list(df_pos['review/text'])

In [17]:
def split_data(sents, n_chunks=16):
    """Split ``sents`` into consecutive slices of equal length.

    The slice length is ``len(sents) // n_chunks``, clamped to at least 1.
    When ``len(sents)`` is not an exact multiple of that length the final
    slice is shorter, and the number of slices returned can exceed
    ``n_chunks``.

    Args:
        sents: Sliceable sequence (e.g. a list) to split.
        n_chunks: Number of full-size slices to aim for. Defaults to 16.

    Returns:
        A list of slices of ``sents`` in their original order; an empty list
        when ``sents`` is empty.

    Raises:
        ValueError: If ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError('n_chunks must be at least 1, got %r' % (n_chunks,))
    numel = len(sents)
    # Clamp to 1: with fewer items than n_chunks the slice length would be 0
    # and range() rejects a zero step.
    chunk_size = max(1, numel // n_chunks)
    return [sents[i: i + chunk_size] for i in range(0, numel, chunk_size)]

In [18]:
# Partition the raw review texts into slices for the worker pool.
chunks = split_data(sentences)

In [19]:
# Tokenize the chunks in 8 worker processes, then flatten the per-chunk
# results back into a single list of token lists.
with Pool(processes=8) as p:
    sentences = list(
        itertools.chain.from_iterable(p.map(tokenize_corpus, chunks))
    )

In [20]:
# Number of tokenized reviews.
len(sentences)

7872285

In [21]:
# Cache the tokenized reviews on disk so the tokenization step does not have
# to be repeated.
with open('data/temp_pos_sentence.plk', mode='wb') as file_handler:
    pickle.Pickler(file_handler).dump(sentences)

In [4]:
# Restore the cached tokenized reviews.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('data/temp_pos_sentence.plk', mode='rb') as file_handler:
    sentences = pickle.Unpickler(file_handler).load()

# Keep only the first 2,000,000 reviews.
sentences = sentences[:2000000]

In [None]:
# Number of tokenized reviews currently held in memory.
len(sentences)

In [None]:
# Train a Word2Vec model on the tokenized reviews.
# NOTE(review): `size=` and `model.wv.vocab` look like the gensim 3.x
# spellings (gensim 4 uses `vector_size` / `wv.key_to_index`) -- confirm the
# installed gensim version before upgrading.
model = Word2Vec(
    sentences=sentences,
    min_count=5,
    workers=8,
    window=5,
    size=100,
    max_vocab_size=20000,
)
# summarize the trained model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print('Vocab size: %s' % len(words))

# save model
model.save('wv_pos.bin')

# reload the saved model and print its summary
new_model = Word2Vec.load('wv_pos.bin')
print(new_model)