## Preprocess raw texts into integer indices and train word embeddings on them with gensim's word2vec

**Libraries used:**
1. nltk: used to tokenize text

2. gensim: use its models.word2vec to produce word vectors.

**Note: This is summarized in a more concise Python file, `Wemb_gensim.py`.**

In [31]:
import os
import glob
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import nltk
import numpy
import pickle as pkl
import sys
import multiprocessing

In [32]:
def normalize_text(text):
    """Lower-case a review and simplify its punctuation.

    The HTML line break ``<br />`` and each of ``: " , ( ) ! ? ; *`` are
    replaced by a single space, and every ``.`` gets one space inserted in
    front of it so that it is detached from the preceding word.

    Returns the normalized string.
    """
    cleaned = text.lower().replace('<br />', ' ')

    # Each of these symbols is swapped for one space (it is not kept).
    for symbol in ':",()!?;*':
        cleaned = cleaned.replace(symbol, ' ')

    # Full stops are kept, but separated from the word before them.
    return cleaned.replace('.', ' .')

In [33]:
def read_dataset(path):
    """Read and normalize the first line of every ``*.txt`` file in ``path``.

    Files are visited in sorted name order so that the returned list is
    reproducible across runs and file systems.

    Args:
        path: directory that directly contains the ``*.txt`` files.

    Returns:
        A list with one ``normalize_text``-processed string per file.

    Raises:
        OSError: if ``path`` is not an existing directory.
    """
    # Fail loudly on a wrong path instead of silently returning an empty
    # dataset (the previous os.chdir-based version raised here as well).
    if not os.path.isdir(path):
        raise OSError("dataset directory not found: %r" % (path,))

    dataset = []
    # Globbing on the joined path avoids os.chdir(), which changed the
    # process-wide working directory and did not restore it on error.
    for filename in sorted(glob.glob(os.path.join(path, "*.txt"))):
        with open(filename, "r") as f:
            # Only the first line of each file is read.
            line_txt = f.readline().strip()
        dataset.append(normalize_text(line_txt))
    return dataset

In [34]:
def _ascii_text(sent):
    """Return ``sent`` as a native ``str`` holding only ASCII characters.

    Accepts UTF-8 encoded bytes (Python 2 ``str``) as well as text
    (Python 3 ``str``); characters outside ASCII are dropped.
    """
    if isinstance(sent, bytes):
        sent = sent.decode("utf8")
    ascii_bytes = sent.encode("ascii", "ignore")
    # On Python 2 ``str`` is ``bytes`` already; on Python 3 convert back to
    # text so the tokenizer receives a ``str``.
    if str is bytes:
        return ascii_bytes
    return ascii_bytes.decode("ascii")


def build_dict(path, Wemb_size=128, iter=10):
    """Train a word2vec model on the reviews found under ``path``.

    Args:
        path: directory containing ``pos/`` and ``neg/`` sub-directories
            of ``*.txt`` review files.
        Wemb_size: dimensionality of the trained word vectors.
        iter: number of training epochs (the name shadows the builtin but
            is kept so existing keyword callers keep working).

    Returns:
        A dict with the trained ``'model'`` and the tokenized sentences
        split back into ``'tok_sents_pos'`` and ``'tok_sents_neg'``.
    """
    cores = multiprocessing.cpu_count()
    # Check the compiled routines of the module that is actually trained.
    assert gensim.models.word2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    sentences_pos = read_dataset(path+'/pos/')
    sentences_neg = read_dataset(path+'/neg/')

    sentences_train = sentences_pos + sentences_neg
    tokenized_sentences = [nltk.word_tokenize(_ascii_text(sent)) for sent in sentences_train]

    # min_count=1 keeps every word in the vocabulary. ``size``/``iter`` are
    # the gensim 3.x keyword names, matching the ``wv.vocab`` API used by
    # the rest of this file.
    model = gensim.models.Word2Vec(tokenized_sentences, min_count=1,
                                   size=Wemb_size, window=5,
                                   workers=cores, iter=iter)

    # Positive sentences were placed first, so split at their count.
    n_pos = len(sentences_pos)
    tok_sents_pos = tokenized_sentences[:n_pos]
    tok_sents_neg = tokenized_sentences[n_pos:]

    return {'model': model, 'tok_sents_pos': tok_sents_pos, 'tok_sents_neg': tok_sents_neg}

### Convert words in text sentences to their corresponding indices in the dictionary

In [35]:
def sentence2idx(tokenized_sentences, model):
    """Map every token of every sentence to its index in the model vocabulary.

    Args:
        tokenized_sentences: iterable of token lists.
        model: trained model exposing ``model.wv.vocab``, a mapping from
            word to an entry with an ``index`` attribute.

    Returns:
        A list with one 1-D integer numpy array per sentence. Words missing
        from the vocabulary receive the index of ``'.'``.

    Raises:
        KeyError: if an unknown word is met and ``'.'`` is not in the
            vocabulary either.
    """
    vocab = model.wv.vocab
    idx = []
    for tok_sen in tokenized_sentences:
        # The builtin ``int`` is the dtype the removed ``numpy.int`` alias
        # used to stand for.
        idx_sent = numpy.zeros(len(tok_sen), dtype=int)
        for i, word in enumerate(tok_sen):
            # Out-of-vocabulary words fall back to the '.' token.
            key = word if word in vocab else '.'
            idx_sent[i] = vocab[key].index
        idx.append(idx_sent)
    return idx

### Convert words in the dictionary into embeddings

Strictly speaking, there is no need to build the word-embedding matrix separately, because it is already part of the trained gensim model and can be accessed via `model.wv.syn0`.

In [36]:
def create_Wemb(model, Wemb_size=128):
    """Copy the model's word vectors into a dense numpy matrix.

    Row ``i`` of the result holds the vector of ``model.wv.index2word[i]``.

    Args:
        model: trained model exposing ``wv.vocab``, ``wv.index2word`` and
            item access ``wv[word]``.
        Wemb_size: number of columns; must equal the vector length.

    Returns:
        A ``(len(model.wv.vocab), Wemb_size)`` array created by
        ``numpy.zeros``.
    """
    keyed_vectors = model.wv
    n_words = len(keyed_vectors.vocab)
    Wemb = numpy.zeros((n_words, Wemb_size))
    for row in range(n_words):
        vector = keyed_vectors[keyed_vectors.index2word[row]]
        if vector is None:
            # Leave the row at zero when no vector is available.
            continue
        Wemb[row] = vector
    return Wemb

In [37]:
# Training split of the IMDB data; build_dict reads its pos/ and neg/ sub-directories.
data_path = '../data/aclImdb/train'

In [38]:
import nltk
# Fetch the 'punkt' tokenizer data used by nltk.word_tokenize (no-op if already present).
nltk.download('punkt')
# Train word2vec on the training reviews; returns the model plus the tokenized pos/neg sentences.
result = build_dict(data_path)

[nltk_data] Downloading package punkt to /Users/lifa08/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2018-07-12 15:44:25,902 : INFO : collecting all words and their counts
2018-07-12 15:44:25,903 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-12 15:44:26,854 : INFO : PROGRESS: at sentence #10000, processed 2518094 words, keeping 62658 word types
2018-07-12 15:44:27,847 : INFO : PROGRESS: at sentence #20000, processed 5010152 words, keeping 89834 word types
2018-07-12 15:44:28,313 : INFO : collected 100352 word types from a corpus of 6250510 raw words and 25000 sentences
2018-07-12 15:44:28,315 : INFO : Loading a fresh vocabulary
2018-07-12 15:44:29,472 : INFO : effective_min_count=1 retains 100352 unique words (100% of original 100352, drops 0)
2018-07-12 15:44:29,473 : INFO : effective_min_count=1 leaves 6250510 word corpus (100% of original 6250510, drops 0)
2018-07-12 15:44:29,822 : INFO : deleting the raw counts dictionary of 100352 items
2018-07-12 15:44:29,832 : INFO : sample=0.001 downsamples 48 most-common words
2018-07-12 15:44:29,833 : INF

2018-07-12 15:45:17,724 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-12 15:45:17,732 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-12 15:45:17,734 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-12 15:45:17,736 : INFO : EPOCH - 5 : training on 6250510 raw words (4558033 effective words) took 10.6s, 430475 effective words/s
2018-07-12 15:45:18,750 : INFO : EPOCH 6 - PROGRESS: at 11.29% examples, 515455 words/s, in_qsize 8, out_qsize 0
2018-07-12 15:45:19,753 : INFO : EPOCH 6 - PROGRESS: at 25.70% examples, 587948 words/s, in_qsize 8, out_qsize 0
2018-07-12 15:45:20,756 : INFO : EPOCH 6 - PROGRESS: at 42.45% examples, 646963 words/s, in_qsize 8, out_qsize 0
2018-07-12 15:45:21,757 : INFO : EPOCH 6 - PROGRESS: at 56.58% examples, 647680 words/s, in_qsize 7, out_qsize 0
2018-07-12 15:45:22,767 : INFO : EPOCH 6 - PROGRESS: at 71.23% examples, 648278 words/s, in_qsize 8, out_qsize 1
2018-07-12 15:45:23,769 

In [39]:
# Unpack the trained model and convert the positive training reviews to vocabulary indices.
sents_pos = result['tok_sents_pos']
model = result['model']
train_x_pos = sentence2idx(sents_pos, model)

In [40]:
# Sanity check of the embeddings: list the words whose vectors are closest to 'good'.
model.wv.most_similar('good')

2018-07-12 15:45:50,905 : INFO : precomputing L2-norms of word weight vectors


[('decent', 0.7668520212173462),
 ('great', 0.715586245059967),
 ('bad', 0.6942986845970154),
 ('cool', 0.6417878866195679),
 ('nice', 0.6319639086723328),
 ('fine', 0.6245481967926025),
 ('solid', 0.59840989112854),
 ('lousy', 0.5939891338348389),
 ('ok', 0.5814189910888672),
 ('terrific', 0.5808237195014954)]

In [41]:
# Same conversion for the negative training reviews; show the first two index arrays.
sents_neg = result['tok_sents_neg']
train_x_neg = sentence2idx(sents_neg, model)
print(train_x_neg[:2])

[array([  792,    16,    29,     4,     0,   118,  1755,  7082,    10,
          19,   985,     5,    28, 19990,     5,     7,    12,  2375,
        1807,   128,  2235,     5,     3,  6995,   299,     1,  2582,
        2315,     0,    19,    36,   483,  5888,    12,  3361,     2,
          39,    12,     3,  1006,   175,    21,    50,   782,     1]), array([   78,     1,     1, 51922,     9,     0,   197,   624,   131,
           8,  3335,     2,     9,   250,     0,    17,   179,   710,
           5,   107,     1,   190,    32, 12474,     5,    99, 10177,
        1477,     2,  3363,     1,  1644,    71,   147,   103,   624,
         168,  4411,   204,   102,    32,  1751,     0,    90,    29,
           9,   375,  2381, 25259,   151,    63,   536,   305,     6,
        2193,     4,  9988,     0,   861,   137,     8,  6084,   397,
          55,    32,   955,   155,    29,     4,   132, 18943, 12641,
           4,     3,    17, 10177,  5479,    61,    23,    28,  1146,
           3,    

In [42]:
# Positive reviews first, then negative ones; the labels must follow the same order.
train_x = train_x_pos + train_x_neg

In [43]:
# Labels aligned with train_x: 1 for each positive review, 0 for each negative one.
train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)
# print(train_y)

In [44]:
# Load and normalize the positive reviews of the test split.
test_path = '../data/aclImdb/test'
test_sents_pos = read_dataset(test_path+'/pos/')

In [45]:
# Tokenize the positive test reviews (non-ASCII characters are dropped) and map them to
# indices with the vocabulary of the model trained on the training split.
# NOTE(review): sent.decode(...) assumes read_dataset returns byte strings, as on Python 2;
# a Python 3 str has no .decode - confirm the intended interpreter.
tok_test_sents_pos = [nltk.word_tokenize(sent.decode("utf8").encode('ascii', 'ignore')) for sent in test_sents_pos]
test_x_pos = sentence2idx(tok_test_sents_pos, model)

In [46]:
# Repeat loading, tokenizing and index conversion for the negative test reviews.
# NOTE(review): sent.decode(...) assumes byte strings (Python 2) - confirm the intended interpreter.
test_sents_neg = read_dataset(test_path+'/neg/')
tok_test_sents_neg = [nltk.word_tokenize(sent.decode("utf8").encode('ascii', 'ignore')) for sent in test_sents_neg]
test_x_neg = sentence2idx(tok_test_sents_neg, model)
# Assemble the test set: positives first, labelled 1; negatives after, labelled 0.
test_x = test_x_pos + test_x_neg
test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

### Save converted sentence indices to a file

In [47]:
# Write the train and test sets as two consecutive pickles in one file, using the
# highest pickle protocol (-1). The context manager closes the file even if a dump fails.
with open('../data/gensim/gensim_imdb.pkl', 'wb') as f:
    pkl.dump((train_x, train_y), f, -1)
    pkl.dump((test_x, test_y), f, -1)

In [48]:
# Read the two pickles back in the order they were written; the context manager
# closes the file even if loading fails.
with open('../data/gensim/gensim_imdb.pkl', 'rb') as f:
    train_set = pkl.load(f)
    test_set = pkl.load(f)

### Save word embeddings to a file

In [49]:
# Copy the model's word vectors into a numpy matrix with one row per vocabulary word.
Wemb = create_Wemb(model)

In [50]:
# Pickle the embedding matrix with the highest protocol (-1); the context manager
# closes the file even if the dump fails.
with open('../data/gensim/gensim_imdb_Wemb.pkl', 'wb') as f:
    pkl.dump(Wemb, f, -1)

In [51]:
# Load the embedding matrix back from disk so it can be compared with the in-memory one.
with open('../data/gensim/gensim_imdb_Wemb.pkl', 'rb') as f:
    Wemb_file = pkl.load(f)

In [52]:
# Show the in-memory embedding matrix.
print(Wemb)

[[ 7.14067876e-01 -1.49158609e+00  9.64979351e-01 ... -9.61133957e-01
   1.75469482e+00 -1.77968249e-01]
 [-6.63630664e-01 -2.52432644e-01  8.86730194e-01 ... -1.50446981e-01
  -2.50875622e-01 -4.96376812e-01]
 [-6.16630793e-01 -2.99852312e-01  1.09431922e+00 ... -1.22701669e+00
  -1.18284440e+00 -1.39343485e-01]
 ...
 [ 2.42219102e-02  3.26500684e-02 -3.38389613e-02 ... -1.85415484e-02
  -3.49518023e-02  1.92117393e-02]
 [-3.34197544e-02 -8.84763058e-03 -5.36686322e-03 ...  1.86094474e-02
   2.37452965e-02 -1.52655382e-04]
 [ 4.41411696e-02  8.57123360e-02 -3.84849380e-03 ... -2.65482161e-02
  -2.57179160e-02  3.27176750e-02]]


In [53]:
# Show the matrix loaded back from the pickle file.
print(Wemb_file)

[[ 7.14067876e-01 -1.49158609e+00  9.64979351e-01 ... -9.61133957e-01
   1.75469482e+00 -1.77968249e-01]
 [-6.63630664e-01 -2.52432644e-01  8.86730194e-01 ... -1.50446981e-01
  -2.50875622e-01 -4.96376812e-01]
 [-6.16630793e-01 -2.99852312e-01  1.09431922e+00 ... -1.22701669e+00
  -1.18284440e+00 -1.39343485e-01]
 ...
 [ 2.42219102e-02  3.26500684e-02 -3.38389613e-02 ... -1.85415484e-02
  -3.49518023e-02  1.92117393e-02]
 [-3.34197544e-02 -8.84763058e-03 -5.36686322e-03 ...  1.86094474e-02
   2.37452965e-02 -1.52655382e-04]
 [ 4.41411696e-02  8.57123360e-02 -3.84849380e-03 ... -2.65482161e-02
  -2.57179160e-02  3.27176750e-02]]


### Wrap all of the above in a single function

In [54]:
def _tokenize_sentences(sentences):
    """Tokenize each sentence with nltk after dropping non-ASCII characters.

    Works for UTF-8 byte strings (Python 2) and for text (Python 3).
    """
    tokenized = []
    for sent in sentences:
        if isinstance(sent, bytes):
            sent = sent.decode("utf8")
        ascii_sent = sent.encode("ascii", "ignore")
        if str is not bytes:
            # Python 3: hand the tokenizer text rather than bytes.
            ascii_sent = ascii_sent.decode("ascii")
        tokenized.append(nltk.word_tokenize(ascii_sent))
    return tokenized


def train_gensim_w2vec(path, out_dir='../data/gensim'):
    """Run the whole pipeline: train word2vec, index the data, save results.

    Args:
        path: dataset root containing ``train/{pos,neg}`` and
            ``test/{pos,neg}``; a trailing slash is optional.
        out_dir: existing directory that receives the output files.

    Writes three files into ``out_dir``:
        ``gensim_imdb.pkl``: ``(train_x, train_y)`` followed by
            ``(test_x, test_y)`` as two consecutive pickles.
        ``gensim_imdb_Wemb.pkl``: the embedding matrix from ``create_Wemb``.
        ``imdb_gensim_w2vmodel``: the saved gensim model.

    Returns:
        None.
    """
    result = build_dict(os.path.join(path, 'train'))
    model = result['model']

    # Training set: positives first (label 1), then negatives (label 0).
    train_x_pos = sentence2idx(result['tok_sents_pos'], model)
    train_x_neg = sentence2idx(result['tok_sents_neg'], model)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    # Test set, indexed with the vocabulary learned from the training set.
    test_sents_pos = read_dataset(os.path.join(path, 'test', 'pos'))
    test_x_pos = sentence2idx(_tokenize_sentences(test_sents_pos), model)

    test_sents_neg = read_dataset(os.path.join(path, 'test', 'neg'))
    test_x_neg = sentence2idx(_tokenize_sentences(test_sents_neg), model)

    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # Context managers close the files even if pickling raises.
    with open(os.path.join(out_dir, 'gensim_imdb.pkl'), 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)

    Wemb = create_Wemb(model)
    with open(os.path.join(out_dir, 'gensim_imdb_Wemb.pkl'), 'wb') as f:
        pkl.dump(Wemb, f, -1)

    model.save(os.path.join(out_dir, 'imdb_gensim_w2vmodel'))

In [55]:
# Run the whole pipeline on the IMDB data root (note the trailing slash on the path).
path = '../data/aclImdb/'
train_gensim_w2vec(path)

2018-07-12 15:48:48,998 : INFO : collecting all words and their counts
2018-07-12 15:48:48,999 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-12 15:48:50,007 : INFO : PROGRESS: at sentence #10000, processed 2518094 words, keeping 62658 word types
2018-07-12 15:48:51,060 : INFO : PROGRESS: at sentence #20000, processed 5010152 words, keeping 89834 word types
2018-07-12 15:48:51,565 : INFO : collected 100352 word types from a corpus of 6250510 raw words and 25000 sentences
2018-07-12 15:48:51,567 : INFO : Loading a fresh vocabulary
2018-07-12 15:48:52,950 : INFO : effective_min_count=1 retains 100352 unique words (100% of original 100352, drops 0)
2018-07-12 15:48:52,951 : INFO : effective_min_count=1 leaves 6250510 word corpus (100% of original 6250510, drops 0)
2018-07-12 15:48:53,391 : INFO : deleting the raw counts dictionary of 100352 items
2018-07-12 15:48:53,402 : INFO : sample=0.001 downsamples 48 most-common words
2018-07-12 15:48:53,405 : INF

2018-07-12 15:49:35,356 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-12 15:49:35,357 : INFO : EPOCH - 6 : training on 6250510 raw words (4558231 effective words) took 7.2s, 630941 effective words/s
2018-07-12 15:49:36,370 : INFO : EPOCH 7 - PROGRESS: at 9.88% examples, 455273 words/s, in_qsize 8, out_qsize 0
2018-07-12 15:49:37,380 : INFO : EPOCH 7 - PROGRESS: at 23.77% examples, 541906 words/s, in_qsize 8, out_qsize 0
2018-07-12 15:49:38,394 : INFO : EPOCH 7 - PROGRESS: at 37.69% examples, 570826 words/s, in_qsize 7, out_qsize 0
2018-07-12 15:49:39,396 : INFO : EPOCH 7 - PROGRESS: at 53.77% examples, 613900 words/s, in_qsize 7, out_qsize 0
2018-07-12 15:49:40,400 : INFO : EPOCH 7 - PROGRESS: at 71.56% examples, 650627 words/s, in_qsize 7, out_qsize 0
2018-07-12 15:49:41,414 : INFO : EPOCH 7 - PROGRESS: at 89.01% examples, 673942 words/s, in_qsize 7, out_qsize 1
2018-07-12 15:49:41,980 : INFO : worker thread finished; awaiting finish of 3 more threads
2018

### Check that the embeddings from the gensim model equal the word embeddings stored in the file, to verify that the word embeddings were saved correctly.

In [56]:
def compare_idxwemb_wordemb(sentence_idx):
    """Check that the saved embedding matrix agrees with the saved model.

    For every vocabulary index in ``sentence_idx`` the vector looked up by
    word in the saved gensim model is compared with the matching row of
    the pickled embedding matrix.

    Args:
        sentence_idx: sequence (list or 1-D integer array) of vocabulary
            indices.

    Returns:
        True if all compared vectors are exactly equal, or if
        ``sentence_idx`` is empty; False otherwise.
    """
    # Nothing to compare: vacuously consistent, and no files need loading.
    if len(sentence_idx) == 0:
        return True

    model_path = '../data/gensim/imdb_gensim_w2vmodel'
    model = gensim.models.Word2Vec.load(model_path)
    index2word = model.wv.index2word
    sentence_words = [model.wv[index2word[x]] for x in sentence_idx]

    # NOTE: pickle must only be used on trusted files; this one is written
    # by this notebook itself.
    with open('../data/gensim/gensim_imdb_Wemb.pkl', 'rb') as f:
        Wemb = pkl.load(f)
    emb_x = Wemb[sentence_idx]

    # array_equal compares shape and every element, replacing the
    # element-wise comparison of two numpy.matrix objects.
    return bool(numpy.array_equal(numpy.asarray(sentence_words), numpy.asarray(emb_x)))

In [57]:
# Print the indices of the first negative training review and check that the saved
# embedding matrix and the saved model give the same vectors for them.
print(train_x_neg[0])
compare_idxwemb_wordemb(train_x_neg[0])

2018-07-12 15:51:42,157 : INFO : loading Word2Vec object from ../data/gensim/imdb_gensim_w2vmodel


[  792    16    29     4     0   118  1755  7082    10    19   985     5
    28 19990     5     7    12  2375  1807   128  2235     5     3  6995
   299     1  2582  2315     0    19    36   483  5888    12  3361     2
    39    12     3  1006   175    21    50   782     1]


2018-07-12 15:51:42,839 : INFO : loading vocabulary recursively from ../data/gensim/imdb_gensim_w2vmodel.vocabulary.* with mmap=None
2018-07-12 15:51:42,841 : INFO : loading wv recursively from ../data/gensim/imdb_gensim_w2vmodel.wv.* with mmap=None
2018-07-12 15:51:42,843 : INFO : loading vectors from ../data/gensim/imdb_gensim_w2vmodel.wv.vectors.npy with mmap=None
2018-07-12 15:51:42,917 : INFO : setting ignored attribute vectors_norm to None
2018-07-12 15:51:42,922 : INFO : loading trainables recursively from ../data/gensim/imdb_gensim_w2vmodel.trainables.* with mmap=None
2018-07-12 15:51:42,925 : INFO : loading syn1neg from ../data/gensim/imdb_gensim_w2vmodel.trainables.syn1neg.npy with mmap=None
2018-07-12 15:51:42,986 : INFO : setting ignored attribute cum_table to None
2018-07-12 15:51:42,991 : INFO : loaded ../data/gensim/imdb_gensim_w2vmodel


True