## Preprocess raw texts into integer indices and train word embeddings on them via gensim's word2vec

**Libraries used:**
1. nltk: used to tokenize text

2. gensim: use its models.word2vec to produce word vectors.

**Note: This is summarized in a more concise Python file, Wemb_gensim.py.**

In [1]:
import os
import glob
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import nltk
import numpy
import pickle as pkl
import sys
import multiprocessing

In [2]:
# Strip punctuation/symbols from words
def normalize_text(text):
    """Lower-case *text* and simplify its punctuation before tokenization.

    HTML line breaks (``<br />``) and each of the characters
    ``: " , ( ) ! ? ; *`` are replaced by a single space. Every period gets
    a space inserted in front of it, so it is separated from the word that
    precedes it.
    """
    cleaned = text.lower().replace('<br />', ' ')

    # These symbols are dropped outright (turned into a blank), not padded.
    for symbol in ':",()!?;*':
        cleaned = cleaned.replace(symbol, ' ')

    # Periods are kept, but detached from the preceding word.
    return cleaned.replace('.', ' .')

In [3]:
def read_dataset(path):
    """Read and normalize the first line of every ``*.txt`` file in *path*.

    Args:
        path: Directory containing the text files.

    Returns:
        A list with one string per file: the file's first line, stripped
        and passed through ``normalize_text``. Files appear in the order
        ``glob.glob`` yields them.
    """
    dataset = []
    currdir = os.getcwd()
    os.chdir(path)
    try:
        for ff in glob.glob("*.txt"):
            with open(ff, "r") as f:
                # Only the first line is read; presumably each file holds a
                # single-line review -- TODO confirm for other corpora.
                dataset.append(normalize_text(f.readline().strip()))
    finally:
        # Restore the caller's working directory even if a read fails, so a
        # bad file cannot leave the process stranded inside `path`.
        os.chdir(currdir)
    return dataset

In [4]:
def build_dict(path, Wemb_size=128, iter=10):
    """Train a Word2Vec model on the texts under ``path/pos`` and ``path/neg``.

    Args:
        path: Directory holding ``pos/`` and ``neg/`` subdirectories of
            ``*.txt`` files.
        Wemb_size: Passed to Word2Vec as ``size`` (vector dimensionality).
        iter: Passed to Word2Vec as ``iter`` (training passes over the corpus).

    Returns:
        A dict with keys ``'model'`` (the trained model), ``'tok_sents_pos'``
        and ``'tok_sents_neg'`` (the tokenized texts of each class).
    """
    n_workers = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    pos_texts = read_dataset(path+'/pos/')
    neg_texts = read_dataset(path+'/neg/')
    n_pos = len(pos_texts)

    # Tokenize both classes as one corpus, positives first, so the result
    # can be split back by position afterwards.
    tokenized = [nltk.word_tokenize(text) for text in pos_texts + neg_texts]

    w2v = gensim.models.Word2Vec(
        tokenized,
        min_count=1,  # keep every word, however rare
        size=Wemb_size,
        window=5,
        workers=n_workers,
        iter=iter,
    )

    return {
        'model': w2v,
        'tok_sents_pos': tokenized[:n_pos],
        'tok_sents_neg': tokenized[n_pos:],
    }

### Convert words in text sentences to their corresponding indices in the dictionary

In [5]:
def sentence2idx(tokenized_sentences, model):
    """Map every token of every sentence to its vocabulary index.

    Args:
        tokenized_sentences: Iterable of token lists.
        model: Object exposing ``model.wv.vocab``, a mapping from word to an
            entry with an ``index`` attribute (gensim 3.x Word2Vec layout).

    Returns:
        A list with one 1-D integer ``numpy`` array per sentence. Tokens
        missing from the vocabulary receive the index of ``'.'``.

    Raises:
        KeyError: If an out-of-vocabulary token is met and ``'.'`` is not in
            the vocabulary either.
    """
    vocab = model.wv.vocab
    idx = []
    for tok_sen in tokenized_sentences:
        # Builtin `int` is what the removed `numpy.int` alias stood for, so
        # the resulting dtype is unchanged but this runs on NumPy >= 1.24.
        idx_sent = numpy.zeros(len(tok_sen), dtype=int)
        for i, word in enumerate(tok_sen):
            # Unknown words fall back to the period token; the fallback is
            # looked up lazily so it is only required when actually needed.
            entry = vocab[word] if word in vocab else vocab['.']
            idx_sent[i] = entry.index
        idx.append(idx_sent)
    return idx

### Convert words in the dictionary into embeddings

Strictly speaking, there is no need to build the word-embedding matrix separately, because it is already stored in the trained gensim model and can be accessed via `model.wv.syn0`.

In [6]:
def create_Wemb(model, Wemb_size=128):
    """Copy the model's word vectors into a ``(vocab size, Wemb_size)`` matrix.

    Row ``i`` of the result holds the vector the model returns for
    ``model.wv.index2word[i]``; rows start out as zeros.
    """
    keyed_vectors = model.wv
    n_words = len(keyed_vectors.vocab)
    Wemb = numpy.zeros((n_words, Wemb_size))

    for row in range(n_words):
        vector = keyed_vectors[keyed_vectors.index2word[row]]
        if vector is None:
            # Leave the zero row in place when no vector is returned.
            continue
        Wemb[row] = vector

    return Wemb

In [7]:
# Training-set directory; build_dict reads its pos/ and neg/ subdirectories.
data_path = '../data/Method3/aclImdb/train'

In [8]:
# Train Word2Vec on the training texts; `result` holds the model and the
# tokenized positive / negative sentences.
result = build_dict(data_path)

2018-07-10 09:40:49,192 : INFO : collecting all words and their counts
2018-07-10 09:40:49,193 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-10 09:40:49,716 : INFO : PROGRESS: at sentence #10000, processed 2518483 words, keeping 63047 word types
2018-07-10 09:40:50,297 : INFO : PROGRESS: at sentence #20000, processed 5010758 words, keeping 90480 word types
2018-07-10 09:40:50,599 : INFO : collected 101119 word types from a corpus of 6251170 raw words and 25000 sentences
2018-07-10 09:40:50,601 : INFO : Loading a fresh vocabulary
2018-07-10 09:40:51,110 : INFO : min_count=1 retains 101119 unique words (100% of original 101119, drops 0)
2018-07-10 09:40:51,112 : INFO : min_count=1 leaves 6251170 word corpus (100% of original 6251170, drops 0)
2018-07-10 09:40:51,438 : INFO : deleting the raw counts dictionary of 101119 items
2018-07-10 09:40:51,443 : INFO : sample=0.001 downsamples 48 most-common words
2018-07-10 09:40:51,444 : INFO : downsampling lea

2018-07-10 09:41:59,744 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-10 09:41:59,746 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-10 09:41:59,747 : INFO : training on 62511700 raw words (45598925 effective words) took 66.6s, 684642 effective words/s


In [9]:
# Encode the positive training sentences as arrays of vocabulary indices.
sents_pos = result['tok_sents_pos']
model = result['model']
train_x_pos = sentence2idx(sents_pos, model)

In [10]:
# Sanity check of the trained vectors: list the words most similar to 'good'.
model.wv.most_similar('good')

2018-07-10 09:42:10,793 : INFO : precomputing L2-norms of word weight vectors


[('decent', 0.7672065496444702),
 ('great', 0.7299867868423462),
 ('bad', 0.6878710389137268),
 ('nice', 0.6379164457321167),
 ('cool', 0.630168616771698),
 ('terrific', 0.6098483800888062),
 ('fine', 0.6026772856712341),
 ('solid', 0.6011196970939636),
 ('mediocre', 0.5951859951019287),
 ('funny', 0.5892332196235657)]

In [11]:
# Encode the negative training sentences as index arrays and print the
# first two for inspection.
sents_neg = result['tok_sents_neg']
train_x_neg = sentence2idx(sents_neg, model)
print(train_x_neg[:2])

[array([  792,    16,    29,     4,     0,   118,  1764,  7121,    10,
          19,   983,     5,    28, 20422,     5,     7,    12,  2375,
        1807,   128,  2230,     5,     3,  6987,   300,     1,  2584,
        2314,     0,    19,    36,   484,  5942,    12,  3371,     2,
          39,    12,     3,  1003,   175,    21,    50,   782,     1]), array([   78,     1,     1, 49001,     9,     0,   197,   627,   131,
           8,  3345,     2,     9,   250,     0,    17,   179,   710,
           5,   107,     1,   190,    32, 12560,     5,    99, 10283,
        1478,     2,  3369,     1,  1642,    71,   147,   103,   627,
         168,  4411,   204,   102,    32,  1747,     0,    90,    29,
           9,   375,  2383, 26621,   151,    63,   537,   305,     6,
        2200,     4,  9892,     0,   861,   137,     8,  6074,   397,
          55,    32,   955,   155,    29,     4,   132, 19296, 12672,
           4,     3,    17, 10283,  5509,    61,    23,    28,  1148,
           3,    

In [12]:
# Full training set: positive samples first, then negative ones.
train_x = train_x_pos + train_x_neg

In [13]:
# Labels in the same order as train_x: 1 for each positive sample, 0 for
# each negative sample.
train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)
# print(train_y)

In [14]:
# Read and normalize the positive test texts.
test_path = '../data/Method3/aclImdb/test'
test_sents_pos = read_dataset(test_path+'/pos/')

In [15]:
# Tokenize the positive test texts and encode them with the vocabulary of the
# model trained above (sentence2idx maps unknown words to the index of '.').
tok_test_sents_pos = [nltk.word_tokenize(sent) for sent in test_sents_pos]
test_x_pos = sentence2idx(tok_test_sents_pos, model)

In [16]:
# Read, tokenize and encode the negative test texts, then assemble the test
# set: positives first, with labels 1 (positive) and 0 (negative) in the
# same order.
test_sents_neg = read_dataset(test_path+'/neg/')
tok_test_sents_neg = [nltk.word_tokenize(sent) for sent in test_sents_neg]
test_x_neg = sentence2idx(tok_test_sents_neg, model)
test_x = test_x_pos + test_x_neg
test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

### Save converted sentence indices to a file

In [17]:
# Two objects are pickled back to back into one file: first the training
# tuple, then the test tuple. Readers must call pkl.load twice, in that order.
# The `with` block closes the file even if pickling fails part-way.
with open('../data/Method1_and_2/gensim/gensim_imdb.pkl', 'wb') as f:
    pkl.dump((train_x, train_y), f, -1)  # -1 selects the highest protocol
    pkl.dump((test_x, test_y), f, -1)

In [18]:
# Read the two pickled tuples back in the order they were written.
# NOTE: only unpickle files produced by this notebook; pickle is unsafe on
# untrusted input.
with open('../data/Method1_and_2/gensim/gensim_imdb.pkl', 'rb') as f:
    train_set = pkl.load(f)
    test_set = pkl.load(f)

### Save word embeddings to a file

In [19]:
# Build the embedding matrix: row i is the vector of the word with index i.
Wemb = create_Wemb(model)

In [20]:
# Persist the embedding matrix; the `with` block closes the file even if
# pickling fails.
with open('../data/Method1_and_2/gensim/gensim_imdb_Wemb.pkl', 'wb') as f:
    pkl.dump(Wemb, f, -1)  # -1 selects the highest protocol

In [21]:
# Reload the embedding matrix from disk to check the round trip.
# NOTE: only unpickle files produced by this notebook; pickle is unsafe on
# untrusted input.
with open('../data/Method1_and_2/gensim/gensim_imdb_Wemb.pkl', 'rb') as f:
    Wemb_file = pkl.load(f)

In [22]:
# Display the in-memory embedding matrix.
print(Wemb)

[[ -1.92171264e+00  -3.81147116e-01  -1.29860923e-01 ...,  -2.91345835e-01
   -1.78681016e+00  -7.31336296e-01]
 [ -8.08200777e-01   4.21601906e-02   3.02510470e-01 ...,   2.95328256e-03
   -6.02169633e-01  -4.39393252e-01]
 [ -8.35283518e-01  -1.01795304e+00  -7.26746678e-01 ...,   5.62283099e-01
   -2.69957393e-01   4.16467078e-02]
 ..., 
 [  3.78807150e-02  -1.21285790e-03  -3.67050283e-02 ...,   8.16808548e-03
   -3.65836434e-02   7.64383702e-03]
 [  5.21445349e-02  -1.07249478e-02  -6.36772141e-02 ...,  -4.69883308e-02
   -2.36288421e-02   1.54223349e-02]
 [ -3.80863249e-02   4.27303603e-03  -6.16931077e-03 ...,  -2.31780261e-02
   -3.32884826e-02   3.24734516e-04]]


In [23]:
# Display the matrix reloaded from the pickle file; it should match Wemb.
print(Wemb_file)

[[ -1.92171264e+00  -3.81147116e-01  -1.29860923e-01 ...,  -2.91345835e-01
   -1.78681016e+00  -7.31336296e-01]
 [ -8.08200777e-01   4.21601906e-02   3.02510470e-01 ...,   2.95328256e-03
   -6.02169633e-01  -4.39393252e-01]
 [ -8.35283518e-01  -1.01795304e+00  -7.26746678e-01 ...,   5.62283099e-01
   -2.69957393e-01   4.16467078e-02]
 ..., 
 [  3.78807150e-02  -1.21285790e-03  -3.67050283e-02 ...,   8.16808548e-03
   -3.65836434e-02   7.64383702e-03]
 [  5.21445349e-02  -1.07249478e-02  -6.36772141e-02 ...,  -4.69883308e-02
   -2.36288421e-02   1.54223349e-02]
 [ -3.80863249e-02   4.27303603e-03  -6.16931077e-03 ...,  -2.31780261e-02
   -3.32884826e-02   3.24734516e-04]]


### Include all above into a function

In [24]:
def train_gensim_w2vec(path, out_dir='../data/Method1_and_2/gensim'):
    """Train Word2Vec on the corpus under *path* and save all artifacts.

    Args:
        path: Corpus root containing ``train/`` and ``test/`` directories,
            each with ``pos/`` and ``neg/`` subdirectories. A trailing
            path separator is optional.
        out_dir: Existing directory that receives three files:
            ``gensim_imdb.pkl`` (two pickled tuples written back to back,
            ``(train_x, train_y)`` then ``(test_x, test_y)``),
            ``gensim_imdb_Wemb.pkl`` (the embedding matrix from
            ``create_Wemb``) and ``imdb_gensim_w2vmodel`` (the saved model).

    Returns:
        None. The results are written to disk only.
    """
    # os.path.join works whether or not `path` ends with a separator.
    result = build_dict(os.path.join(path, 'train'))
    model = result['model']

    train_x_pos = sentence2idx(result['tok_sents_pos'], model)
    train_x_neg = sentence2idx(result['tok_sents_neg'], model)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    def encode_test_split(label_dir):
        # Read, tokenize and index one test class with the trained vocabulary.
        sents = read_dataset(os.path.join(path, 'test', label_dir))
        tokenized = [nltk.word_tokenize(sent) for sent in sents]
        return sentence2idx(tokenized, model)

    test_x_pos = encode_test_split('pos')
    test_x_neg = encode_test_split('neg')
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    # Context managers close the files even if pickling fails part-way.
    with open(os.path.join(out_dir, 'gensim_imdb.pkl'), 'wb') as f:
        pkl.dump((train_x, train_y), f, -1)
        pkl.dump((test_x, test_y), f, -1)

    Wemb = create_Wemb(model)
    with open(os.path.join(out_dir, 'gensim_imdb_Wemb.pkl'), 'wb') as f:
        pkl.dump(Wemb, f, -1)

    model.save(os.path.join(out_dir, 'imdb_gensim_w2vmodel'))

In [25]:
# Run the whole pipeline on the aclImdb directory, which is expected to hold
# train/ and test/ subdirectories, each with pos/ and neg/.
path = '../data/Method3/aclImdb/'
train_gensim_w2vec(path)

2018-07-10 09:44:59,609 : INFO : collecting all words and their counts
2018-07-10 09:44:59,611 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-10 09:45:00,369 : INFO : PROGRESS: at sentence #10000, processed 2518483 words, keeping 63047 word types
2018-07-10 09:45:01,003 : INFO : PROGRESS: at sentence #20000, processed 5010758 words, keeping 90480 word types
2018-07-10 09:45:01,312 : INFO : collected 101119 word types from a corpus of 6251170 raw words and 25000 sentences
2018-07-10 09:45:01,314 : INFO : Loading a fresh vocabulary
2018-07-10 09:45:02,048 : INFO : min_count=1 retains 101119 unique words (100% of original 101119, drops 0)
2018-07-10 09:45:02,049 : INFO : min_count=1 leaves 6251170 word corpus (100% of original 6251170, drops 0)
2018-07-10 09:45:02,534 : INFO : deleting the raw counts dictionary of 101119 items
2018-07-10 09:45:02,540 : INFO : sample=0.001 downsamples 48 most-common words
2018-07-10 09:45:02,544 : INFO : downsampling lea

2018-07-10 09:46:13,488 : INFO : PROGRESS: at 93.25% examples, 620093 words/s, in_qsize 7, out_qsize 0
2018-07-10 09:46:14,489 : INFO : PROGRESS: at 94.38% examples, 618694 words/s, in_qsize 7, out_qsize 0
2018-07-10 09:46:15,495 : INFO : PROGRESS: at 95.76% examples, 618879 words/s, in_qsize 7, out_qsize 0
2018-07-10 09:46:16,511 : INFO : PROGRESS: at 97.32% examples, 619810 words/s, in_qsize 7, out_qsize 0
2018-07-10 09:46:17,514 : INFO : PROGRESS: at 98.19% examples, 616762 words/s, in_qsize 6, out_qsize 1
2018-07-10 09:46:18,518 : INFO : PROGRESS: at 99.64% examples, 617200 words/s, in_qsize 5, out_qsize 2
2018-07-10 09:46:18,784 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-07-10 09:46:18,800 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-07-10 09:46:18,802 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-07-10 09:46:18,804 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-07-10 09:46:1

### Check that the embeddings from the gensim model equal the word embeddings stored in the file, to verify that the embeddings were saved correctly.

In [26]:
def compare_idxwemb_wordemb(sentence_idx):
    """Check that the pickled embedding rows equal the saved model's vectors.

    Loads the saved Word2Vec model and the pickled embedding matrix from
    their fixed locations under ``../data/Method1_and_2/gensim/``.

    Args:
        sentence_idx: Sequence of vocabulary indices (one encoded sentence).

    Returns:
        A true value if, for every index in *sentence_idx*, the matching row
        of the pickled matrix equals the vector the saved model returns for
        the word at that index; a false value otherwise.
    """
    model_path = '../data/Method1_and_2/gensim/imdb_gensim_w2vmodel'
    model = gensim.models.Word2Vec.load(model_path)
    # Index -> word -> vector, going through the model's own lookup tables.
    sentence_words = [model.wv[model.wv.index2word[x]] for x in sentence_idx]

    # NOTE: only unpickle files produced by this notebook; pickle is unsafe
    # on untrusted input.
    with open('../data/Method1_and_2/gensim/gensim_imdb_Wemb.pkl', 'rb') as f:
        Wemb = pkl.load(f)
    emb_x = Wemb[sentence_idx]

    # Plain ndarrays replace numpy.matrix, which NumPy no longer recommends.
    return (numpy.asarray(sentence_words) == numpy.asarray(emb_x)).all()

In [27]:
print(train_x_neg[0])
compare_idxwemb_wordemb(train_x_neg[0])

2018-07-10 09:49:00,239 : INFO : loading Word2Vec object from ../data/Method1_and_2/gensim/imdb_gensim_w2vmodel


[  792    16    29     4     0   118  1764  7121    10    19   983     5
    28 20422     5     7    12  2375  1807   128  2230     5     3  6987
   300     1  2584  2314     0    19    36   484  5942    12  3371     2
    39    12     3  1003   175    21    50   782     1]


2018-07-10 09:49:01,312 : INFO : loading wv recursively from ../data/Method1_and_2/gensim/imdb_gensim_w2vmodel.wv.* with mmap=None
2018-07-10 09:49:01,314 : INFO : loading syn0 from ../data/Method1_and_2/gensim/imdb_gensim_w2vmodel.wv.syn0.npy with mmap=None
2018-07-10 09:49:01,367 : INFO : setting ignored attribute syn0norm to None
2018-07-10 09:49:01,371 : INFO : loading syn1neg from ../data/Method1_and_2/gensim/imdb_gensim_w2vmodel.syn1neg.npy with mmap=None
2018-07-10 09:49:01,442 : INFO : setting ignored attribute cum_table to None
2018-07-10 09:49:01,445 : INFO : loaded ../data/Method1_and_2/gensim/imdb_gensim_w2vmodel


True