In [73]:
import os
import glob
import gensim, logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import nltk
import numpy
import pickle as pkl
import sys
import multiprocessing

In [48]:
# Strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')

    # Pad punctuation with spaces on both sides
    for char in [':', '"', ',', '(', ')', '!', '?', ';', '*']:
        norm_text = norm_text.replace(char, ' ')

    norm_text = norm_text.replace('.', ' ' + '.')
    return norm_text

In [66]:
def read_dataset(path):
    if sys.version > '3':
        control_chars = [chr(0x85)]
    else:
        control_chars = [unichr(0x85)]

    dataset = []
    currdir = os.getcwd()
    os.chdir(path)
    #i = 0
    for ff in glob.glob("*.txt"):
        #i += 1
        with open(ff, "r") as f:
            line_txt = f.readline().strip()
            line_norm = normalize_text(line_txt)
            dataset.append(line_norm)
        #if(i==100):
            #break
    os.chdir(currdir)
    return dataset

In [94]:
def build_dict(path, Wemb_size=128, iter=10):
    cores = multiprocessing.cpu_count()
    assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

    sentences_pos = read_dataset(path+'/pos/')
    sentences_neg = read_dataset(path+'/neg/')
    
    sentences_train = sentences_pos + sentences_neg
    tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences_train]

    model = gensim.models.Word2Vec(tokenized_sentences, min_count=1,
                                   size=Wemb_size, window=5,
                                   workers=cores, iter=iter)

    tok_sents_pos = tokenized_sentences[:len(sentences_pos)]
    tok_sents_neg = tokenized_sentences[len(sentences_pos):]

    return {'model': model, 'tok_sents_pos': tok_sents_pos, 'tok_sents_neg': tok_sents_neg}

In [69]:
def sentence2idx(tokenized_sentences, model):
    idx = []
    for tok_sen in tokenized_sentences:
        idx_sent = numpy.zeros(len(tok_sen), dtype=numpy.int)
        for i, word in enumerate(tok_sen):
            if word in model.wv.vocab:
                idx_sent[i] = model.wv.vocab[word].index
            else:
                idx_sent[i] = model.wv.vocab['.'].index
        idx.append(idx_sent)
    return idx

In [70]:
def create_Wemb(model, Wemb_size=128):
    Wemb = numpy.zeros((len(model.wv.vocab), Wemb_size))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            Wemb[i] = embedding_vector
    return Wemb

In [71]:
data_path = '/Users/lifa08/Documents/Lifa/Machine_Learning/Miniproject_test/aclImdb/train'

In [95]:
result = build_dict(data_path)

2018-01-25 15:00:18,290 : INFO : collecting all words and their counts
2018-01-25 15:00:18,291 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-25 15:00:18,842 : INFO : PROGRESS: at sentence #10000, processed 2518483 words, keeping 63047 word types
2018-01-25 15:00:19,370 : INFO : PROGRESS: at sentence #20000, processed 5010758 words, keeping 90480 word types
2018-01-25 15:00:19,624 : INFO : collected 101119 word types from a corpus of 6251170 raw words and 25000 sentences
2018-01-25 15:00:19,625 : INFO : Loading a fresh vocabulary
2018-01-25 15:00:21,236 : INFO : min_count=1 retains 101119 unique words (100% of original 101119, drops 0)
2018-01-25 15:00:21,237 : INFO : min_count=1 leaves 6251170 word corpus (100% of original 6251170, drops 0)
2018-01-25 15:00:21,562 : INFO : deleting the raw counts dictionary of 101119 items
2018-01-25 15:00:21,574 : INFO : sample=0.001 downsamples 48 most-common words
2018-01-25 15:00:21,580 : INFO : downsampling lea

2018-01-25 15:01:32,468 : INFO : PROGRESS: at 95.25% examples, 630644 words/s, in_qsize 7, out_qsize 0
2018-01-25 15:01:33,490 : INFO : PROGRESS: at 95.85% examples, 625401 words/s, in_qsize 7, out_qsize 0
2018-01-25 15:01:34,501 : INFO : PROGRESS: at 96.96% examples, 623446 words/s, in_qsize 8, out_qsize 1
2018-01-25 15:01:35,502 : INFO : PROGRESS: at 98.34% examples, 623549 words/s, in_qsize 6, out_qsize 1
2018-01-25 15:01:36,506 : INFO : PROGRESS: at 99.94% examples, 624764 words/s, in_qsize 4, out_qsize 0
2018-01-25 15:01:36,517 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-01-25 15:01:36,524 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-01-25 15:01:36,532 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-01-25 15:01:36,538 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-01-25 15:01:36,539 : INFO : training on 62511700 raw words (45597626 effective words) took 73.0s, 624833 effective w

In [96]:
sents_pos = result['tok_sents_pos']
model = result['model']
train_x_pos = sentence2idx(sents_pos, model)

In [97]:
model.wv.most_similar('good')

2018-01-25 15:02:16,472 : INFO : precomputing L2-norms of word weight vectors


[('decent', 0.7660684585571289),
 ('great', 0.7152284383773804),
 ('bad', 0.6988692283630371),
 ('cool', 0.6371133327484131),
 ('nice', 0.6201329231262207),
 ('solid', 0.603621244430542),
 ('terrific', 0.599502444267273),
 ('fine', 0.5884516835212708),
 ('mediocre', 0.5779147148132324),
 ('interesting', 0.5721300840377808)]

In [77]:
sents_neg = result['tok_sents_neg']
train_x_neg = sentence2idx(sents_neg, model)

In [78]:
train_x = train_x_pos + train_x_neg

In [79]:
train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

In [81]:
# print(train_y)

In [82]:
test_path = '/Users/lifa08/Documents/Lifa/Machine_Learning/Miniproject_test/aclImdb/test'
test_sents_pos = read_dataset(test_path+'/pos/')

In [83]:
tok_test_sents_pos = [nltk.word_tokenize(sent) for sent in test_sents_pos]
test_x_pos = sentence2idx(tok_test_sents_pos, model)

In [84]:
test_sents_neg = read_dataset(test_path+'/neg/')
tok_test_sents_neg = [nltk.word_tokenize(sent) for sent in test_sents_neg]
test_x_neg = sentence2idx(tok_test_sents_neg, model)
test_x = test_x_pos + test_x_neg
test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

In [86]:
f = open('/Users/lifa08/Documents/Lifa/Machine_Learning/miniproject/gensim/gensim_imdb.pkl', 'wb')
pkl.dump((train_x, train_y), f, -1)
pkl.dump((test_x, test_y), f, -1)
f.close()

In [87]:
f = open('/Users/lifa08/Documents/Lifa/Machine_Learning/miniproject/gensim/gensim_imdb.pkl', 'rb')
train_set = pkl.load(f)
test_set = pkl.load(f)
f.close()

In [88]:
Wemb = create_Wemb(model)

In [89]:
f = open('/Users/lifa08/Documents/Lifa/Machine_Learning/miniproject/gensim/gensim_imdb_Wemb.pkl', 'wb')
pkl.dump(Wemb, f, -1)
f.close()

In [91]:
f = open('/Users/lifa08/Documents/Lifa/Machine_Learning/miniproject/gensim/gensim_imdb_Wemb.pkl', 'rb')
Wemb_file = pkl.load(f)
f.close()

In [92]:
print(Wemb)

[[ 1.09758687  1.05043387  0.39202166 ..., -0.38976276 -0.46508232
  -0.3893936 ]
 [ 0.19986108 -0.11645149  0.10571738 ...,  0.00534131 -0.48626673
  -0.71969342]
 [-0.19283009  0.90497458 -0.1725197  ...,  0.14764518  0.19503661
  -0.82979882]
 ..., 
 [-0.02627246  0.01910434  0.01541281 ...,  0.01297759 -0.03586807
   0.00416359]
 [ 0.02873666 -0.04096749  0.00623717 ..., -0.02622253  0.01189386
   0.00886645]
 [-0.03427199 -0.00707456  0.02952876 ..., -0.04402973 -0.01717805
   0.01733459]]


In [93]:
print(Wemb_file)

[[ 1.09758687  1.05043387  0.39202166 ..., -0.38976276 -0.46508232
  -0.3893936 ]
 [ 0.19986108 -0.11645149  0.10571738 ...,  0.00534131 -0.48626673
  -0.71969342]
 [-0.19283009  0.90497458 -0.1725197  ...,  0.14764518  0.19503661
  -0.82979882]
 ..., 
 [-0.02627246  0.01910434  0.01541281 ...,  0.01297759 -0.03586807
   0.00416359]
 [ 0.02873666 -0.04096749  0.00623717 ..., -0.02622253  0.01189386
   0.00886645]
 [-0.03427199 -0.00707456  0.02952876 ..., -0.04402973 -0.01717805
   0.01733459]]


In [64]:
def train_gensim_w2vec(path):
    # path = '/Users/lifa08/Documents/Lifa/Machine_Learning/Miniproject_test/aclImdb/'
    result = build_dict(path+'train')
    model = result['model']

    sents_pos = result['tok_sents_pos']
    train_x_pos = sentence2idx(sents_pos, model)

    sents_neg = result['tok_sents_neg']
    train_x_neg = sentence2idx(sents_neg, model)
    train_x = train_x_pos + train_x_neg
    train_y = [1] * len(train_x_pos) + [0] * len(train_x_neg)

    test_sents_pos = read_dataset(path+'test/pos/')
    tok_test_sents_pos = [nltk.word_tokenize(sent) for sent in test_sents_pos]
    test_x_pos = sentence2idx(tok_test_sents_pos, model)

    test_sents_neg = read_dataset(path+'test/neg/')
    tok_test_sents_neg = [nltk.word_tokenize(sent) for sent in test_sents_neg]
    test_x_neg = sentence2idx(tok_test_sents_neg, model)
    test_x = test_x_pos + test_x_neg
    test_y = [1] * len(test_x_pos) + [0] * len(test_x_neg)

    f = open('gensim/gensim_imdb.pkl', 'wb')
    pkl.dump((train_x, train_y), f, -1)
    pkl.dump((test_x, test_y), f, -1)
    f.close()

    Wemb = create_Wemb(model)
    f = open('gensim/gensim_imdb_Wemb.pkl', 'wb')
    pkl.dump(Wemb, f, -1)
    f.close()

    model.save('gensim/imdb_gensim_w2vmodel')

In [100]:
currdir = os.getcwd()
print(currdir)

/Users/lifa08/Documents/Lifa/Machine_Learning/Miniproject_test/aclImdb/train/pos


In [41]:
train_gensim_w2vec()

2018-01-20 22:30:30,011 : INFO : collecting all words and their counts
2018-01-20 22:30:30,013 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-20 22:30:30,017 : INFO : collected 286 word types from a corpus of 532 raw words and 4 sentences
2018-01-20 22:30:30,020 : INFO : Loading a fresh vocabulary
2018-01-20 22:30:30,023 : INFO : min_count=1 retains 286 unique words (100% of original 286, drops 0)
2018-01-20 22:30:30,024 : INFO : min_count=1 leaves 532 word corpus (100% of original 532, drops 0)
2018-01-20 22:30:30,027 : INFO : deleting the raw counts dictionary of 286 items
2018-01-20 22:30:30,029 : INFO : sample=0.001 downsamples 64 most-common words
2018-01-20 22:30:30,031 : INFO : downsampling leaves estimated 351 word corpus (66.0% of prior 532)
2018-01-20 22:30:30,032 : INFO : estimated required memory for 286 words and 128 dimensions: 435864 bytes
2018-01-20 22:30:30,035 : INFO : resetting layer weights
2018-01-20 22:30:30,041 : INFO : trainin

In [None]:
def compare_idxwemb_wordemb(sentence):
    model = gensim.models.Word2Vec.load('gensim/imdb_gensim_vmodel')
    sentence_words = []
    for idx, x in enumerate(sentence):
        sentence_words.append(model.wv[model.wv.index2word[x]])

    f = open('gensim/gensim_imdb_Wemb.pkl', 'rb')
    Wemb = pkl.load(f)
    f.close()
    emb_x = Wemb[sentence]
    
    return (numpy.matrix(sentence_words)==numpy.matrix(emb_x)).all()