#### word2vec Embedding

In this section, we will discover how to learn a standalone word embedding using an efficient algorithm called word2vec.

The word2vec algorithm is an approach to learning a word embedding from a text corpus in a standalone way. 
The benefit of the method is that it can produce high-quality word embeddings very efficiently, in terms of 
space and time complexity.

The word2vec algorithm processes documents sentence by sentence. This means we will preserve the 
sentence-based structure during cleaning.



In [90]:
from string import punctuation
import re, string
from os import listdir
from numpy import array
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

### Loading, Cleaning, Making Vocabulary

In [91]:
# load doc into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text
 
# turn a doc into clean tokens
def clean_doc(doc):
    tokens = doc.split()
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    vocab.update(tokens)
 
# load all docs in a directory
def process_docs(directory, vocab, is_trian):

    for filename in listdir(directory):
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        # add doc to vocab
        add_doc_to_vocab(path, vocab)

In [92]:
# define vocab
vocab = Counter()

# add all docs to vocab
process_docs('/home/hasan/DATA SET/txt_sentoken/neg', vocab, True)
process_docs('/home/hasan/DATA SET/txt_sentoken/pos', vocab, True)
# print the size of the vocab
print('Length of the vocabulary is :',len(vocab))
# print the top/most common words in the vocab
print(vocab.most_common(50))

Length of the vocabulary is : 44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [93]:
# remove those word which are apperas one time
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

25767


In [94]:
# save list to file
def save_list(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()
# save tokens to a vocabulary file 
save_list(tokens, '/home/hasan/DATA SET/txt_sentoken/vocabulary.txt')

# Train and Test Data

In [95]:
# load vocabulary into memory
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text
 
# load the vocabulary
vocab_filename = '/home/hasan/DATA SET/txt_sentoken/vocabulary.txt'
vocab = load_doc(vocab_filename)
vocab = vocab.split()
vocab = set(vocab)
 

In [96]:
# turn a doc into clean tokens
def clean_doc(doc, vocab):
    tokens = doc.split()
    
    table = str.maketrans('', '', punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    tokens = [w for w in tokens if w in vocab]
    tokens = ' '.join(tokens)
    return tokens


# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    documents = list()
    
    for filename in listdir(directory):
        if is_trian and filename.startswith('cv9'):
            continue
        if not is_trian and not filename.startswith('cv9'):
            continue
        
        path = directory + '/' + filename
        doc = load_doc(path)
        tokens = clean_doc(doc, vocab)
        documents.append(tokens)
    return documents

### Train data

In [97]:
# load all training reviews
positive_docs = process_docs('/home/hasan/DATA SET/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('/home/hasan/DATA SET/txt_sentoken/neg', vocab, True)
train_docs = negative_docs+positive_docs

# create the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# pad sequences
max_length = max([len(s.split()) for s in train_docs])

#train data
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])

### Test data

In [98]:
# load all test reviews
positive_docs = process_docs('/home/hasan/DATA SET/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('/home/hasan/DATA SET/txt_sentoken/neg', vocab, False)
test_docs = negative_docs + positive_docs
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels
ytest = array([0 for _ in range(100)] + [1 for _ in range(100)])

In [99]:
# define vocabulary size (largest integer value)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size :',vocab_size)

Vocabulary size : 25768


### Model

In [100]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [101]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
____________________________________________

In [102]:
# compile network
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [103]:
# fit network
model.fit(Xtrain, ytrain, epochs=10, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
 - 12s - loss: 0.6888 - accuracy: 0.5456
Epoch 2/10
 - 11s - loss: 0.4652 - accuracy: 0.8083
Epoch 3/10
 - 12s - loss: 0.0779 - accuracy: 0.9761
Epoch 4/10
 - 12s - loss: 0.0084 - accuracy: 1.0000
Epoch 5/10
 - 12s - loss: 0.0024 - accuracy: 1.0000
Epoch 6/10
 - 12s - loss: 0.0014 - accuracy: 1.0000
Epoch 7/10
 - 12s - loss: 0.0010 - accuracy: 1.0000
Epoch 8/10
 - 12s - loss: 7.9758e-04 - accuracy: 1.0000
Epoch 9/10
 - 12s - loss: 6.3068e-04 - accuracy: 1.0000
Epoch 10/10
 - 12s - loss: 5.1321e-04 - accuracy: 1.0000


<keras.callbacks.callbacks.History at 0x7fe2fa6cf4d0>

### evaluate with test data

In [104]:
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 86.000001


### Predictin with new data

In [105]:
# turn a doc into clean tokens
def clean_doc(doc, vocab):
    tokens = doc.split()
    re_punc = re.compile( ' [%s] ' % re.escape(string.punctuation))
    tokens = [re_punc.sub( '' , w) for w in tokens]
    tokens = [w for w in tokens if w in vocab]
    tokens = ' ' .join(tokens)
    return tokens

In [106]:
# integer encode and pad documents
def encode_docs(tokenizer, max_length, docs):
    encoded = tokenizer.texts_to_sequences(docs)
    padded = pad_sequences(encoded, maxlen=max_length, padding='post')
    return padded

In [107]:
def predict_sentiment(review, vocab, tokenizer, max_length, model):
    line = clean_doc(review, vocab)
    padded = encode_docs(tokenizer, max_length, [line])
    yhat = model.predict(padded, verbose=0)
    percent_pos = yhat[0,0]
    if round(percent_pos) == 0:
        return (1-percent_pos), ' NEGATIVE '
    return percent_pos, ' POSITIVE '

In [108]:
# test positive text
text = 'Best movie ever! It was great, I recommend it.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print( 'Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))


 Review: [ Best movie ever! It was great, I recommend it. ]
Sentiment:  NEGATIVE  (51.443%) 


In [109]:
# test negative text
text = 'This is a bad movie.'
percent, sentiment = predict_sentiment(text, vocab, tokenizer, max_length, model)
print( 'Review: [%s]\nSentiment: %s (%.3f%%)' % (text, sentiment, percent*100))


 Review: [ This is a bad movie. ]
Sentiment:  NEGATIVE  (53.513%) 
