#### Text Classification (Sentiment Analysis) using Embedding

Embedding is an old and very simple technique.

An embedding is a more expressive representation of text than classical methods such as bag-of-words, 
where relationships between words or tokens are either ignored or only forced in through bigram and trigram approaches.

In [79]:
from string import punctuation
from os import listdir
from numpy import array
from collections import Counter
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D


### Loading, Cleaning, Making Vocabulary

In [80]:
# load doc into memory
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
 
# turn a doc into clean tokens
def clean_doc(doc):
    """Split a raw document into a filtered list of word tokens.

    Punctuation characters are stripped from every whitespace-separated
    token. A token is kept only if it is purely alphabetic, is not in the
    NLTK English stop-word list, and is longer than one character.

    Args:
        doc: The document text.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        if not word.isalpha():
            continue
        if word in english_stops:
            continue
        if len(word) <= 1:
            continue
        cleaned.append(word)
    return cleaned
 
# load doc and add to vocab
def add_doc_to_vocab(filename, vocab):
    """Count the cleaned tokens of one file into ``vocab``.

    Args:
        filename: Path of the document to read.
        vocab: Object updated in place through its ``update`` method
            (the notebook passes a ``Counter``).
    """
    vocab.update(clean_doc(load_doc(filename)))
 
# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    """Add every selected file in ``directory`` to ``vocab``.

    Files whose names start with 'cv9' form the held-out split: they are
    skipped when ``is_trian`` is truthy and are the only files processed
    when it is falsy.

    Args:
        directory: Folder whose files are scanned.
        vocab: Vocabulary object handed on to ``add_doc_to_vocab``.
        is_trian: Truthy to process the training files, falsy for the
            held-out ones.
    """
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # Skip the file when its split is not the one that was requested.
        if bool(is_trian) == held_out:
            continue
        # add doc to vocab
        add_doc_to_vocab(directory + '/' + name, vocab)
 


In [81]:
# build the vocabulary from the training files of both classes
# NOTE(review): the paths are absolute and machine-specific
vocab = Counter()
for review_dir in ('/home/hasan/DATA SET/review_polarity/txt_sentoken/neg',
                   '/home/hasan/DATA SET/review_polarity/txt_sentoken/pos'):
    process_docs(review_dir, vocab, True)

# report how many distinct tokens were collected
print('Length of the vocabulary is :',len(vocab))
# show the 50 most frequent tokens together with their counts
print(vocab.most_common(50))

Length of the vocabulary is : 44276
[('film', 7983), ('one', 4946), ('movie', 4826), ('like', 3201), ('even', 2262), ('good', 2080), ('time', 2041), ('story', 1907), ('films', 1873), ('would', 1844), ('much', 1824), ('also', 1757), ('characters', 1735), ('get', 1724), ('character', 1703), ('two', 1643), ('first', 1588), ('see', 1557), ('way', 1515), ('well', 1511), ('make', 1418), ('really', 1407), ('little', 1351), ('life', 1334), ('plot', 1288), ('people', 1269), ('could', 1248), ('bad', 1248), ('scene', 1241), ('movies', 1238), ('never', 1201), ('best', 1179), ('new', 1140), ('scenes', 1135), ('man', 1131), ('many', 1130), ('doesnt', 1118), ('know', 1092), ('dont', 1086), ('hes', 1024), ('great', 1014), ('another', 992), ('action', 985), ('love', 977), ('us', 967), ('go', 952), ('director', 948), ('end', 946), ('something', 945), ('still', 936)]


In [82]:
# drop the words that appear only once: keep a word when its count
# is at least min_occurane
min_occurane = 2
tokens = [word for word, count in vocab.items() if count >= min_occurane]
print(len(tokens))

25767


In [83]:
# save list to file
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one item per line.

    The items are joined with a newline character; no trailing newline is
    added after the last item. Any existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Destination path.
    """
    # Join before opening so a bad item cannot truncate an existing file.
    data = '\n'.join(lines)
    # The context manager closes the handle even if write() raises.
    with open(filename, 'w') as file:
        file.write(data)
# save the filtered tokens to the vocabulary file, one token per line
save_list(tokens, '/home/hasan/DATA SET/review_polarity/txt_sentoken/vocabulary.txt')

### Train and Test Data

In [84]:
# load vocabulary into memory
# NOTE(review): this duplicates the load_doc defined in an earlier cell;
# consider keeping a single definition.
def load_doc(filename):
    """Read a text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read (opened in text mode).

    Returns:
        The full text of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(filename, 'r') as file:
        return file.read()
 
# load the vocabulary file and keep its whitespace-separated words as a set
vocab_filename = '/home/hasan/DATA SET/review_polarity/txt_sentoken/vocabulary.txt'
vocab = set(load_doc(vocab_filename).split())

In [85]:
# turn a doc into clean tokens
def clean_doc(doc, vocab):
    """Reduce a document to its in-vocabulary words, joined into one string.

    Each whitespace-separated token has its punctuation characters removed
    and is kept only if the result is a member of ``vocab``.

    NOTE: this two-argument definition replaces the one-argument
    ``clean_doc`` defined earlier in the notebook.

    Args:
        doc: The document text.
        vocab: Container of allowed words (membership is tested with ``in``).

    Returns:
        The kept words in their original order, separated by single spaces.
    """
    strip_punct = str.maketrans('', '', punctuation)
    kept = []
    for raw in doc.split():
        word = raw.translate(strip_punct)
        # filter out tokens not in vocab
        if word in vocab:
            kept.append(word)
    return ' '.join(kept)


# load all docs in a directory
def process_docs(directory, vocab, is_trian):
    """Load and clean every selected file in ``directory``.

    Files whose names start with 'cv9' form the held-out split: they are
    skipped when ``is_trian`` is truthy and are the only files processed
    when it is falsy.

    Args:
        directory: Folder whose files are scanned.
        vocab: Allowed words, passed through to ``clean_doc``.
        is_trian: Truthy to load the training files, falsy for the
            held-out ones.

    Returns:
        A list with one cleaned document string per selected file, in
        ``listdir`` order.
    """
    cleaned_docs = []
    # walk through all files in the folder
    for name in listdir(directory):
        held_out = name.startswith('cv9')
        # skip files that belong to the split that was not requested
        if bool(is_trian) == held_out:
            continue
        # read the file from its full path and reduce it to vocabulary words
        text = load_doc(directory + '/' + name)
        cleaned_docs.append(clean_doc(text, vocab))
    return cleaned_docs
 

In [86]:
# load all training reviews
positive_docs = process_docs('/home/hasan/DATA SET/review_polarity/txt_sentoken/pos', vocab, True)
negative_docs = process_docs('/home/hasan/DATA SET/review_polarity/txt_sentoken/neg', vocab, True)
# negatives first, then positives -- the label array below relies on this order
train_docs = negative_docs + positive_docs

# create the tokenizer and fit it on the training documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_docs)

# sequence encode
encoded_docs = tokenizer.texts_to_sequences(train_docs)

# pad sequences to the word count of the longest training document
max_length = max([len(s.split()) for s in train_docs])

# train data
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels (0 = negative, 1 = positive); the counts are taken
# from the loaded documents, not hard-coded, so the labels always line up
# with the rows of Xtrain
ytrain = array([0] * len(negative_docs) + [1] * len(positive_docs))

In [87]:
# load all test reviews
positive_docs = process_docs('/home/hasan/DATA SET/review_polarity/txt_sentoken/pos', vocab, False)
negative_docs = process_docs('/home/hasan/DATA SET/review_polarity/txt_sentoken/neg', vocab, False)
# negatives first, then positives -- the label array below relies on this order
test_docs = negative_docs + positive_docs
# sequence encode with the tokenizer that was fitted on the training documents
encoded_docs = tokenizer.texts_to_sequences(test_docs)
# pad sequences to the same length that was used for the training data
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define test labels (0 = negative, 1 = positive); the counts are taken from
# the loaded documents, not hard-coded, so the labels always line up with
# the rows of Xtest
ytest = array([0] * len(negative_docs) + [1] * len(positive_docs))

In [88]:
# define vocabulary size (largest integer value)
# NOTE(review): the +1 presumably leaves room for index 0, which is not
# assigned to any word and is used for padding -- confirm against the
# Keras Tokenizer / pad_sequences documentation
vocab_size = len(tokenizer.word_index) + 1

### Model

In [89]:
# define model: embedding -> 1D convolution -> max pooling -> dense classifier
model = Sequential([
    # learn a 100-dimensional vector for each of the vocab_size word indices
    Embedding(vocab_size, 100, input_length=max_length),
    # 32 filters, each looking at a window of 8 consecutive word vectors
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    # flatten the pooled feature maps into one vector per review
    Flatten(),
    Dense(10, activation='relu'),
    # single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])


In [90]:
# print each layer's output shape and parameter count
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1317, 100)         2576800   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1310, 32)          25632     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 655, 32)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 20960)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                209610    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 11        
Total params: 2,812,053
Trainable params: 2,812,053
Non-trainable params: 0
____________________________________________

### Compiling

In [91]:
# binary cross-entropy loss with the Adam optimizer; accuracy is tracked as the metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


### Model Fit

In [92]:
# train for 30 epochs on the training reviews
# NOTE(review): no validation data is passed, so only the training loss and
# accuracy are reported; the recorded log reaches 1.0000 training accuracy
# by epoch 5
model.fit(Xtrain, ytrain, epochs=30, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/30
 - 13s - loss: 0.6906 - accuracy: 0.5222
Epoch 2/30
 - 14s - loss: 0.5763 - accuracy: 0.7039
Epoch 3/30
 - 12s - loss: 0.2132 - accuracy: 0.9322
Epoch 4/30
 - 12s - loss: 0.0215 - accuracy: 0.9972
Epoch 5/30
 - 12s - loss: 0.0040 - accuracy: 1.0000
Epoch 6/30
 - 12s - loss: 0.0019 - accuracy: 1.0000
Epoch 7/30
 - 12s - loss: 0.0013 - accuracy: 1.0000
Epoch 8/30
 - 13s - loss: 9.7527e-04 - accuracy: 1.0000
Epoch 9/30
 - 12s - loss: 7.8771e-04 - accuracy: 1.0000
Epoch 10/30
 - 12s - loss: 6.6174e-04 - accuracy: 1.0000
Epoch 11/30
 - 13s - loss: 5.6942e-04 - accuracy: 1.0000
Epoch 12/30
 - 12s - loss: 4.9952e-04 - accuracy: 1.0000
Epoch 13/30
 - 12s - loss: 4.1767e-04 - accuracy: 1.0000
Epoch 14/30
 - 12s - loss: 3.6966e-04 - accuracy: 1.0000
Epoch 15/30
 - 12s - loss: 3.3338e-04 - accuracy: 1.0000
Epoch 16/30
 - 12s - loss: 3.0122e-04 - accuracy: 1.0000
Epoch 17/30
 - 12s - loss: 2.7251e-04 - accuracy: 1.0000
Epoch 18/30
 - 14s - loss: 2.4646e-04 - accuracy: 1.0000
Epoch 19/30

<keras.callbacks.callbacks.History at 0x7f025a8c6050>

### Evaluate with Test Data

In [93]:
# evaluate on the test reviews and print the accuracy as a percentage
loss, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 88.499999
