#### Importing Dependencies

In [7]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys
import pandas

#### Defining WordVec Parameters

In [8]:
# Length every tokenized sentence is padded/truncated to (passed as `maxlen` to pad_sequences).
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Keras Tokenizer (its `nb_words` argument).
MAX_NB_WORDS = 40000
# Embedding vector size; presumably chosen to match the 300-d GloVe file loaded below -- keep in sync.
EMBEDDING_DIM = 300
# Fraction of the samples used to size the validation set.
VALIDATION_SPLIT = 0.1

#### Importing Word Vectors

In [9]:
print('Indexing word vectors.')

# Maps each token in the GloVe file to its embedding vector (a float32 array).
embeddings_index = {}
# `with` guarantees the file handle is released even if a line fails to parse.
with open('glove.6B/glove.6B.300d.txt') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines (e.g. a trailing newline) instead of crashing on values[0].
            continue
        # Each line is "<token> <v1> <v2> ...": first field is the word, the rest its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


#### Generating a list of sentences

In [40]:
import xml.etree.ElementTree as ET
print('Processing text dataset')

# Parse the review XML and grab every <sentence> element anywhere under the root.
tree = ET.parse("/home/jeet/Academics/CS671/Project/Restaurants_Train.xml")
corpus = tree.getroot()
sent = corpus.findall('.//sentence')

# Flat list of sentence strings: the text of each sentence's <text> child.
sentences = [node.find('text').text for node in sent]

print ('Generated list of sentences..')
# print (type(sentences[0].find('text').text))
# print (sent[0])

# texts = []  # list of text samples
# labels_index = {}  # dictionary mapping label name to numeric id
# labels = []  # list of label ids
# for name in sorted(os.listdir('20news-bydate/20news-bydate-train/')):
#     path = os.path.join('20news-bydate/20news-bydate-train/', name)
#     if os.path.isdir(path):
#         label_id = len(labels_index)
#         labels_index[name] = label_id
#         # mapping a document to a label
#         for fname in sorted(os.listdir(path)):
#             if fname.isdigit():
#                 fpath = os.path.join(path, fname)
#                 f = open(fpath)
#                 texts.append(f.read())
#                 f.close()
#                 labels.append(label_id)
#                 # mapping a label id to all the documents in a directory.

# print('Found %s texts.' % len(texts))
# # print (labels[:1000])
# print (texts[0])
# print (sentences[0])

Processing text dataset
['But the staff was so horrible to us.', "To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.", "The food is uniformly exceptional, with a very capable kitchen which will proudly whip up whatever you feel like eating, whether it's on the menu or not.", 'Where Gabriela personaly greets you and recommends you what to eat.', "For those that go once and don't enjoy it, all I can say is that they just don't get it.", "Not only was the food outstanding, but the little 'perks' were great.", 'It is very overpriced and not very tasty.', 'Our agreed favorite is the orrechiete with sausage and chicken (usually the waiters are kind enough to split the dish in half so you get to sample both meats).', 'The Bagels have an outstanding taste with a terrific texture, both chewy yet not gummy.', 'Nevertheless the food itself is pretty good.']


#### Vectorize the text samples into a 2D integer tensor and pad the sentences

In [42]:
# Build the vocabulary from the sentences, keeping at most MAX_NB_WORDS words.
# NOTE(review): lower=False preserves the original casing; the GloVe 6B vectors are
# presumably lowercase-only, so capitalized tokens may not find a pretrained vector --
# confirm against the code that builds the embedding matrix.
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, lower=False)
tokenizer.fit_on_texts(sentences)

# word -> integer index mapping learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Each sentence becomes a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

Found 5250 unique tokens.


In [43]:
# Pad/truncate every index sequence to MAX_SEQUENCE_LENGTH; 'pre' (the library default)
# pads and truncates at the start of each sequence.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
    truncating='pre',
)

#### Split the data into a training set and a validation set

In [44]:
# Shuffle the rows of `data` with a random permutation of the row indices
# (reproducible because the NumPy seed is fixed at the top of the notebook).
num_samples = data.shape[0]
indices = np.arange(num_samples)
np.random.shuffle(indices)
data = data[indices]

# Number of rows reserved for validation, rounded down.
nb_validation_samples = int(VALIDATION_SPLIT * num_samples)