### Importing Dependencies and tools

In [2]:
from __future__ import print_function
import os
import numpy as np
np.random.seed(1337)

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
import sys

Using Theano backend.
Using gpu device 0: GeForce GT 730M (CNMeM is disabled, cuDNN 5103)


### Importing Text Data

In [3]:
import xml.etree.ElementTree as ET
print('Processing text dataset')

tree = ET.parse("/home/jeet/Academics/CS671/Project/Restaurants_Train.xml")
corpus = tree.getroot()
sentences = [] # List of list of sentences.
sent = corpus.findall('.//sentence')
for s in sent:
    sentences.append(s.find('text').text)

print ('Generated list of sentences..')

MAX_SEQ_LENGTH = 69
MAX_NB_WORDS = 40000
EMBEDDING_DIM = 300

Processing text dataset
Generated list of sentences..


### Indexing Word Vectors

In [4]:
print('Indexing word vectors.')

embeddings_index = {}
f = open('glove.6B/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


#### vectorize the text samples into a 2D integer tensor and padding the sentences

In [5]:
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, lower=False)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
print ("Let's have a quick look at the word_index data..")
print (list(word_index.items())[:10])
# print (word_index['limited'])

data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Shape of data tensor:', data.shape)

Found 5250 unique tokens.
Let's have a quick look at the word_index data..
[('rasamalai', 2402), ('limited', 647), ('arrives', 4234), ('legacies', 2403), ('raining', 2404), ('saves', 1617), ('AN', 2405), ('meatsauce', 4956), ('sleek', 1237), ('four', 648)]
Shape of data tensor: (3044, 69)


#### defining output data

In [6]:
import nltk
from keras.preprocessing.text import text_to_word_sequence
raw_output = corpus.findall('.//sentence')
train_out= np.zeros(shape=(3044,69))
i=0
for output in raw_output:
    s = text_to_word_sequence(output.find('text').text, lower=False)
    indices = np.zeros(MAX_SEQ_LENGTH)
    
    aspectTerms = output.find('aspectTerms')
    if (aspectTerms):
        aspectTerm = aspectTerms.findall('aspectTerm')
        if (aspectTerm):
            for aspect_term in aspectTerm:
                try:
                    indices[s.index(aspect_term.attrib['term'])] = 1
#                     print (indices)
                except:
                    continue
    train_out[i] = indices
    i=i+1

print ("Shape of output tensor:", train_out.shape)

Shape of output tensor: (3044, 69)




### Preparing Embedding Layer

In [7]:
print('Preparing embedding matrix.')

# prepare embedding matrix
nb_words = len(word_index)
embedding_matrix = np.zeros((nb_words + 1, 300))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(nb_words + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LENGTH,
                            trainable=False)
print('Embedding Layer set..')

Preparing embedding matrix.
Embedding Layer set..


#### Extract Embeddings

In [9]:
from keras.models import Sequential
embedding_model = Sequential()
embedding_model.add(embedding_layer)

embedding_model.compile(loss='categorical_crossentropy',
                        optimizer='rmsprop',
                        metrics=['acc']
                       )

In [10]:
embedding_output = embedding_model.predict(data)

In [12]:
print('Shape of Embedding_output', embedding_output.shape)
print(type(embedding_output))

Shape of Embedding_output (3044, 69, 300)
<type 'numpy.ndarray'>


#### Adding POS-tag features to input

In [27]:
from keras.preprocessing.text import text_to_word_sequence
from nltk.tag.stanford import StanfordPOSTagger
from sklearn import preprocessing
from tqdm import tqdm

emb_tag_out = np.zeros(shape=(3044,69,306))
le = preprocessing.LabelEncoder()
tags = ["CC","NN","JJ","VB","RB","IN"]
le.fit(tags)
i=0
sentences = corpus.findall('.//sentence')
for sent in sentences:
    s = text_to_word_sequence(sent.find('text').text)
    tags_for_sent = nltk.pos_tag(s)
    sent_len = len(tags_for_sent)
    ohe = [0]*6
        
    for j in xrange(69):
        if j< len(tags_for_sent) and tags_for_sent[j][1] in tags:
            ohe[le.transform(tags_for_sent[j][1])] = 1
        emb_tag_out[i][j] = np.concatenate([embedding_output[i][j],ohe])
    i=i+1
    print (i,j)

1 68
2 68
3 68
4 68
5 68
6 68
7 68
8 68
9 68
10 68
11 68
12 68
13 68
14 68
15 68
16 68
17 68
18 68
19 68
20 68
21 68
22 68
23 68
24 68
25 68
26 68
27 68
28 68
29 68
30 68
31 68
32 68
33 68
34 68
35 68
36 68
37 68
38 68
39 68
40 68
41 68
42 68
43 68
44 68
45 68
46 68
47 68
48 68
49 68
50 68
51 68
52 68
53 68
54 68
55 68
56 68
57 68
58 68
59 68
60 68
61 68
62 68
63 68
64 68
65 68
66 68
67 68
68 68
69 68
70 68
71 68
72 68
73 68
74 68
75 68
76 68
77 68
78 68
79 68
80 68
81 68
82 68
83 68
84 68
85 68
86 68
87 68
88 68
89 68
90 68
91 68
92 68
93 68
94 68
95 68
96 68
97 68
98 68
99 68
100 68
101 68
102 68
103 68
104 68
105 68
106 68
107 68
108 68
109 68
110 68
111 68
112 68
113 68
114 68
115 68
116 68
117 68
118 68
119 68
120 68
121 68
122 68
123 68
124 68
125 68
126 68
127 68
128 68
129 68
130 68
131 68
132 68
133 68
134 68
135 68
136 68
137 68
138 68
139 68
140 68
141 68
142 68
143 68
144 68
145 68
146 68
147 68
148 68
149 68
150 68
151 68
152 68
153 68
154 68
155 68
156 68
157 68
158 68
15

In [29]:
print (emb_tag_out[1][0].shape)

(306,)
