In [1]:
import pandas as pd
import numpy as np
import re
from keras.layers import Dot, Embedding,Activation, Input, Reshape, Flatten
from keras.layers import GlobalAveragePooling1D, Dense, Dropout
from keras.models import Model

Using TensorFlow backend.


In [2]:
def data_input(lab, data_dir="trail0/"):
    """Read every sentence of one class from ``<data_dir>/<lab>.csv``.

    Each line of the file is treated as one sentence; newline characters are
    removed and nothing else is stripped.

    Parameters
    ----------
    lab : int
        Class label; also the stem of the file name that is read.
    data_dir : str, optional
        Directory containing the per-class files. Defaults to ``"trail0/"``,
        the location that used to be hard-coded here.

    Returns
    -------
    (list of str, list)
        The sentences, and a list holding ``lab`` once per sentence.
    """
    import os  # local import so this cell does not depend on another cell's imports

    path = os.path.join(data_dir, str(lab) + ".csv")
    with open(path, 'r') as sent_file:
        # plain literal replacement; no regular expression is needed here
        sents = [line.replace("\n", "") for line in sent_file]
    labels = [lab] * len(sents)
    return sents, labels
    

In [3]:
# Gather the sentences and labels of all 19 classes into two parallel lists,
# appended class by class in ascending label order.
sents = []
labels = []

for class_id in range(19):
    class_sents, class_labels = data_input(class_id)
    sents.extend(class_sents)
    labels.extend(class_labels)

In [4]:
# Load the vocabulary: one word per line, line number == row in the embedding matrix.
vocab_fn = "word_list.txt"
with open(vocab_fn, 'r') as vfn:
    index2word = vfn.read().split('\n')
# A file ending in a newline yields one trailing empty string; drop only that.
# (Dropping the last entry unconditionally would lose a real word when the
# file has no final newline.)
if index2word and index2word[-1] == "":
    index2word = index2word[:-1]
print(len(index2word),"words in vocab")

# Load the pre-trained embedding matrix.
mat_fn = "weight.npy"
embedding_mat = np.load(mat_fn)
print(embedding_mat.shape,"embedding matrix")

# Word i must correspond to row i, so the two sizes have to agree.
assert len(index2word) == embedding_mat.shape[0], (
    "vocabulary size and embedding matrix rows do not match"
)

1048576 words in vocab
(1048576, 100) embedding matrix


In [5]:
# Index 0 is reserved for NULL and index 1 for unknown words, so every real
# vocabulary word is shifted up by two positions.
UNK_IND = 1
lookup_with_unk = dict((word, idx) for idx, word in enumerate(index2word, 2))

# Put two all-zero rows (NULL and UNK) in front of the embedding matrix so that
# row numbers keep matching the shifted word indices.
embeddings_with_unk = np.zeros((len(embedding_mat) + 2, embedding_mat.shape[1]))
embeddings_with_unk[2:] = embedding_mat

In [6]:
# Encode every sentence as a fixed-width row of word indices.
# Rows start out as all zeros, so short sentences stay zero-padded on the right.
sent_len = 20
X_matrix = np.zeros((len(sents), sent_len), dtype=np.int32)
for row, sentence in enumerate(sents):
    # lazy tokenization: lowercase and split on whitespace; keep at most sent_len tokens
    tokens = sentence.strip().lower().split()[:sent_len]
    # words missing from the vocabulary map to UNK_IND
    indices = [lookup_with_unk.get(tok, UNK_IND) for tok in tokens]
    X_matrix[row, :len(indices)] = indices

In [7]:
# Convert the Python list of integer class labels into a NumPy array.
y = np.asarray(labels)

In [8]:
from keras.utils import np_utils
# One-hot encode the class labels over 19 classes.
# Build the targets from `labels` rather than from `y` itself, so this cell
# gives the same result no matter how many times it is executed (re-encoding
# an already one-hot `y` would silently produce a wrongly shaped target).
y = np_utils.to_categorical(np.asarray(labels), 19)


In [9]:
#model 0: averaged word vectors -> small tanh layer -> 19-way softmax
hidden_size = 16
vocab_size, embed_size = embeddings_with_unk.shape

# input: a sequence of integer word indices (sequence length left unspecified)
sent_in = Input(shape=(None,), dtype="int32", name="sent_in")

# embedding table initialised from the pre-trained vectors
embed_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    weights=[embeddings_with_unk],
    name="word_vec",
)
sent_embeddings = embed_layer(sent_in)
sent_embeddings = Dropout(rate=0.25)(sent_embeddings)

# compose the words by averaging their vectors over the sequence axis
# (a recurrent layer would be more common; this is kept deliberately simple)
sent_avg = GlobalAveragePooling1D()(sent_embeddings)

# fully-connected layer - whether it actually helps should be checked in practice
hidden_repr = Dense(units=hidden_size, activation="tanh", name="tanh")(sent_avg)

# one probability per class
pred = Dense(units=19, activation="softmax", name="softmax")(hidden_repr)
sentiment_model = Model(inputs=[sent_in], outputs=[pred])

In [10]:
# The network ends in a 19-way softmax trained against one-hot targets, so the
# matching loss is categorical cross-entropy. With "binary_crossentropy" the
# "accuracy" metric is computed per output unit and looks misleadingly high.
sentiment_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy",])

# The samples were appended class by class, and validation_split holds out the
# *last* 20% of the rows without shuffling them first. Without a permutation the
# validation set would contain only the highest-numbered classes and the model
# would never train on them. Shuffle once with a fixed seed for reproducibility.
shuffle_idx = np.random.RandomState(0).permutation(len(X_matrix))
sentiment_model.fit(X_matrix[shuffle_idx], y[shuffle_idx], epochs=2, validation_split=0.2)

  "This may consume a large amount of memory." % num_elements)


Train on 91200 samples, validate on 22800 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f08df58efd0>

In [11]:
#save model
from keras.models import model_from_json

# architecture -> JSON
architecture_json = sentiment_model.to_json()
with open("new_model_0.json", mode="w") as arch_file:
    arch_file.write(architecture_json)

# weights -> HDF5
# NOTE(review): the weights file has no "new_" prefix although the JSON file
# does - confirm which name downstream loaders expect.
sentiment_model.save_weights("model_0.h5")
print("Saved model to disk")

Saved model to disk


In [12]:
# Drop the notebook's reference to the trained model before building the next one.
sentiment_model = None

In [13]:
from keras.layers import Input, Bidirectional, Embedding, Dense, Dropout, SpatialDropout1D, LSTM, Activation
from keras.layers.merge import concatenate


In [14]:
# model 1: two stacked bidirectional LSTMs on top of the word vectors
hidden_size = 16  # not used by this architecture
vocab_size, embed_size = embeddings_with_unk.shape

# input: exactly 20 integer word indices per sentence
sent_in = Input(shape=(20,), dtype="int32", name="sent_in")

# embedding table initialised from the pre-trained vectors
embed_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    weights=[embeddings_with_unk],
    name="word_vec",
)
word_vecs = Activation('tanh')(embed_layer(sent_in))
embed_drop = SpatialDropout1D(rate=0.25, name='embed_drop')
word_vecs = embed_drop(word_vecs)

# both recurrent layers return the full sequence so they can be concatenated below
lstm_0_output = Bidirectional(LSTM(units=512, return_sequences=True), name="bi_lstm_0")(word_vecs)
lstm_1_output = Bidirectional(LSTM(units=512, return_sequences=True), name="bi_lstm_1")(lstm_0_output)

# join both LSTM outputs with the word vectors, then average over the sequence
stacked = concatenate([lstm_1_output, lstm_0_output, word_vecs])
x = GlobalAveragePooling1D()(stacked)

# one probability per class
pred = Dense(units=19, activation="softmax", name="softmax")(x)
sentiment_model = Model(inputs=[sent_in], outputs=[pred])

In [15]:
# 19-way softmax with one-hot targets -> categorical cross-entropy.
# ("binary_crossentropy" scores every output unit independently and makes the
# reported "accuracy" misleadingly high.)
sentiment_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy",])

# Rows are ordered by class and validation_split takes the last 20% of them
# unshuffled, so permute the samples (fixed seed) to get a validation set that
# covers all classes.
shuffle_idx = np.random.RandomState(0).permutation(len(X_matrix))
sentiment_model.fit(X_matrix[shuffle_idx], y[shuffle_idx], epochs=2, validation_split=0.2)

  "This may consume a large amount of memory." % num_elements)


Train on 91200 samples, validate on 22800 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f08cdd2ccf8>

In [16]:
#save model
from keras.models import model_from_json

# architecture -> JSON
architecture_json = sentiment_model.to_json()
with open("new_model_1.json", mode="w") as arch_file:
    arch_file.write(architecture_json)

# weights -> HDF5
sentiment_model.save_weights("new_model_1.h5")
print("Saved model to disk")

Saved model to disk


In [17]:
# Drop the notebook's reference to the trained model before the data is reloaded.
sentiment_model = None

In [18]:
def data_input(lab, data_dir=""):
    """Read every sentence of one class from ``<data_dir>/<lab>.csv``.

    NOTE: this cell redefines ``data_input``; the definition earlier in the
    notebook reads from the ``trail0/`` directory, while this one reads from
    the current working directory unless ``data_dir`` is given.

    Parameters
    ----------
    lab : int
        Class label; also the stem of the file name that is read.
    data_dir : str, optional
        Directory containing the per-class files. The default ``""`` means the
        current working directory.

    Returns
    -------
    (list of str, list)
        The sentences (newline characters removed), and a list holding ``lab``
        once per sentence.
    """
    import os  # local import so this cell does not depend on another cell's imports

    path = os.path.join(data_dir, str(lab) + ".csv")
    with open(path, 'r') as sent_file:
        # plain literal replacement; no regular expression is needed here
        sents = [line.replace("\n", "") for line in sent_file]
    labels = [lab] * len(sents)
    return sents, labels
    

In [19]:
# Reload all 19 classes with the redefined data_input, again building two
# parallel lists in ascending label order.
sents = []
labels = []

for class_id in range(19):
    class_sents, class_labels = data_input(class_id)
    sents.extend(class_sents)
    labels.extend(class_labels)

In [20]:
# Load the vocabulary: one word per line, line number == row in the embedding matrix.
vocab_fn = "word_list.txt"
with open(vocab_fn, 'r') as vfn:
    index2word = vfn.read().split('\n')
# Only discard the last entry when it is the empty string left by a final
# newline; otherwise it is a real word and must be kept.
if index2word and index2word[-1] == "":
    index2word = index2word[:-1]
print(len(index2word),"words in vocab")

# Load the pre-trained embedding matrix.
mat_fn = "weight.npy"
embedding_mat = np.load(mat_fn)
print(embedding_mat.shape,"embedding matrix")

# Word i must correspond to row i, so the two sizes have to agree.
assert len(index2word) == embedding_mat.shape[0], (
    "vocabulary size and embedding matrix rows do not match"
)

1048576 words in vocab
(1048576, 100) embedding matrix


In [21]:
# Reserve index 0 for NULL and index 1 for unknown words; real vocabulary
# words therefore start at index 2.
UNK_IND = 1
lookup_with_unk = dict((word, idx) for idx, word in enumerate(index2word, 2))

# Two leading all-zero rows (NULL and UNK) keep the matrix rows aligned with
# the shifted word indices.
embeddings_with_unk = np.zeros((len(embedding_mat) + 2, embedding_mat.shape[1]))
embeddings_with_unk[2:] = embedding_mat

In [22]:
# Encode every sentence as a fixed-width row of word indices; unused trailing
# positions keep their initial value of zero.
sent_len = 20
X_matrix = np.zeros((len(sents), sent_len), dtype=np.int32)
for row, sentence in enumerate(sents):
    # lazy tokenization: lowercase and split on whitespace; keep at most sent_len tokens
    tokens = sentence.strip().lower().split()[:sent_len]
    # words missing from the vocabulary map to UNK_IND
    indices = [lookup_with_unk.get(tok, UNK_IND) for tok in tokens]
    X_matrix[row, :len(indices)] = indices

In [23]:
# One-hot encode the integer class labels over 19 classes.
y = np_utils.to_categorical(np.asarray(labels), 19)

In [24]:
#model 0: averaged word vectors -> small tanh layer -> 19-way softmax
hidden_size = 16
vocab_size, embed_size = embeddings_with_unk.shape

# input: a sequence of integer word indices (sequence length left unspecified)
sent_in = Input(shape=(None,), dtype="int32", name="sent_in")

# embedding table initialised from the pre-trained vectors
embed_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    weights=[embeddings_with_unk],
    name="word_vec",
)
sent_embeddings = embed_layer(sent_in)
sent_embeddings = Dropout(rate=0.25)(sent_embeddings)

# compose the words by averaging their vectors over the sequence axis
# (a recurrent layer would be more common; this is kept deliberately simple)
sent_avg = GlobalAveragePooling1D()(sent_embeddings)

# fully-connected layer - whether it actually helps should be checked in practice
hidden_repr = Dense(units=hidden_size, activation="tanh", name="tanh")(sent_avg)

# one probability per class
pred = Dense(units=19, activation="softmax", name="softmax")(hidden_repr)
sentiment_model = Model(inputs=[sent_in], outputs=[pred])

In [25]:
# The network ends in a 19-way softmax trained against one-hot targets, so the
# matching loss is categorical cross-entropy. With "binary_crossentropy" the
# "accuracy" metric is computed per output unit and looks misleadingly high.
sentiment_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy",])

# The samples were appended class by class, and validation_split holds out the
# *last* 20% of the rows without shuffling them first. Permute the samples with
# a fixed seed so the validation set covers all classes.
shuffle_idx = np.random.RandomState(0).permutation(len(X_matrix))
sentiment_model.fit(X_matrix[shuffle_idx], y[shuffle_idx], epochs=2, validation_split=0.2)

  "This may consume a large amount of memory." % num_elements)


Train on 182400 samples, validate on 45600 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f08cd7c4940>

In [26]:
#save model
from keras.models import model_from_json

# architecture -> JSON
architecture_json = sentiment_model.to_json()
with open("new_model_2.json", mode="w") as arch_file:
    arch_file.write(architecture_json)

# weights -> HDF5
# NOTE(review): the weights file has no "new_" prefix although the JSON file
# does - confirm which name downstream loaders expect.
sentiment_model.save_weights("model_2.h5")
print("Saved model to disk")

Saved model to disk


In [27]:
# Drop the notebook's reference to the trained model before building the next one.
sentiment_model = None

In [28]:
# model 1: two stacked bidirectional LSTMs on top of the word vectors
hidden_size = 16  # not used by this architecture
vocab_size, embed_size = embeddings_with_unk.shape

# input: exactly 20 integer word indices per sentence
sent_in = Input(shape=(20,), dtype="int32", name="sent_in")

# embedding table initialised from the pre-trained vectors
embed_layer = Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    weights=[embeddings_with_unk],
    name="word_vec",
)
word_vecs = Activation('tanh')(embed_layer(sent_in))
embed_drop = SpatialDropout1D(rate=0.25, name='embed_drop')
word_vecs = embed_drop(word_vecs)

# both recurrent layers return the full sequence so they can be concatenated below
lstm_0_output = Bidirectional(LSTM(units=512, return_sequences=True), name="bi_lstm_0")(word_vecs)
lstm_1_output = Bidirectional(LSTM(units=512, return_sequences=True), name="bi_lstm_1")(lstm_0_output)

# join both LSTM outputs with the word vectors, then average over the sequence
stacked = concatenate([lstm_1_output, lstm_0_output, word_vecs])
x = GlobalAveragePooling1D()(stacked)

# one probability per class
pred = Dense(units=19, activation="softmax", name="softmax")(x)
sentiment_model = Model(inputs=[sent_in], outputs=[pred])

In [29]:
# 19-way softmax with one-hot targets -> categorical cross-entropy.
# ("binary_crossentropy" scores every output unit independently and makes the
# reported "accuracy" misleadingly high.)
sentiment_model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy",])

# Rows are ordered by class and validation_split takes the last 20% of them
# unshuffled, so permute the samples (fixed seed) to get a validation set that
# covers all classes.
shuffle_idx = np.random.RandomState(0).permutation(len(X_matrix))
sentiment_model.fit(X_matrix[shuffle_idx], y[shuffle_idx], epochs=2, validation_split=0.2)

  "This may consume a large amount of memory." % num_elements)


Train on 182400 samples, validate on 45600 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f094ff21518>

In [30]:
#save model
from keras.models import model_from_json

# architecture -> JSON
architecture_json = sentiment_model.to_json()
with open("new_model_3.json", mode="w") as arch_file:
    arch_file.write(architecture_json)

# weights -> HDF5
sentiment_model.save_weights("new_model_3.h5")
print("Saved model to disk")

Saved model to disk
