# Smooth Inverse Frequency with MLP

## Install Requeriments

In [2]:
import tensorflow as tf
import keras 

from keras.utils import to_categorical
from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, Dense, concatenate, Activation, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding

import lib.xmlreader as xml
import lib.utils as ut
import numpy as np
import random
import gensim
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

## Add Google Drive Files

In [None]:
from google.colab import drive
drive.mount('/content/drive')
DATA_PATH='/content/drive/My Drive/TASS'

## Pre-Process TASS Dataset

In [None]:
train_docs=xml.readXML(DATA_PATH+"/2017/2017-train.xml",[0,1,2,3])
dev_docs=xml.readXML(DATA_PATH+"/2017/2017-train.xml",[0,1,2,3])
test_docs=xml.readXML(DATA_PATH+"/2017/2017-train.xml",[0,1,2,3])

In [5]:
train_labels = []
for train_doc in train_docs:
    train_labels.append(train_doc.polarity)
dev_labels   = []
for dev_doc in dev_docs:
    dev_labels.append(dev_doc.polarity)

In [6]:
POS_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 0]
NEG_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 1]
NEU_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 2]
NON_train_docs = [train_docs[i] for i in range(len(train_labels)) if train_labels[i] == 3]

level_train_docs = [POSI_train_docs,NEGA_train_docs,NEUT_train_docs,NONE_train_docs]

### Get the same number of examples per class

In [8]:
min_number = min(len(POSI_train_docs),len(NEGA_train_docs),len(NEUT_train_docs),len(NONE_train_docs))
print(minSentLvl)

166


In [9]:
new_train_docs = []
for i in range(len(level_train_docs)):
    level_per = random.sample(level_train_docs[i],len(level_train_docs[i]))
    new_train_docs.append(level_per[:min_number])

In [11]:
flat_train_docs = [item for sublist in new_train_docs for item in sublist]
shuf_train_docs = random.sample(flat_train_docs,len(flat_train_docs))
print("shuf_train_docs size = ", len(shuf_train_docs))

shuf_train_docs size =  664


In [12]:
corpus = []
for doc in shuf_train_docs + dev_docs + test_docs:
    corpus.append(doc.content)

In [13]:
print("Sentences = ", (len(test_docs + dev_docs + shuf_train_docs)))

Sentences =  2592


In [14]:
shuf_train_labels = []
for doc in shuf_train_docs:
    shuf_train_labels.append(doc.polarity)

## Process with SBW 300 Features Model

In [15]:
def gensim_load_vec(path="../database/embeddings/SBW-vectors-300-min5.bin"):
    gensim_emb =  gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    vocab = gensim_emb.index2word
    vec = gensim_emb.syn0
    shape = gensim_emb.syn0.shape
    return gensim_emb, vec, shape, vocab

In [None]:
gensim_emb, vec, shape, vocab = gensim_load_vec()

In [17]:
counter = CountVectorizer(tokenizer=ut.tokenizer)

In [18]:
X = counter.fit_transform(corpus)
print(X.shape)

(2592, 8012)


In [19]:
VOCAB_SIZE = X.shape[1]

caption_texts = corpus
Xc = counter.fit_transform(caption_texts).todense().astype("float")
print(Xc.shape)

(2592, 8012)


In [23]:
sent_lens = np.sum(Xc, axis=1).astype("float")
sent_lens[sent_lens == 0] = 1e-14
print(sent_lens.shape)

(2592, 1)


In [20]:
embedding_matrix = np.zeros((VOCAB_SIZE, 300), np.float)
for word in list(counter.vocabulary_.keys()):
    try:
        i = counter.vocabulary_[word]
        embedding_matrix[i] = gensim_emb[word]
    except KeyError:
        pass

In [24]:
Xb = np.divide(np.dot(Xc, embedding_matrix), sent_lens)
print(Xb.shape)

(2592, 300)


In [25]:
train_tweets = []
train_labels = shuf_train_labels
for doc in shuf_train_docs:
    train_tweets.append(doc.content)

dev_tweets = []
for doc in dev_docs:
    dev_tweets.append(doc.content)

test_tweets = []
for doc in test_docs:
    test_tweets.append(doc.content)

In [30]:
Xtrain = Xb[0:len(train_tweets)]
Xdev   = Xb[ len(train_tweets):len(train_tweets) + len(dev_tweets)]
Xtest  = Xb[-len(test_tweets):]
ytrain = np.array(train_labels)
ydev   = np.array(dev_labels)

print(Xtrain.shape)
print(Xdev.shape)
print(Xtest.shape)

print(len(train_labels))
print(len(dev_labels))

(664, 300)
(500, 300)
(1428, 300)
664
500


## MLP Keras Model

In [34]:
tweet_encoder = Input(shape=(300,), dtype='float32')
join = Dense(300)(tweet_encoder)
join = Dropout(0.5)
join = Dense(128)(tweet_encoder)
join = Dropout(0.5)
join = Dense(64)(tweet_encoder)
join = Dropout(0.2)(join)
join = Dense(4)(join)
output = Activation('softmax')(join)
model  = Model(inputs=[tweet_encoder], outputs=[output])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 300)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                19264     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 4)                 260       
_________________________________________________________________
activation_1 (Activation)    (None, 4)                 0         
Total params: 19,524
Trainable params: 19,524
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.fit(np.concatenate((Xtrain,Xdev)), to_categorical(np.concatenate((ytrain,ydev))),
batch_size=128, epochs=25,validation_data=(Xdev, to_categorical(ydev)),verbose=1)

Train on 1164 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50


