## 1. Reading the database

In [1]:
import sys
import html
import json
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
import re

We transfor the original outputs into an categoritcal target

In [2]:
labels_dict = {}
labels_dict['P'] = [1,0,0,0]
labels_dict['N'] = [0,1,0,0]
labels_dict['NEU'] = [0,0,1,0]
labels_dict['NONE'] = [0,0,0,1]
print(labels_dict)

{'P': [1, 0, 0, 0], 'N': [0, 1, 0, 0], 'NEU': [0, 0, 1, 0], 'NONE': [0, 0, 0, 1]}


Next we create 2 functions to read the dataset provided by TASS

In [3]:
def read_xml(path):
    e = ET.parse(path)
    root = e.getroot()
    tweets = np.array([tweet.find('content').text for tweet in root.findall('tweet')])
    labels_aux = np.array([tweet.find('sentiment').find('polarity').find('value').text for tweet in root.findall('tweet')])
    labels = [labels_dict[i] for i in labels_aux]
    return tweets,labels
def read_xml_test(path):
    e = ET.parse(path)
    root = e.getroot()
    tweets = np.array([tweet.find('content').text for tweet in root.findall('tweet')])
    return tweets

The test dataset uses another function because is not labeled

In [4]:
DATA_PATH='./TASS/TASS2018/'
tweets_train,labels_train=read_xml(DATA_PATH+'Training.xml')
tweets_valid,labels_valid=read_xml(DATA_PATH+'Development.xml')
tweets_test=read_xml_test(DATA_PATH+'Test.xml')

In [5]:
print("Size train:",len(tweets_train),"\n")
print("Data train:\n",tweets_train[:2],"\n")
print("Size valid:",len(tweets_valid),"\n")
print("Data valid:\n",tweets_valid[:2],"\n")
print("Size test:",len(tweets_valid),"\n")

Size train: 1000 

Data train:
 ['Sin ser fan de Juan Gabriel, siempre supe que era una fuerza de la naturaleza. Hoy escuché "Querida", y me dio una ternura enorme.'
 'ayer preguntaban y dónde están las solteras!!!! todo mi grupo alza la mano y yo la única que no y todas voltean a verme AJAJAJAJAJJA'] 

Size valid: 500 

Data valid:
 ['Así te paguen bien... Si es a última hora... No se podrá... Y hoy me tocó servir, no es con pago económico, pero el pago me lo da el Rey'
 'Manolo: se llama H&M por Hombre y Mujer. Yo: ..pero.es una marca americana, no tendría sentido. Manolo: callate butch. Yo:'] 

Size test: 500 



Applying a shuffle to mix the training data

In [6]:
import random
shaffle_ids = random.sample(range(len(tweets_train)), len(tweets_train))
tweets=[tweets_train[i] for i in shaffle_ids]+[tw for tw in tweets_valid]+[tw for tw in tweets_test]
new_labels_train=[labels_train[i] for i in shaffle_ids]
print(len(shaffle_ids))
print(len(tweets))
print(len(new_labels_train))

1000
2928
1000


## 3. Generating the embedding

### 3.1. Loading a word2vec model

We have used a pre-trained word2vec model in order to analize the results.

In [7]:
from gensim.models import KeyedVectors
import gensim.models.word2vec

w2v =  gensim.models.KeyedVectors.load_word2vec_format("./word2vec.bin", binary=True)
vocab = w2v.index2word


### 3.2. Tweets to sequences

In [8]:
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
counter = CountVectorizer(tokenizer=Tokenizer)
tokenizer  = Tokenizer()
tokenizer.fit_on_texts(tweets)
#seq = tokenizer.texts_to_sequences(tweets)
reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
len(reverse_word_map)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


9984

After recognize the number of words in the dataset we proced to extract their features in the worrd2vec model, creating the embbeding_matrix

In [9]:
embedding_matrix = []
dict2={}
idx=0
for i in range(len(reverse_word_map)):
    w=reverse_word_map.get(i)
    if w in vocab:
        embedding_matrix.append(w2v[w])
        dict2[w]=idx
        if i%1000==0:
            print(i,": ",idx,": ",w)
        idx+=1
print(idx)
embedding_matrix=np.asarray(embedding_matrix)
print(embedding_matrix.shape)

2000 :  1835 :  pasaron
4000 :  3371 :  bandas
5000 :  4021 :  alcanzaba
7000 :  5373 :  deuda
9000 :  6710 :  azulado
7272
(7272, 300)


Now we update de tokenizer with new values using only the word presents in the loaded word2vec model

In [10]:
X=[]
idx=0
for tw in tweets:
    idx+=1
    Xr=[]
    for w in tw.split():
        if w in dict2:
            Xr.append(dict2[w])
    X.append(Xr)

Once processed we split again the dataset in train, validation and test

In [11]:
from keras.preprocessing.sequence import pad_sequences
maxl=30
x_train = pad_sequences(X[:len(tweets_train)], maxlen=maxl)
x_valid   = pad_sequences(X[len(tweets_train):-len(tweets_test)], maxlen=maxl)
x_test   = pad_sequences(X[-len(tweets_test):], maxlen=maxl)
y_train=np.asarray(new_labels_train)
y_valid=np.asarray(labels_valid)
print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)
print(y_train.shape)
print(y_valid.shape)

(1000, 30)
(500, 30)
(1428, 30)
(1000, 4)
(500, 4)


## 3. Defining the architecture

We add some configurations in order to get compiling

In [12]:
import tensorflow as tf
from keras import backend as K
config = tf.ConfigProto(intra_op_parallelism_threads=4, allow_soft_placement=True, device_count = {'CPU' : 1, 'GPU' : 0})
sess = tf.Session(config=config)
K.set_session(sess)

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

from keras.layers import Input, Dense, concatenate, Activation
from keras.models import Model
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Dense, Embedding, LSTM,TimeDistributed,Dropout,Bidirectional,GRU

lstm_out=16
VOCAB_SIZE=len(dict2)
input_layer = Input(shape=(maxl,), dtype='int32')

network   = Embedding(VOCAB_SIZE, 300, weights=[embedding_matrix], input_length=30, trainable=False)(input_layer)
network=LSTM(lstm_out, return_sequences=True)(network)
network=Dropout(0.5)(network)
network=LSTM(lstm_out)(network)
network=Dropout(0.5)(network)
network=Dense(4, activation='softmax')(network)
model = Model(inputs=[tweet_input], outputs=[network])
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])


print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 30)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 30, 300)           2181600   
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 16)            20288     
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 16)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 68        
Total para

## 4. Training

In [14]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_integers = np.argmax(y_train, axis=1)
class_weights = compute_class_weight('balanced', np.unique(y_integers), y_integers)
d_class_weights = dict(enumerate(class_weights))

In [15]:
model.fit(x_train,y_train, epochs=50,validation_data=(x_valid, y_valid))
#model.fit(x_train,y_train, epochs=50,validation_data=(x_valid, y_valid),class_weight=d_class_weights)

Train on 1000 samples, validate on 500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7fecfe367828>

In [16]:
result2 = model.predict(x_valid,batch_size=1,verbose = 2)

In [17]:
print(result2[:5])

[[0.01032623 0.90967005 0.01236943 0.06763427]
 [0.01412319 0.05688623 0.18665737 0.7423333 ]
 [0.22834596 0.03581598 0.0973129  0.6385252 ]
 [0.00376674 0.04132044 0.09123153 0.86368126]
 [0.00591933 0.36781576 0.00940099 0.6168639 ]]


In [18]:
result3=np.argmax(result2,axis=1)
print(result3[:10])

[1 3 3 3 3 2 0 1 0 3]


In [19]:
Y_target=np.argmax(y_valid,axis=1)
print(Y_target[:10])

[1 3 1 3 3 3 1 2 3 3]


In [20]:
from sklearn.metrics import confusion_matrix,f1_score
Result=confusion_matrix(Y_target, result3)
print(Result)

[[ 50   5   4  36]
 [ 22  26   9  49]
 [ 13  10   4  34]
 [ 47  28  16 147]]


In [21]:
print("Precision 0: ",Result[0,0]/(Result[0,0]+Result[0,1]+Result[0,2]+Result[0,3]))
print("Precision 1: ",Result[1,1]/(Result[1,0]+Result[1,1]+Result[1,2]+Result[1,3]))
print("Precision 2: ",Result[2,2]/(Result[2,0]+Result[2,1]+Result[2,2]+Result[2,3]))
print("Precision 3: ",Result[3,3]/(Result[3,0]+Result[3,1]+Result[3,2]+Result[3,3]))
print("General precision: ",np.trace(Result)/np.sum(Result))
print("F1_score: ",f1_score(Y_target, result3, average='macro'))

Precision 0:  0.5263157894736842
Precision 1:  0.24528301886792453
Precision 2:  0.06557377049180328
Precision 3:  0.6176470588235294
General precision:  0.454
F1_score:  0.3515278019540369


In [22]:
result2 = model.predict(x_test,batch_size=1,verbose = 2)
result3=np.argmax(result2,axis=1)

In [23]:
import xml.etree.ElementTree as ET

file = open("test_label.txt","w") 
e = ET.parse(DATA_PATH+'Test.xml')
root = e.getroot()
ID = np.array([tweet.find('tweetid').text for tweet in root.findall('tweet')])
idx=0
for i in ID:
    label=result3[idx]
    if label == 0:
        file.write(i + "\t" + "P" + "\n")
    elif label == 1:
        file.write(i + "\t" + "N" + "\n")
    elif label == 2:
        file.write(i + "\t" + "NEU" + "\n")
    elif label == 3:
        file.write(i + "\t" + "NONE" + "\n")
    
    idx+=1
file.close()


In [24]:
x_test.shape

(1428, 30)