In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from textblob import Word
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense, CuDNNLSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Read the four tab-separated training files, one per emotion, in the fixed
# order anger, fear, joy, sadness. Later cells index train_df by position.
train_paths = (
    'train/EI-reg-En-anger-train.txt',
    'train/EI-reg-En-fear-train.txt',
    'train/EI-reg-En-joy-train.txt',
    'train/EI-reg-En-sadness-train.txt',
)
train_df = [pd.read_table(path) for path in train_paths]
# Keep the individual names bound to the very same frames held in train_df.
df_anger, df_fear, df_joy, df_sadness = train_df

In [3]:
# Preprocess the 'Tweet' column of every training frame.
# NOTE: the frames in train_df are modified in place.
stop = stopwords.words('english')
# Set membership is O(1); scanning the list for every word is needlessly slow.
stop_lookup = set(stop)
for train in train_df:
    # 1. Lower-case every word and drop @mentions.
    train['Tweet'] = train['Tweet'].apply(lambda x: " ".join(word.lower() for word in x.split() if not word.startswith('@')))
    # 2. Drop English stop words (done before punctuation is stripped, as before).
    train['Tweet'] = train['Tweet'].apply(lambda x: " ".join(word for word in x.split() if word not in stop_lookup))
    # 3. Strip every character that is neither a word character nor whitespace.
    #    regex=True is explicit because pandas >= 2.0 otherwise treats the
    #    pattern as a literal string and removes nothing; the raw string avoids
    #    the invalid '\w' escape-sequence warning.
    train['Tweet'] = train['Tweet'].str.replace(r'[^\w\s]', '', regex=True)
    # 4. Collapse any run of a repeated character to exactly two ("soooo" -> "soo").
    train['Tweet'] = train['Tweet'].apply(lambda x: " ".join([re.sub(r'(.)\1+', r'\1\1', word) for word in x.split()]))
    # 5. Lemmatisation is intentionally disabled; to enable it:
#     train['Tweet'] = train['Tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [4]:
# Keep only the tweets whose intensity score exceeds the threshold.
# A higher threshold gave better results, but it shrinks the training set.
th = 0.6  # threshold

# Filter each emotion frame once, then reuse the filtered frames for both columns.
above_th = [frame[frame['Intensity Score'] > th] for frame in train_df]

tweet_column = pd.concat([part['Tweet'] for part in above_th], axis=0)
tweets = tweet_column.values

affect_column = pd.concat([part['Affect Dimension'] for part in above_th], axis=0)
labels = affect_column.values

# One-hot encode the emotion names into one indicator column per emotion.
labels = pd.get_dummies(labels).values

In [5]:
# Number of samples that survived the intensity threshold.
labels.shape[0]

2219

In [7]:
maxlen = 140             # every sequence is padded/truncated to this length
training_samples = 2000  # rows used for training; all remaining rows validate
max_words = 10000  # We will only consider the top 10000 words in the dataset
split_seed = 42          # fixed seed so the train/validation split is reproducible

# Turn each tweet into a sequence of integer word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

n_samples = data.shape[0]
if labels.shape[0] != n_samples:
    raise ValueError('data has %d rows but labels has %d; re-run the cell that builds '
                     'tweets/labels' % (n_samples, labels.shape[0]))
if n_samples <= training_samples:
    raise ValueError('Only %d samples available; need more than training_samples=%d '
                     'to leave a validation set' % (n_samples, training_samples))
# Derived from the data rather than hard-coded, so changing the intensity
# threshold can neither drop samples nor leave the validation set empty.
validation_samples = n_samples - training_samples

# Split the data into a training set and a validation set.
# Shuffle first: the samples are concatenated emotion by emotion, so an
# unshuffled split would not contain a mix of classes.
# `data` and `labels` are deliberately NOT overwritten with shuffled copies:
# doing so made this cell non-idempotent (on a re-run `data` is rebuilt in
# the original order while `labels` stays shuffled, misaligning the two).
indices = np.random.RandomState(split_seed).permutation(n_samples)
train_idx = indices[:training_samples]
val_idx = indices[training_samples: training_samples + validation_samples]

x_train = data[train_idx]
y_train = labels[train_idx]
x_val = data[val_idx]
y_val = labels[val_idx]

Found 5838 unique tokens.
Shape of data tensor: (2219, 140)
Shape of label tensor: (2219, 4)


In [8]:
import os
glove_dir = 'glove/'

# Build a {word: vector} lookup from the GloVe text file. Each non-empty line
# is "<word> <float> <float> ...", split on whitespace.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [9]:
embedding_dim = 100

# Row i of the matrix holds the pretrained vector for the word with tokenizer
# index i. Rows stay all-zero for words missing from embeddings_index.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, idx in word_index.items():
    if idx >= max_words:
        # Index falls outside the matrix; nothing to store.
        continue
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[idx] = pretrained

In [10]:
# Embedding (initialised from embedding_matrix) -> CuDNN LSTM with 32 units
# -> 4-way softmax, one output per emotion class.
# Variant without pretrained weights:
#   Embedding(max_words, embedding_dim, input_length=maxlen)
model = Sequential([
    Embedding(max_words, embedding_dim, weights=[embedding_matrix], input_length=maxlen),
    CuDNNLSTM(32),
    Dense(4, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# Train for 15 epochs, evaluating on the held-out validation split each epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=15,
)

Train on 2000 samples, validate on 219 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [20]:
# Hand-written sentences used to spot-check the trained classifier.
test_list = [
    'This makes me furious.',
    'I am so sorry for your loss',
    'The movie so bad.',
    'Congratulations you finally passed the exam.',
    'Everything is doomed to fail.',
]

In [21]:
# Encode the spot-check sentences with the tokenizer fitted on the training
# tweets, then pad them to the same length as the training data.
test_seq = tokenizer.texts_to_sequences(texts=test_list)
test = pad_sequences(sequences=test_seq, maxlen=maxlen)

In [22]:
# Class probabilities rounded to 3 decimals; columns are anger, fear, joy, sadness.
res = np.round(model.predict(test), 3)
for sentence, probs in zip(test_list, res):
    # One sentence with its probabilities per line, followed by a blank line.
    print(sentence, probs, end='\n\n')

This makes me furious. [0.996 0.001 0.001 0.001]

I am so sorry for your loss [0.    0.001 0.    0.998]

The movie so bad. [0.009 0.818 0.011 0.162]

Congratulations you finally passed the exam. [0.002 0.001 0.995 0.003]

Everything is doomed to fail. [0.02  0.559 0.078 0.343]



In [None]:
# Ad-hoc lookup: the stored vector for one word, or None if the word is absent.
embeddings_index.get('motorola', None)