In [25]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from textblob import Word
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense, CuDNNLSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [20]:
# Read the four per-emotion training files with pandas' read_table
# (tab-delimited by default), one DataFrame per emotion.
train_paths = [
    'train/EI-reg-En-anger-train.txt',
    'train/EI-reg-En-fear-train.txt',
    'train/EI-reg-En-joy-train.txt',
    'train/EI-reg-En-sadness-train.txt',
]
df_anger, df_fear, df_joy, df_sadness = [pd.read_table(path) for path in train_paths]

# Fixed order used by the later cells: anger, fear, joy, sadness.
train_df = [df_anger, df_fear, df_joy, df_sadness]

In [21]:
#preprocess
stop = stopwords.words('english')
stop_lookup = frozenset(stop)  # constant-time membership test instead of scanning the list per word


def clean_tweet(text):
    """Normalise one raw tweet string and return the cleaned string.

    Steps, in order:
      1. drop words starting with '@' and lower-case the remaining
         whitespace-separated words;
      2. drop English stop words;
      3. remove every character that is neither a word character nor whitespace;
      4. squeeze any run of a repeated character down to two ("soooo" -> "soo").
    """
    words = [word.lower() for word in text.split() if not word.startswith('@')]
    words = [word for word in words if word not in stop_lookup]
    # Use re.sub with a raw-string pattern instead of Series.str.replace with
    # its default `regex` argument: that default is False on pandas >= 2.0,
    # which would turn this step into a literal (no-op) replacement.
    stripped = re.sub(r'[^\w\s]', '', " ".join(words))
    return " ".join(re.sub(r'(.)\1+', r'\1\1', word) for word in stripped.split())


for train in train_df:
    train['Tweet'] = train['Tweet'].apply(clean_tweet)
#     Optional lemmatisation step, currently disabled:
#     train['Tweet'] = train['Tweet'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [98]:
# Keep only the tweets whose intensity score exceeds the threshold.
# A higher threshold gave better results, but it also shrinks the training set.
th = 0.6 #threshold

# Filter every emotion frame exactly once so that tweets and labels are taken
# from the same rows (and so that adding a frame to train_df needs no edits here).
strong = [frame[frame['Intensity Score'] > th] for frame in train_df]

tweets = pd.concat([part['Tweet'] for part in strong], axis=0).values

labels = pd.concat([part['Affect Dimension'] for part in strong], axis=0).values

# One-hot encode the emotion names: one column per distinct value.
labels = pd.get_dummies(labels).values

In [100]:
# Number of rows in the label array (one row per selected tweet).
labels.shape[0]

2219

In [61]:
# NOTE(review): `data` is assigned only in the tokenising/padding cell further
# down (the pad_sequences call), so when the notebook is run top-to-bottom on a
# fresh kernel this cell raises NameError. Move this inspection below that cell.
data

array([[   0,    0,    0, ...,  615,  837,    7],
       [   0,    0,    0, ...,   56,  150,  146],
       [   0,    0,    0, ..., 7749, 1465,   74],
       ...,
       [   0,    0,    0, ...,    0,  133,   54],
       [   0,    0,    0, ...,   53, 4663,   50],
       [   0,    0,    0, ...,  806,  809, 1908]])

In [101]:
maxlen = 140              # every tweet is padded/truncated to this many tokens
training_samples = 2000   # rows used for training; all remaining rows are used for validation
max_words = 10000  # We will only consider the top 10000 words in the dataset
split_seed = 42           # fixed seed so the train/validation split is reproducible

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

n_samples = data.shape[0]
if labels.shape[0] != n_samples:
    raise ValueError('data has %d rows but labels has %d rows' % (n_samples, labels.shape[0]))
if n_samples <= training_samples:
    raise ValueError('Only %d samples available; need more than training_samples=%d '
                     'to leave a validation set.' % (n_samples, training_samples))
# Derived from the data instead of hard-coded, so changing the intensity
# threshold upstream cannot silently leave the validation set empty or truncated.
validation_samples = n_samples - training_samples

# Split the data into a training set and a validation set.
# The rows are ordered by emotion (the per-emotion frames were concatenated),
# so pick them through a random permutation of the row indices.
# `data` and `labels` themselves are left in their original order: shuffling
# `labels` in place would misalign it with `data` (which is rebuilt from
# `tweets` above) whenever this cell is run a second time.
indices = np.random.RandomState(split_seed).permutation(n_samples)
train_idx = indices[:training_samples]
val_idx = indices[training_samples: training_samples + validation_samples]

x_train = data[train_idx]
y_train = labels[train_idx]
x_val = data[val_idx]
y_val = labels[val_idx]

Found 5838 unique tokens.
Shape of data tensor: (2219, 140)
Shape of label tensor: (2219, 4)


In [67]:
import os
glove_dir = 'glove/'

# Parse the GloVe text file into a dict mapping each token to its vector.
# Each non-empty line is: <token> <component> <component> ...
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf8') as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [102]:
embedding_dim = 100

# Row i of the matrix receives the pre-trained vector of the word whose
# tokenizer index is i. Rows for words without a pre-trained vector, and for
# indices that no word maps to, keep their initial zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, token_id in word_index.items():
    if token_id >= max_words:
        # Outside the vocabulary size the matrix was allocated for.
        continue
    vector = embeddings_index.get(token)
    if vector is None:
        # Words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[token_id] = vector

In [105]:
# Embedding layer initialised from embedding_matrix -> CuDNNLSTM with 32 units
# -> 4-unit softmax output.
# Alternative without the pre-trained weights:
#   Embedding(max_words, embedding_dim, input_length=maxlen)
model = Sequential([
    Embedding(max_words, embedding_dim, weights=[embedding_matrix], input_length=maxlen),
    CuDNNLSTM(32),
    Dense(4, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

# Keep the returned History object so the per-epoch metrics can be inspected later.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=32,
    epochs=15,
)

Train on 2000 samples, validate on 219 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [87]:
# Spot-check one entry of the embedding dict; dict.get yields None when the key is absent.
embeddings_index.get('motorola')

array([ 0.38116 , -0.23609 ,  0.30581 ,  0.40604 ,  0.47422 , -1.1652  ,
        0.58543 , -0.42291 , -0.43639 , -0.15835 ,  0.24699 , -0.018789,
        0.58137 , -0.49755 ,  0.041687, -0.5897  , -0.3294  ,  0.36925 ,
        0.78821 ,  0.14787 , -0.21123 , -0.31178 ,  0.19986 ,  1.8647  ,
       -0.051482,  0.23404 ,  0.50634 ,  0.41292 ,  0.24996 , -0.12791 ,
        0.33124 ,  1.2024  , -0.4176  ,  0.083039, -0.035185,  0.22507 ,
        0.046623,  0.044168,  1.0262  , -0.15211 ,  0.268   , -0.063099,
        0.098951, -0.26087 , -1.7165  ,  0.60704 ,  0.63593 ,  0.7041  ,
        0.23608 , -0.673   , -0.58736 ,  0.069367, -0.37012 , -0.32947 ,
       -0.10242 , -0.20711 , -0.69569 ,  0.021247,  0.65592 ,  0.09189 ,
       -0.13037 , -0.59474 ,  0.17548 ,  0.33628 , -0.72363 , -0.6054  ,
       -0.0123  ,  0.78532 ,  0.40969 ,  0.63428 ,  0.17487 ,  0.50056 ,
       -0.69332 , -0.59251 ,  0.17914 ,  0.46409 ,  0.79007 ,  0.12431 ,
       -0.22714 ,  0.34608 ,  1.3969  , -0.93868 , 