In [1]:
# ********* Import Packages ********* # 
import string, random, glob, re, pickle, random, csv, nltk, emoji, operator
from collections import Counter
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize
import numpy as np
from numpy import array, asarray, zeros
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MultiLabelBinarizer
from nltk.stem.wordnet import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import StratifiedKFold
from sklearn.cross_validation import KFold
from emoji.unicode_codes import UNICODE_EMOJI
from sklearn.metrics import precision_recall_fscore_support
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Model, load_model, Sequential
from keras.layers import Input, Dense, Flatten, Dropout, Embedding, GlobalMaxPooling1D
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping
from keras.callbacks import Callback
from keras import optimizers

Using TensorFlow backend.


In [2]:
# ********* Hyper-parameters configurations ********* # 

# Fix your seed
seed = 66
np.random.seed(seed)

# List of emotions you are going to use in ascending order
emotion_categories = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
num_categories = len(emotion_categories)

# Word and Hash-emo embedding dimension
dimension = 100

# Lexical feature dimension
feature_dimension = 29

filters = [128, 128, 128, 128]
dropout_rates = [0.5, 0.5, 0.5, 0.5]
kernel_sizes = [1, 2, 3, 1]
hidden = [200, 100, 10]

epochs = 5
batch_size = 64

embedding_dir = '../Deep-Learning-Resources/embeddings/glove.twitter.27B/glove.twitter.27B.100d.txt'

In [3]:
# ********* Load Data ********* # 
# Important: Change load_data() function according to the dataset that you have. The following 
# function processes the Twitter Emotion Corpus (TEC)
# Link of paper: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.383.3384&rep=rep1&type=pdf
# Link of dataset: http://saifmohammad.com/WebPages/SentimentEmotionLabeledData.html

# List of tweets
texts = []

# List of labels
labels = []

def load_data():
    with open('tweets.txt', 'r') as f:
        for line in f:
            splitted = line.strip().split()
            labels.append(splitted[len(splitted)-1])
            texts.append(' '.join(splitted[1:len(splitted)-2]))
    print('Loaded %s  data' % len(labels))

print("Loading data...")
load_data()
print(Counter(labels))

# Example
print(texts[55])
print(labels[55])

Loading data...
Loaded 21051  data
Counter({'joy': 8240, 'surprise': 3849, 'sadness': 3830, 'fear': 2816, 'anger': 1555, 'disgust': 761})
literally haven't seen the sun in a week and it's finally coming out!
joy


In [4]:
# ********* Load Lexicons ********* # 
bingliu_mpqa = {}
nrc_emotion = {}
nrc_affect_intensity = {}
nrc_hashtag_emotion = {}
afinn = {}
ratings = {}
stopwords = []
slangs = {}
negated = {}
emoticons = []

# Vader
analyzer = SentimentIntensityAnalyzer()

def load_lexicons():    
    # Ratings by Warriner et al. (2013)
    with open('lexicons/Ratings_Warriner_et_al.csv', 'r') as f:
        reader = csv.reader(f)
        rows = list(reader)
    for i in range(1, len(rows)):
        # Normalize values
        valence = (float(rows[i][2]) - 1.0)/(9.0-1.0)
        arousal = (float(rows[i][5]) - 1.0)/(9.0-1.0)
        dominance = (float(rows[i][8]) - 1.0)/(9.0-1.0)
        ratings[rows[i][1]] = {"Valence": valence, "Arousal": arousal, "Dominance": dominance}
        
    
    # NRC Emotion Lexicon (2014)
    with open('lexicons/NRC-emotion-lexicon-wordlevel-v0.92.txt', 'r') as f:
        f.readline()
        for line in f:
            splitted = line.strip().split('\t')
            if splitted[0] not in nrc_emotion:
                nrc_emotion[splitted[0]] = {'anger': float(splitted[1]),
                                                    'disgust': float(splitted[3]),
                                                    'fear': float(splitted[4]),
                                                    'joy': float(splitted[5]),
                                                    'sadness': float(splitted[8]),
                                                    'surprise': float(splitted[9])}

    # NRC Affect Intensity (2018)
    with open('lexicons/nrc_affect_intensity.txt', 'r') as f:
        f.readline()
        for line in f:
            splitted = line.strip().split('\t')
            if splitted[0] not in nrc_affect_intensity:
                nrc_affect_intensity[splitted[0]] = {'anger': float(splitted[1]),
                                                    'disgust': float(splitted[3]),
                                                    'fear': float(splitted[4]),
                                                    'joy': float(splitted[5]),
                                                    'sadness': float(splitted[8]),
                                                    'surprise': float(splitted[9])}
                
    # NRC Hashtag Emotion Lexicon (2015)
    with open('Lexicons/NRC-Hashtag-Emotion-Lexicon-v0.2.txt', 'r') as f:
        f.readline()
        for line in f:
            splitted = line.strip().split('\t')
            splitted[0] = splitted[0].replace('#','')
            if splitted[0] not in nrc_hashtag_emotion:
                nrc_hashtag_emotion[splitted[0]] = {'anger': float(splitted[1]),
                                                    'disgust': float(splitted[3]),
                                                    'fear': float(splitted[4]),
                                                    'joy': float(splitted[5]),
                                                    'sadness': float(splitted[8]),
                                                    'surprise': float(splitted[9])}
                
                
    # BingLiu (2004) and MPQA (2005)
    with open('lexicons/BingLiu.txt', 'r') as f:
        for line in f:
            splitted = line.strip().split('\t')
            if splitted[0] not in bingliu_mpqa:
                bingliu_mpqa[splitted[0]] = splitted[1]
    with open('lexicons/mpqa.txt', 'r') as f:
        for line in f:
            splitted = line.strip().split('\t')
            if splitted[0] not in bingliu_mpqa:
                bingliu_mpqa[splitted[0]] = splitted[1]
    
    
    with open('lexicons/AFINN-en-165.txt', 'r') as f:
        for line in f:
            splitted = line.strip().split('\t')
            if splitted[0] not in afinn:
                score = float(splitted[1])
                normalized_score = (score - (-5)) / (5-(-5))
                afinn[splitted[0]] = normalized_score
                
    
    with open('lexicons/stopwords.txt', 'r') as f:
        for line in f:
            stopwords.append(line.strip())

    with open('lexicons/slangs.txt', 'r') as f:
        for line in f:
            splitted = line.strip().split(',', 1)
            slangs[splitted[0]] = splitted[1]
            
    with open('lexicons/negated_words.txt', 'r') as f:
        for line in f:
            splitted = line.strip().split(',', 1)
            negated[splitted[0]] = splitted[1]
            
    with open('lexicons/emoticons.txt', 'r') as f:
        for line in f:
            emoticons.append(line.strip())
load_lexicons()

In [5]:
# ********* Helper Functions ********* # 
def char_is_emoji(character):
    return character in emoji.UNICODE_EMOJI

def text_has_emoji(text):
    for character in text:
        if character in emoji.UNICODE_EMOJI:
            return True
    return False

def clean_tweets(texts):
    cleaned_tweets = []
    hash_emos = []

    for text in texts:
        hash_emo = []
        text = re.sub('(!){2,}', ' <!repeat> ', text)
        text = re.sub('(\?){2,}', ' <?repeat> ', text)
        
        # Tokenize using tweet tokenizer
        tokenizer = TweetTokenizer(strip_handles=False, reduce_len=True)
        tokens = tokenizer.tokenize(text.lower())
        lemmatizer = WordNetLemmatizer()
        
        
        # Emojis and emoticons
        if text_has_emoji(text):
            temp = []
            for word in tokens:
                if char_is_emoji(word):
                    hash_emo.append(UNICODE_EMOJI[word])
                elif word in emoticons:
                    hash_emo.append(word)
                else:
                    temp.append(word)
            tokens = temp
            
        # Hashtags
        temp = []
        for word in tokens:
            if '#' in word:
                word = word.replace('#','')
                hash_emo.append(word)
            else:
                temp.append(word)
        tokens = temp
            
        # Replace slangs and negated words
        temp = []
        for word in tokens:
            if word in slangs:
                temp += slangs[word].split()
            elif word in negated:
                temp += negated[word].split()
            else:
                temp.append(word)
        tokens = temp

        # Replace user names
        tokens = ['<user>'  if '@' in word else word for word in tokens]
        
        #Replace numbers
        tokens = ['<number>' if word.isdigit() else word for word in tokens]
        
        # Remove urls
        tokens = ['' if 'http' in word else word for word in tokens]
        
        # Lemmatize
        #tokens = [lemmatizer.lemmatize(word) for word in tokens]
        
        # Remove stop words
        tokens = [word for word in tokens if word not in stopwords]

        # Remove tokens having length 1
        tokens = [word for word in tokens if word != '' and len(word) > 1]
        
        cleaned_tweets.append(tokens)
        hash_emos.append(hash_emo)

    return cleaned_tweets, hash_emos


# This function returns a n-dimensional feature vector
def feature_generation(texts, hashtags):
    feature_vectors = []

    for i in range(len(texts)):
        feats = [0] * feature_dimension
        for word in texts[i]:
            # Warriner er al.
            if word in ratings:
                feats[0] += ratings[word]['Valence']
                feats[1] += ratings[word]['Arousal']
                feats[2] += ratings[word]['Dominance']

            # Vader Sentiment
            polarity_scores = analyzer.polarity_scores(word)
            feats[3] += polarity_scores['pos']
            feats[4] += polarity_scores['neg']
            feats[5] += polarity_scores['neu']

            # NRC Emotion
            if word in nrc_emotion:
                feats[6] += nrc_emotion[word]['anger']
                feats[7] += nrc_emotion[word]['disgust']
                feats[8] += nrc_emotion[word]['fear']
                feats[9] += nrc_emotion[word]['joy']
                feats[10] += nrc_emotion[word]['sadness']
                feats[11] += nrc_emotion[word]['surprise']

            # NRC Affect Intensity
            if word in nrc_affect_intensity:
                feats[12] += nrc_affect_intensity[word]['anger']
                feats[13] += nrc_affect_intensity[word]['disgust']
                feats[14] += nrc_affect_intensity[word]['fear']
                feats[15] += nrc_affect_intensity[word]['joy']
                feats[16] += nrc_affect_intensity[word]['sadness']
                feats[17] += nrc_affect_intensity[word]['surprise']

            # AFINN
            if word in afinn:
                feats[18] += float(afinn[word])

            # BingLiu and MPQA
            if word in bingliu_mpqa:
                if bingliu_mpqa[word] == 'positive':
                    feats[19] += 1
                else:
                    feats[20] += 1


        count = len(texts[i])
        if count == 0:
            count = 1
        newArray = np.array(feats)/count
        feats = list(newArray)
        
        # Presence of consecutive exclamation mark or question mark
        for word in texts[i]:
            if word == '<!REPEAT>':
                feats[21] = 1
            elif word == '<?REPEAT>':
                feats[22] = 1

        for word in hashtags[i]:
            #NRC Hashtag Emotion
            if word in nrc_hashtag_emotion:
                feats[23] += nrc_hashtag_emotion[word]['anger']
                feats[24] += nrc_hashtag_emotion[word]['disgust']
                feats[25] += nrc_hashtag_emotion[word]['fear']
                feats[26] += nrc_hashtag_emotion[word]['joy']
                feats[27] += nrc_hashtag_emotion[word]['sadness']
                feats[28] += nrc_hashtag_emotion[word]['surprise']
        
        feature_vectors.append(feats)
    return np.array(feature_vectors)

def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

def max_length(lines):
    return max([len(s) for s in lines])

def encode_text(tokenizer, lines, length):
    encoded = tokenizer.texts_to_sequences(lines)
    padded = pad_sequences(encoded, maxlen=length, padding='post')
    return padded

In [6]:
# ********* Model ********* # 
class TestCallback(Callback):
    def __init__(self, test_data):
        self.test_data = test_data
        self.accs = []

    def on_epoch_end(self, epoch, logs={}):
        x, y = self.test_data
        loss, acc = self.model.evaluate(x, y, verbose=0)
        self.accs.append(acc)
        print('\nTesting loss: {}, acc: {}\n'.format(loss, acc))
        
def model(max_tweet_length, max_hash_emo_length, vocab_size, vocab_size_hash_emo, tweet_matrix, hash_emo_matrix, dimension, feature_dimension, num_categories, train_embedding=False):
    # Channel 1
    inputs1 = Input(shape=(max_tweet_length,))
    embedding1 = Embedding(vocab_size, dimension, weights=[tweet_matrix], trainable=train_embedding)(inputs1)

    conv1 = Conv1D(filters=filters[0], kernel_size=kernel_sizes[0], activation='relu')(embedding1)
    drop1 = Dropout(dropout_rates[0])(conv1)
    pool1 = GlobalMaxPooling1D()(drop1)

    conv2 = Conv1D(filters=filters[1], kernel_size=kernel_sizes[1], activation='relu')(embedding1)
    drop2 = Dropout(dropout_rates[1])(conv2)
    pool2 = GlobalMaxPooling1D()(drop2)

    conv3 = Conv1D(filters=filters[2], kernel_size=kernel_sizes[2], activation='relu')(embedding1)
    drop3 = Dropout(dropout_rates[2])(conv3)
    pool3 = GlobalMaxPooling1D()(drop3)

    # Channel 2
    inputs2 = Input(shape=(max_hash_emo_length,))
    embedding2 = Embedding(vocab_size_hash_emo, dimension, weights=[hash_emo_matrix], trainable=train_embedding)(inputs2)
    conv4 = Conv1D(filters=filters[3], kernel_size=kernel_sizes[3], activation='relu')(embedding2)
    drop4 = Dropout(dropout_rates[3])(conv4)
    pool4 = GlobalMaxPooling1D()(drop4)

    # Lexical features
    features = Input(shape=(feature_dimension,))

    merged = concatenate([pool1, pool2, pool3, pool4, features])
    dense1 = Dense(hidden[0], activation='relu')(merged)
    dense2 = Dense(hidden[1], activation='relu')(dense1)
    dense3 = Dense(hidden[2], activation='relu')(dense2)
    outputs = Dense(num_categories, activation='softmax')(dense3)
    
    model = Model(inputs=[inputs1, inputs2, features], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #print(model.summary())
    #plot_model(model, show_shapes=True, to_file='multichannel.png')
    
    return model

In [7]:
print("Cleaning Data...")
cleaned_tweets, hash_emos = clean_tweets(texts)
print("Cleaning Completed!")

print("Generating Features...")
features = feature_generation(cleaned_tweets, hash_emos)
print("Feature Generation Completed!")


print("Encoding Data...")
# For Tweet Matrix
tokenizer_tweets = create_tokenizer(cleaned_tweets)
max_tweet_length = max_length(cleaned_tweets)
vocab_size = len(tokenizer_tweets.word_index) + 1
print('Vocabulary size: %d' % vocab_size)
X = encode_text(tokenizer_tweets, cleaned_tweets, max_tweet_length)

# For Hash-Emo Matrix
tokenizer_hash_emo = create_tokenizer(hash_emos)
max_hash_emo_length = max_length(hash_emos)
vocab_size_hash_emo = len(tokenizer_hash_emo.word_index) + 1
print('Vocabulary size (Hash-Emos): %d' % vocab_size_hash_emo)
encoded_hash_emo = encode_text(tokenizer_hash_emo, hash_emos, max_hash_emo_length)

# Labels
lb = LabelBinarizer()
lb.fit(labels)
Y = lb.transform(labels)
print("Encoding Completed!")


# Load embedding
print("Loading word embeddings...")
embeddings_index = dict()
f = open(embedding_dir)
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))


# Generate embedding matrices
print("Generating embedding matrices...")
tweet_matrix = zeros((vocab_size, dimension))
for word, i in tokenizer_tweets.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        tweet_matrix[i] = np.array(list(embedding_vector))
    else:
        tweet_matrix[i] = np.array(list(np.random.uniform(low=-1, high=1, size=(100,))))

hash_emo_matrix = zeros((vocab_size_hash_emo, dimension))

for word, i in tokenizer_hash_emo.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        hash_emo_matrix[i] = np.array(list(embedding_vector))
    else:
        hash_emo_matrix[i] = np.array(list(np.random.uniform(low=-1, high=1, size=(100,))))
print("Embedding matrices genearation completed!")

Cleaning Data...
Cleaning Completed!
Generating Features...
Feature Generation Completed!
Encoding Data...
Vocabulary size: 24672
Vocabulary size (Hash-Emos): 3533
Encoding Completed!
Loading word embeddings...
Loaded 1193514 word vectors.
Generating embedding matrices...
Embedding matrices genearation completed!


In [8]:
kf = KFold(len(labels), n_folds=10, shuffle=True, random_state=seed)

accuracies = []
counter = 1
for train, test in kf:
    print('Fold#', counter)
    counter += 1
    model_GloVe = model(max_tweet_length, 
                       max_hash_emo_length, 
                       vocab_size, 
                       vocab_size_hash_emo, 
                       tweet_matrix, 
                       hash_emo_matrix, 
                       dimension, 
                       feature_dimension,
                       num_categories, 
                       True)
    testObj = TestCallback(([X[test], encoded_hash_emo[test], features[test]], Y[test]))

    #earlystop = EarlyStopping(monitor='val_acc', min_delta=0.0001, patience=3, verbose=1, mode='auto')
    model_GloVe.fit([X[train], encoded_hash_emo[train], features[train]],
                    array(Y[train]),
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[testObj],
                    verbose = 1)
    scores = model_GloVe.evaluate([X[test], encoded_hash_emo[test], features[test]], Y[test], verbose=0)
    print("%s: %.2f%%" % (model_GloVe.metrics_names[1], scores[1]*100))
    index, value = max(enumerate(testObj.accs), key=operator.itemgetter(1))
    accuracies.append(value)

print(accuracies)
print(np.mean(accuracies))

Fold# 1
Epoch 1/5

Testing loss: 1.3120272347503692, acc: 0.4795821463054175

Epoch 2/5

Testing loss: 1.1313347520991268, acc: 0.5906932576995516

Epoch 3/5

Testing loss: 1.0720152396422167, acc: 0.6020892690389584

Epoch 4/5

Testing loss: 1.0975413583961986, acc: 0.5978157647654542

Epoch 5/5

Testing loss: 1.1148870996600202, acc: 0.6049382714917291

acc: 60.49%
Fold# 2
Epoch 1/5

Testing loss: 1.318865501625804, acc: 0.46840855123877806

Epoch 2/5

Testing loss: 1.1642483921911824, acc: 0.5705463182614705

Epoch 3/5

Testing loss: 1.1485669934551392, acc: 0.5719714964087389

Epoch 4/5

Testing loss: 1.1207367792265432, acc: 0.5833729215868861

Epoch 5/5

Testing loss: 1.161896063313065, acc: 0.5748218528023809

acc: 57.48%
Fold# 3
Epoch 1/5

Testing loss: 1.319908881640491, acc: 0.4935866984080815

Epoch 2/5

Testing loss: 1.2303967296369285, acc: 0.5320665081861183

Epoch 3/5

Testing loss: 1.1495162816058995, acc: 0.5757719715247528

Epoch 4/5

Testing loss: 1.12694447046221, a

In [None]:
from keras.models import load_model
model_GloVe.save("model.h5")
import pickle   
pickle.dump((lb, tokenizer_tweets, max_tweet_length, tokenizer_hash_emo, max_hash_emo_length, embeddings_index), open( "variables.p", "wb" ) )