In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
dataset_1 = pd.read_csv('..\Datasets\Emotion_final.csv')
dataset_2 = pd.read_csv('..\Datasets\Text_emotion.csv')

In [3]:
dataset_2 = dataset_2[['content', 'sentiment']]

In [4]:
dataset_2.rename(columns={'content': 'Text',
                        'sentiment': 'Emotion'},
                        inplace=True, errors='raise')

In [5]:
dataset = pd.concat([dataset_1, dataset_2])

In [6]:
dataset[dataset['Emotion'] == "happiness"] = "happy"

In [7]:
dataset = dataset[dataset.Emotion != 'empty']

In [8]:
dataset.head(1)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness


In [9]:
dataset.shape

(60632, 2)

# Preprocessing techniques for textual data

In [10]:
text = dataset['Text']
for row in text[:10]:
    print(row)

i didnt feel humiliated
i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake
im grabbing a minute to post i feel greedy wrong
i am ever feeling nostalgic about the fireplace i will know that it is still on the property
i am feeling grouchy
ive been feeling a little burdened lately wasnt sure why that was
ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny
i feel as confused about life as a teenager or as jaded as a year old man
i have been with petronas for years i feel that petronas has performed well and made a huge profit
i feel romantic too


In [11]:
for index, row in dataset.iterrows():
    row['Text'] = row['Text'].lower()

Remove punctuation

In [12]:
import string
translator = str.maketrans('', '', string.punctuation)

for index, row in dataset.iterrows():
    row['Text'] = row['Text'].translate(translator)

Replace contraction

In [13]:
contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
def replaceContraction(text):
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text

In [14]:
for index, row in dataset.iterrows():
    row['Text'] = replaceContraction(row['Text'])

Replace negations

In [15]:
import nltk
from nltk.corpus import wordnet
nltk.download('punkt')

def replace(word, pos=None):
    """ Creates a set of all antonyms for the word and if there is only one antonym, it returns it """
    antonyms = set()
    for syn in wordnet.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms.add(antonym.name())
    if len(antonyms) == 1:
        return antonyms.pop()
    else:
        return None

def replaceNegations(text):
    """ Finds "not" and antonym for the next word and if found, replaces not and the next word with the antonym """
    i, l = 0, len(text)
    words = []
    while i < l:
        word = text[i]
        if word == 'not' and i+1 < l:
            ant = replace(text[i+1])
            if ant:
                words.append(ant)
                i += 2
                continue
        words.append(word)
        i += 1
    return words

def tokenize_negation(text):
    tokens = nltk.word_tokenize(text)
    tokens = replaceNegations(tokens)
    text = " ".join(tokens)
    return text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ifeol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
for index, row in dataset.iterrows():
    row['Text'] = tokenize_negation(row['Text'])

Replace stopwords

In [17]:
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

def tokenize_stopwords(text):
    finalTokens = []
    tokens = nltk.word_tokenize(text)
    for w in tokens:
        if (w not in stoplist):
            finalTokens.append(w)
    text = " ".join(finalTokens)
    return text

In [18]:
for index, row in dataset.iterrows():
    row['Text'] = tokenize_stopwords(row['Text'])

Lemmatize / Stemme

In [19]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer() #set stemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() # set lemmatizer

def tokenize_lemma(text):
    finalTokens = []
    tokens = nltk.word_tokenize(text)
    for w in tokens:
        finalTokens.append(lemmatizer.lemmatize(w)) # change this to lemmatizer.lemmatize(w) for Lemmatizing
    text = " ".join(finalTokens)
    return text

In [20]:
for index, row in dataset.iterrows():
    row['Text'] = tokenize_lemma(row['Text'])

In [21]:
dataset['Text']

0                                    didnt feel humiliated
1        go feeling hopeless damned hopeful around some...
2                im grabbing minute post feel greedy wrong
3        ever feeling nostalgic fireplace know still pr...
4                                          feeling grouchy
                               ...                        
39995                                      johnlloydtaylor
39996                                happy mother day love
39997    happy mother day mommy woman man long youre mo...
39998                                                happy
39999    mopedronin bullet train tokyo gf visiting japa...
Name: Text, Length: 60632, dtype: object

In [22]:
dataset.Emotion.value_counts()

happy         12238
sadness       11430
neutral        8638
worry          8459
love           5483
anger          3103
surprise       3066
fear           2652
fun            1776
relief         1526
hate           1323
enthusiasm      759
boredom         179
Name: Emotion, dtype: int64

In [23]:
dataset.shape

(60632, 2)

In [24]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2)

In [25]:
train_dataset.shape

(48505, 2)

In [26]:
from collections import Counter

def counter_word(text):
    count = Counter()
    for i in text.values:
        for word in i.split():
            count[word] +=1
    return count

In [27]:
text = train_dataset.Text
counter = counter_word(text)

In [28]:
len(counter)

45734

In [29]:
counter

Counter({'theevilgumby': 1,
         'oh': 698,
         'went': 315,
         'crowdhow': 1,
         'epic': 20,
         'dude': 88,
         'going': 1528,
         'garden': 58,
         'centre': 17,
         'today': 1387,
         'funn': 4,
         'currently': 47,
         'drinking': 59,
         'egyptian': 5,
         'spice': 6,
         'tea': 83,
         'watching': 356,
         'hollyoaks': 15,
         'feel': 12185,
         'entirely': 25,
         'free': 239,
         'express': 35,
         'way': 935,
         'surroundings': 5,
         'life': 888,
         'myriad': 1,
         'experience': 127,
         'continue': 60,
         'make': 1367,
         'jeremyrylan': 1,
         'im': 5453,
         'mobile': 35,
         'web': 35,
         'let': 460,
         'look': 701,
         'awesome': 246,
         'thoughand': 2,
         'one': 1722,
         'read': 324,
         'tweet': 269,
         'anyway': 106,
         'always': 589,
         'convinced

In [30]:
num_words = len(counter)
max_length = 20

In [31]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_dataset.Text)

In [32]:
word_index = tokenizer.word_index

In [33]:
word_index

{'feel': 1,
 'feeling': 2,
 'im': 3,
 'happy': 4,
 'like': 5,
 'day': 6,
 'get': 7,
 'time': 8,
 'really': 9,
 'go': 10,
 'one': 11,
 'good': 12,
 'love': 13,
 'know': 14,
 'work': 15,
 'going': 16,
 'got': 17,
 'want': 18,
 'today': 19,
 'make': 20,
 'cant': 21,
 'back': 22,
 'still': 23,
 'u': 24,
 'think': 25,
 'thing': 26,
 'would': 27,
 'see': 28,
 'much': 29,
 'little': 30,
 'people': 31,
 'need': 32,
 'well': 33,
 'ive': 34,
 'way': 35,
 'home': 36,
 'even': 37,
 'new': 38,
 'could': 39,
 'life': 40,
 'mother': 41,
 'right': 42,
 'lol': 43,
 'night': 44,
 'say': 45,
 'friend': 46,
 'last': 47,
 'na': 48,
 'miss': 49,
 'something': 50,
 'didnt': 51,
 'amp': 52,
 'week': 53,
 'though': 54,
 'bit': 55,
 'come': 56,
 'sad': 57,
 'look': 58,
 'oh': 59,
 'hope': 60,
 'ill': 61,
 'bad': 62,
 'wish': 63,
 'year': 64,
 'thats': 65,
 'better': 66,
 '2': 67,
 'morning': 68,
 'never': 69,
 'getting': 70,
 'pretty': 71,
 'sorry': 72,
 'always': 73,
 'thanks': 74,
 'great': 75,
 'also': 76,
 

In [34]:
train_sequences = tokenizer.texts_to_sequences(train_dataset.Text)

In [35]:
train_sequences[0]

[14341, 59, 165, 14342, 2247, 618]

In [36]:
dataset.Text[0]

'didnt feel humiliated'

In [37]:
from keras.preprocessing.sequence import pad_sequences
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")

In [38]:
train_padded[0]

array([14341,    59,   165, 14342,  2247,   618,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0])

In [39]:
test_sequences = tokenizer.texts_to_sequences(test_dataset.Text)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [40]:
print(train_dataset.Text[2])
print(train_sequences[2])

2                funeral ceremonygloomy friday
2    im grabbing minute post feel greedy wrong
Name: Text, dtype: object
[1, 1934, 234, 1512, 35, 1, 5522, 40, 14343, 438, 952, 20]


In [41]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [42]:
def decode(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [43]:
print(f"Shape of train {train_padded.shape}")
print(f"Shape of test {test_padded.shape}")

Shape of train (48505, 20)
Shape of test (12127, 20)


In [44]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

model = Sequential()

model.add(Embedding(num_words, 32, input_length=max_length))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(1, activation="sigmoid"))

optimizer = Adam(learning_rate=3e-4)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [45]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 32)            1463488   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 1,488,385
Trainable params: 1,488,385
Non-trainable params: 0
_________________________________________________________________


In [46]:
history = model.fit(train_padded, train_dataset.Emotion, epochs=20, validation_data=(test_padded, test_dataset.Emotion))

Epoch 1/20


UnimplementedError: Graph execution error:

Detected at node 'binary_crossentropy/Cast' defined at (most recent call last):
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\runpy.py", line 193, in _run_module_as_main
      "__main__", mod_spec)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\runpy.py", line 85, in _run_code
      exec(code, run_globals)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
      app.launch_new_instance()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
      app.start()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
      self.io_loop.start()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\asyncio\base_events.py", line 541, in run_forever
      self._run_once()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\asyncio\base_events.py", line 1786, in _run_once
      handle._run()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\asyncio\events.py", line 88, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
      await self.process_one()
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
      await dispatch(*args)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
      await result
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
      reply_content = await reply_content
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
      res = shell.run_cell(code, store_history=store_history, silent=silent)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\IPython\core\interactiveshell.py", line 2915, in run_cell
      raw_cell, store_history, silent, shell_futures)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\IPython\core\interactiveshell.py", line 2960, in _run_cell
      return runner(coro)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\IPython\core\async_helpers.py", line 78, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\IPython\core\interactiveshell.py", line 3186, in run_cell_async
      interactivity=interactivity, compiler=compiler, result=result)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\IPython\core\interactiveshell.py", line 3377, in run_ast_nodes
      if (await self.run_code(code, result,  async_=asy)):
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\ifeol\AppData\Local\Temp\ipykernel_29764\645505158.py", line 1, in <module>
      history = model.fit(train_padded, train_dataset.Emotion, epochs=20, validation_data=(test_padded, test_dataset.Emotion))
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\utils\traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\training.py", line 1384, in fit
      tmp_logs = self.train_function(iterator)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\training.py", line 1021, in train_function
      return step_function(self, iterator)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\training.py", line 1010, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\training.py", line 1000, in run_step
      outputs = model.train_step(data)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\training.py", line 860, in train_step
      loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\training.py", line 919, in compute_loss
      y, y_pred, sample_weight, regularization_losses=self.losses)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
      loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\losses.py", line 141, in __call__
      losses = call_fn(y_true, y_pred)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\losses.py", line 245, in call
      return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\ifeol\anaconda3.1\envs\DevEnv\lib\site-packages\keras\losses.py", line 1922, in binary_crossentropy
      y_true = tf.cast(y_true, y_pred.dtype)
Node: 'binary_crossentropy/Cast'
Cast string to float is not supported
	 [[{{node binary_crossentropy/Cast}}]] [Op:__inference_train_function_3234]