In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
dataset_1 = pd.read_csv('../Datasets/Emotion_final.csv')
dataset_2 = pd.read_csv('../Datasets/Text_emotion.csv')

In [3]:
dataset_2 = dataset_2[['content', 'sentiment']]

In [4]:
dataset_2.rename(columns={'content': 'Text',
                        'sentiment': 'Emotion'},
                        inplace=True, errors='raise')

In [5]:
dataset = pd.concat([dataset_1, dataset_2])

In [6]:
dataset[dataset['Emotion'] == "happiness"] = "happy"

In [7]:
dataset = dataset[dataset.Emotion != 'empty']

In [8]:
dataset.head(1)

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness


In [9]:
dataset.shape

(60632, 2)

# Preprocessing techniques for textual data

In [10]:
text = dataset['Text']
for row in text[:10]:
    print(row)

i didnt feel humiliated
i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake
im grabbing a minute to post i feel greedy wrong
i am ever feeling nostalgic about the fireplace i will know that it is still on the property
i am feeling grouchy
ive been feeling a little burdened lately wasnt sure why that was
ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny
i feel as confused about life as a teenager or as jaded as a year old man
i have been with petronas for years i feel that petronas has performed well and made a huge profit
i feel romantic too


In [11]:
for index, row in dataset.iterrows():
    row['Text'] = row['Text'].lower()

Remove punctuation

In [12]:
import string
translator = str.maketrans('', '', string.punctuation)

for index, row in dataset.iterrows():
    row['Text'] = row['Text'].translate(translator)

Replace contraction

In [13]:
contraction_patterns = [ (r'won\'t', 'will not'), (r'can\'t', 'cannot'), (r'i\'m', 'i am'), (r'ain\'t', 'is not'), (r'(\w+)\'ll', '\g<1> will'), (r'(\w+)n\'t', '\g<1> not'),
                         (r'(\w+)\'ve', '\g<1> have'), (r'(\w+)\'s', '\g<1> is'), (r'(\w+)\'re', '\g<1> are'), (r'(\w+)\'d', '\g<1> would'), (r'&', 'and'), (r'dammit', 'damn it'), (r'dont', 'do not'), (r'wont', 'will not') ]
def replaceContraction(text):
    patterns = [(re.compile(regex), repl) for (regex, repl) in contraction_patterns]
    for (pattern, repl) in patterns:
        (text, count) = re.subn(pattern, repl, text)
    return text

In [14]:
for index, row in dataset.iterrows():
    row['Text'] = replaceContraction(row['Text'])

Replace negations

In [15]:
import nltk
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def replace(word, pos=None):
    """ Creates a set of all antonyms for the word and if there is only one antonym, it returns it """
    antonyms = set()
    for syn in wordnet.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            for antonym in lemma.antonyms():
                antonyms.add(antonym.name())
    if len(antonyms) == 1:
        return antonyms.pop()
    else:
        return None

def replaceNegations(text):
    """ Finds "not" and antonym for the next word and if found, replaces not and the next word with the antonym """
    i, l = 0, len(text)
    words = []
    while i < l:
        word = text[i]
        if word == 'not' and i+1 < l:
            ant = replace(text[i+1])
            if ant:
                words.append(ant)
                i += 2
                continue
        words.append(word)
        i += 1
    return words

def tokenize_negation(text):
    tokens = nltk.word_tokenize(text)
    tokens = replaceNegations(tokens)
    text = " ".join(tokens)
    return text

[nltk_data] Downloading package punkt to /home/subut0n/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/subut0n/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/subut0n/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [16]:
for index, row in dataset.iterrows():
    row['Text'] = tokenize_negation(row['Text'])

Replace stopwords

In [17]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stoplist = stopwords.words('english')

def tokenize_stopwords(text):
    finalTokens = []
    tokens = nltk.word_tokenize(text)
    for w in tokens:
        if (w not in stoplist):
            finalTokens.append(w)
    text = " ".join(finalTokens)
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/subut0n/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
for index, row in dataset.iterrows():
    row['Text'] = tokenize_stopwords(row['Text'])

Lemmatize / Stemme

In [19]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer() #set stemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() # set lemmatizer

def tokenize_lemma(text):
    finalTokens = []
    tokens = nltk.word_tokenize(text)
    for w in tokens:
        finalTokens.append(lemmatizer.lemmatize(w)) # change this to lemmatizer.lemmatize(w) for Lemmatizing
    text = " ".join(finalTokens)
    return text

In [20]:
for index, row in dataset.iterrows():
    row['Text'] = tokenize_lemma(row['Text'])

In [21]:
dataset['Text']

0                                    didnt feel humiliated
1        go feeling hopeless damned hopeful around some...
2                im grabbing minute post feel greedy wrong
3        ever feeling nostalgic fireplace know still pr...
4                                          feeling grouchy
                               ...                        
39995                                      johnlloydtaylor
39996                                happy mother day love
39997    happy mother day mommy woman man long youre mo...
39998                                                happy
39999    mopedronin bullet train tokyo gf visiting japa...
Name: Text, Length: 60632, dtype: object

In [22]:
dataset.Emotion.value_counts()

happy         12238
sadness       11430
neutral        8638
worry          8459
love           5483
anger          3103
surprise       3066
fear           2652
fun            1776
relief         1526
hate           1323
enthusiasm      759
boredom         179
Name: Emotion, dtype: int64

In [23]:
dataset.shape

(60632, 2)

In [24]:
from sklearn.model_selection import train_test_split
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2)

In [25]:
train_dataset.shape

(48505, 2)

In [26]:
from collections import Counter

def counter_word(text):
    count = Counter()
    for i in text.values:
        for word in i.split():
            count[word] +=1
    return count

In [27]:
text = train_dataset.Text
counter = counter_word(text)

In [28]:
len(counter)

46031

In [29]:
counter

Counter({'smoker': 4,
         'year': 630,
         'quit': 26,
         'week': 717,
         'ago': 173,
         'right': 858,
         'finished': 144,
         'book': 278,
         'cant': 1375,
         'believe': 223,
         'free': 245,
         'feel': 12226,
         'knew': 142,
         'terrified': 77,
         'life': 884,
         'without': 367,
         'cigarette': 12,
         'lishreece': 1,
         'perfect': 140,
         'moment': 256,
         'broke': 146,
         'one': 1718,
         'tile': 2,
         'floor': 41,
         'hallway': 3,
         'lose': 79,
         'happy': 5187,
         'mother': 868,
         'day': 2857,
         'balmain': 2,
         'knockoff': 2,
         'bebe': 3,
         'make': 1376,
         'sadi': 2,
         'want': 1533,
         'real': 216,
         'dealthese': 1,
         'shoe': 71,
         'look': 703,
         'cheapy': 2,
         'teared': 1,
         'already': 403,
         'felt': 363,
         'stresse

In [30]:
num_words = len(counter)
max_length = 20

In [31]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(train_dataset.Text)

2022-05-19 12:20:25.879080: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-19 12:20:25.879098: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [32]:
word_index = tokenizer.word_index

In [33]:
word_index

{'feel': 1,
 'feeling': 2,
 'im': 3,
 'happy': 4,
 'like': 5,
 'day': 6,
 'get': 7,
 'time': 8,
 'really': 9,
 'go': 10,
 'good': 11,
 'one': 12,
 'love': 13,
 'know': 14,
 'going': 15,
 'work': 16,
 'want': 17,
 'got': 18,
 'make': 19,
 'cant': 20,
 'back': 21,
 'today': 22,
 'u': 23,
 'still': 24,
 'think': 25,
 'thing': 26,
 'would': 27,
 'see': 28,
 'need': 29,
 'much': 30,
 'little': 31,
 'people': 32,
 'well': 33,
 'ive': 34,
 'home': 35,
 'even': 36,
 'way': 37,
 'could': 38,
 'new': 39,
 'life': 40,
 'lol': 41,
 'mother': 42,
 'right': 43,
 'night': 44,
 'last': 45,
 'say': 46,
 'miss': 47,
 'friend': 48,
 'didnt': 49,
 'something': 50,
 'amp': 51,
 'bit': 52,
 'na': 53,
 'though': 54,
 'week': 55,
 'oh': 56,
 'sad': 57,
 'come': 58,
 'ill': 59,
 'look': 60,
 'hope': 61,
 'bad': 62,
 '2': 63,
 'morning': 64,
 'thats': 65,
 'wish': 66,
 'year': 67,
 'sorry': 68,
 'better': 69,
 'never': 70,
 'pretty': 71,
 'getting': 72,
 'always': 73,
 'thanks': 74,
 'great': 75,
 'made': 76,
 

In [34]:
train_sequences = tokenizer.texts_to_sequences(train_dataset.Text)

In [35]:
train_sequences[0]

[6362,
 67,
 1893,
 55,
 324,
 43,
 387,
 189,
 20,
 246,
 229,
 1,
 395,
 1893,
 711,
 40,
 127,
 3163]

In [36]:
dataset.Text[0]

'didnt feel humiliated'

In [37]:
from keras.preprocessing.sequence import pad_sequences
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")

In [38]:
train_padded[0]

array([6362,   67, 1893,   55,  324,   43,  387,  189,   20,  246,  229,
          1,  395, 1893,  711,   40,  127, 3163,    0,    0], dtype=int32)

In [39]:
train_padded.shape

(48505, 20)

In [40]:
test_sequences = tokenizer.texts_to_sequences(test_dataset.Text)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

In [41]:
train_dataset.Emotion.unique()

array(['fear', 'hate', 'neutral', 'worry', 'sadness', 'happy', 'surprise',
       'enthusiasm', 'anger', 'fun', 'relief', 'love', 'boredom'],
      dtype=object)

In [42]:
train_dataset.Emotion.replace(
    {"worry":0,
    "sadness":1,
    "surprise":2,
    "happy":3,
    "fun":4,
    "hate":5,
    "fear":6,
    "neutral":7,
    "relief":8,
    "anger":9,
    "love":10,
    "boredom":11,
    "enthusiasm":12}
)

365      6
4953     5
30240    7
15830    0
15336    1
        ..
5263     6
7595     9
17485    2
3176     0
33568    4
Name: Emotion, Length: 48505, dtype: int64

In [43]:
print(train_dataset.Text[2])
print(train_sequences[2])

funeral ceremonygloomy friday
[4, 42, 6, 42]


In [44]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [45]:
def decode(text):
    return " ".join([reverse_word_index.get(i, "?") for i in text])

In [46]:
print(f"Shape of train {train_padded.shape}")
print(f"Shape of test {test_padded.shape}")

Shape of train (48505, 20)
Shape of test (12127, 20)


In [47]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Flatten, GlobalAveragePooling2D
from keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

model = Sequential()

model.add(Flatten())
model.add(Embedding(num_words, 32, input_length=max_length))
model.add(LSTM(64, dropout=0.1))
model.add(Dense(13, activation="softmax"))

optimizer = Adam(learning_rate=3e-4)

model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

2022-05-19 12:20:30.118108: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-19 12:20:30.118140: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-19 12:20:30.118157: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (subut0n-PC): /proc/driver/nvidia/version does not exist
2022-05-19 12:20:30.118871: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [48]:
train_padded.shape

(48505, 20)

In [49]:
model.build(input_shape=(48505, 20))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (48505, 20)               0         
                                                                 
 embedding (Embedding)       (48505, 20, 32)           1472992   
                                                                 
 lstm (LSTM)                 (48505, 64)               24832     
                                                                 
 dense (Dense)               (48505, 13)               845       
                                                                 
Total params: 1,498,669
Trainable params: 1,498,669
Non-trainable params: 0
_________________________________________________________________


In [50]:
history = model.fit(train_padded, train_dataset.Emotion, epochs=20, validation_data=(test_padded, test_dataset.Emotion))

Epoch 1/20


ValueError: in user code:

    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/engine/training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/engine/training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/engine/compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/losses.py", line 1932, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/home/subut0n/anaconda3/envs/ML_env/lib/python3.8/site-packages/keras/backend.py", line 5247, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output)

    ValueError: `logits` and `labels` must have the same shape, received ((None, 13) vs (None, 1)).
