In [1]:
import pandas as pd
import numpy as np
import random
from tensorflow.keras import models, Sequential, layers 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw emotion dataset; the path is relative, so the CSV must sit in
# the notebook's working directory.
df = pd.read_csv('Emotion_final.csv')

In [3]:
# Lower-case the column names so later cells can refer to `text` / `emotion`.
df = df.rename(columns={'Text': 'text', 'Emotion': 'emotion'})

In [4]:
# Preview the first rows to confirm the rename took effect.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [5]:
# Number of distinct emotion labels, i.e. the number of target classes.
df['emotion'].nunique()

6

In [6]:
# (rows, columns) of the loaded frame.
df.shape

(21459, 2)

In [7]:
# Features are the raw sentences.
X = df['text']
# Turn the label column into a pandas categorical so every emotion gets an
# integer code that can be read back through `.cat.codes`.
df['emotion'] = pd.Categorical(df['emotion'])

In [8]:
# Integer-encode the labels from the categorical column
# (e.g. anger -> 0, love -> 3, sadness -> 4 in the preview shown further down).
df['code'] = df['emotion'].cat.codes

In [9]:
# One-hot encode the integer codes; this is the target format used with the
# categorical cross-entropy loss the model is compiled with.
y = to_categorical(df['code'].values)

In [10]:
# Inspect the frame now that the integer `code` column has been added.
df.head()

Unnamed: 0,text,emotion,code
0,i didnt feel humiliated,sadness,4
1,i can go from feeling so hopeless to so damned...,sadness,4
2,im grabbing a minute to post i feel greedy wrong,anger,0
3,i am ever feeling nostalgic about the fireplac...,love,3
4,i am feeling grouchy,anger,0


In [11]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (see the value counts shown later) — consider passing `stratify=df['code']`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
# Sanity-check the one-hot training labels (one row per sample, one column per class).
y_train

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [13]:
# Re-display the frame; the train/test split does not modify it.
df.head()

Unnamed: 0,text,emotion,code
0,i didnt feel humiliated,sadness,4
1,i can go from feeling so hopeless to so damned...,sadness,4
2,im grabbing a minute to post i feel greedy wrong,anger,0
3,i am ever feeling nostalgic about the fireplac...,love,3
4,i am feeling grouchy,anger,0


In [14]:
# Per-class sample counts. The recorded output shows a clear imbalance
# (7029 'happy' vs 879 'surprise'), worth keeping in mind when reading accuracy.
df.emotion.value_counts()

happy       7029
sadness     6265
anger       2993
fear        2652
love        1641
surprise     879
Name: emotion, dtype: int64

In [15]:
# Another look at the frame; nothing has changed it since `code` was added.
df.head()

Unnamed: 0,text,emotion,code
0,i didnt feel humiliated,sadness,4
1,i can go from feeling so hopeless to so damned...,sadness,4
2,im grabbing a minute to post i feel greedy wrong,anger,0
3,i am ever feeling nostalgic about the fireplac...,love,3
4,i am feeling grouchy,anger,0


In [16]:
### Let's tokenize the vocabulary
# The tokenizer is fitted on the whole corpus so that every word, including
# words that only occur in the test split, receives an index.
tk = Tokenizer()
tk.fit_on_texts(X)
vocab_size = len(tk.word_index) + 1  # +1: one extra row for index 0, used for padding
print(f'There are {vocab_size} different words in your corpus')
X_train_token = tk.texts_to_sequences(X_train)
X_test_token = tk.texts_to_sequences(X_test)


### Pad your inputs
# Token ids are integers, so keep an integer dtype for the Embedding layer.
X_train_pad = pad_sequences(X_train_token, dtype='int32', padding='post')
# Pad/truncate the test set to the *training* length. Without `maxlen` it would
# be padded to its own longest sentence and could end up with a different width
# than the fixed input length the model is built with.
max_len = X_train_pad.shape[1]
X_test_pad = pad_sequences(X_test_token, dtype='int32', padding='post',
                           truncating='post', maxlen=max_len)

There are 19259 different words in your corpus


In [17]:
# Shape of the padded training matrix: (number of samples, padded sequence length).
X_train_pad.shape

(15021, 66)

In [18]:
# Vocabulary size, i.e. the number of rows the embedding matrix needs.
vocab_size

19259

In [19]:
# model.add(layers.Masking(mask_value=-999))

In [20]:
# Load the pretrained GloVe word vectors through gensim's downloader API.
# NOTE(review): this import would normally live in the top import cell.
import gensim.downloader as api
glove_gensim  = api.load('glove-wiki-gigaword-100') #100 dimension



In [21]:
# Vectorize
# Build the embedding matrix: row i holds the pretrained GloVe vector of the
# word whose tokenizer index is i. Row 0 (never assigned to a word, because
# vocab_size is len(word_index) + 1) and the rows of words missing from GloVe
# stay all-zero.
vector_size = 100  # must match the dimensionality of the loaded GloVe model
gensim_weight_matrix = np.zeros((vocab_size, vector_size))
for word, index in tk.word_index.items():
    # Test membership on the KeyedVectors object itself: the older
    # `.wv.vocab` lookup is deprecated in gensim 3.x and removed in gensim 4.
    if word in glove_gensim:
        gensim_weight_matrix[index] = glove_gensim[word]

  import sys


In [39]:
def create_model(EMBEDDING_DIM = 100):
  """Build and compile the CNN emotion classifier on frozen pretrained embeddings.

  Architecture: Embedding (pretrained weights, not trainable) ->
  Conv1D(10 filters, kernel size 3) -> Flatten -> Dense(30, relu) ->
  Dense(6, softmax).

  Relies on the notebook globals `vocab_size`, `X_train_pad` and
  `gensim_weight_matrix`.

  Args:
    EMBEDDING_DIM: Width of the embedding vectors. It must equal the number
      of columns of `gensim_weight_matrix`, because that matrix is loaded as
      the embedding layer's weights.

  Returns:
    A compiled `Sequential` model (categorical cross-entropy loss, RMSprop
    optimizer, accuracy metric).

  Raises:
    ValueError: If `EMBEDDING_DIM` does not match the pretrained matrix width.
  """
  # Fail early with a readable message instead of a weight-shape error raised
  # from inside Keras when the layer weights are set.
  pretrained_dim = gensim_weight_matrix.shape[1]
  if EMBEDDING_DIM != pretrained_dim:
    raise ValueError(
        f'EMBEDDING_DIM={EMBEDDING_DIM} does not match the pretrained '
        f'embedding matrix width ({pretrained_dim})')

  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(input_dim = vocab_size,  # one row per token index
                            output_dim = EMBEDDING_DIM,  # embedding vector width
                            input_length= X_train_pad.shape[1],  # padded sequence length
                            weights = [gensim_weight_matrix],trainable = False))
  # NOTE(review): this convolution has no activation, so it is a purely linear
  # map feeding the Dense layer — consider activation="relu".
  model_cnn.add(layers.Conv1D(10, kernel_size=3))
  model_cnn.add(layers.Flatten())
  model_cnn.add(layers.Dense(30, activation="relu"))
  # One output unit per emotion class (six classes in this dataset).
  model_cnn.add(layers.Dense(6, activation="softmax"))
  model_cnn.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'],
              )
  return model_cnn

In [41]:
# Build the CNN with the default 100-dimensional embedding.
model = create_model()

In [42]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 66, 100)           1925900   
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 64, 10)            3010      
_________________________________________________________________
flatten_6 (Flatten)          (None, 640)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 30)                19230     
_________________________________________________________________
dense_13 (Dense)             (None, 6)                 186       
Total params: 1,948,326
Trainable params: 22,426
Non-trainable params: 1,925,900
_________________________________________________________________


In [25]:
# Train for 10 epochs with mini-batches of 16, holding out 30% of the training
# arrays for validation; `history` keeps the per-epoch metrics.
# NOTE(review): no random seed is set for TensorFlow, so runs are not exactly repeatable.
history = model.fit(X_train_pad, y_train, batch_size=16, epochs=10, verbose=1, validation_split=0.3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [26]:
# embedding_size = 20
# # model_cnn = Sequential()
# create_model
# # model_cnn.add(layers.Embedding(
# #     input_dim=X_pad.shape[0],
# #     input_length=X_pad.shape[1],
# #     output_dim=20,
# #     mask_zero=True))
# model_cnn.add(layers.Conv1D(10, kernel_size=3))
# model_cnn.add(layers.Flatten())
# model_cnn.add(layers.Dense(30, activation="relu"))
# model_cnn.add(layers.Dense(6, activation="softmax"))