In [None]:
# Mount Google Drive (Colab only) at the relative mount point 'drive'.
from google.colab import drive

drive.mount(mountpoint='drive')

In [None]:
# Switch into the dataset folder on the mounted Drive so that relative file
# names such as 'train.txt' resolve.
# The target path is relative, so a plain os.chdir() raises FileNotFoundError
# when this cell is run a second time; skip the call if the working directory
# is already the dataset folder.
import os

DATA_SUBDIR = "drive/My Drive/datasets/emotion_nlp"
if not os.getcwd().replace(os.sep, "/").endswith(DATA_SUBDIR):
    os.chdir(DATA_SUBDIR)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow.keras import layers, optimizers, losses, activations
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, load_model

from tensorflow.keras.callbacks import  EarlyStopping, ModelCheckpoint
from IPython.display import display
from tensorflow.keras import backend as K

import os
#import glob
import random
from google.colab import files #library to upload files to colab notebook
%matplotlib inline

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Read the training split: one ';'-separated record per line, no header row.
df_train = pd.read_csv('train.txt', delimiter=';', header=None)

In [None]:
# Read the test split: one ';'-separated record per line, no header row.
df_test = pd.read_csv('test.txt', delimiter=';', header=None)

In [None]:
# Give both frames the same column names.
df_train.columns = ['text', 'emotion']
df_test.columns = ['text', 'emotion']

In [None]:
# summary statistics of the training frame (count / unique / top / freq)
df_train.describe()
# we have 16000 sentences - 6 unique emotions

In [None]:
# Distinct emotion names, in order of first appearance in the training data.
emotions = list(df_train['emotion'].unique())
emotions

In [None]:
# Share of each emotion in the training data, rounded to two decimals.
(df_train['emotion'].value_counts() / len(df_train)).round(2)

In [None]:
# Bar chart of how many training sentences carry each emotion.
emotion_counts = df_train['emotion'].value_counts()
sns.barplot(x=emotion_counts.index, y=emotion_counts)
plt.title('emotions repartition')
plt.show()

In [None]:
# Peek at ten randomly drawn training rows (unseeded, so the rows differ per run).
df_train.sample(n=10)

In [None]:
# Map every emotion name to an integer label id (its position in `emotions`).
emotions_dico = {name: label_id for label_id, name in enumerate(emotions)}

In [None]:
# display the emotion name -> label id mapping
emotions_dico

In [None]:
# Integer label column for the training frame; an emotion missing from
# emotions_dico raises KeyError.
df_train['label'] = [emotions_dico[name] for name in df_train['emotion']]

In [None]:
# Integer label column for the test frame; an emotion that never occurred in
# the training data raises KeyError.
df_test['label'] = [emotions_dico[name] for name in df_test['emotion']]

In [None]:
# Raw training texts as a plain list, plus the matching integer labels.
train_sentences = list(df_train['text'])
y_train = df_train['label']

In [None]:
# Raw test texts as a plain list, plus the matching integer labels.
test_sentences = list(df_test['text'])
y_test = df_test['label']

In [None]:
# Fit a word-level tokenizer on the training texts only, then turn each
# training sentence into a list of integer word ids.
# NOTE(review): no oov_token is configured — presumably words unseen during
# fitting are dropped when other texts are converted; confirm this is intended.
MAX_VOCAB_SIZE = 50_000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(texts=train_sentences)
train_sequences = tokenizer.texts_to_sequences(texts=train_sentences)

In [None]:
#train_sequences

In [None]:
# Length of the longest tokenised training sentence; used as the padding length.
Max_padd = max(map(len, train_sequences))

In [None]:
# Right-pad every training sequence with zeros up to Max_padd tokens.
train_sequences = pad_sequences(
    train_sequences,
    maxlen=Max_padd,
    padding='post',
)

In [None]:
# display the padded training sequences
train_sequences

In [None]:
# Encode the test texts with the tokenizer fitted on the training data and
# pad them to the same length as the training sequences.
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(texts=test_sentences),
    maxlen=Max_padd,
    padding='post',
)

In [None]:
# (number of test sentences, padded sequence length)
test_sequences.shape

In [None]:
# our vocab size: number of distinct words the tokenizer indexed from the training sentences
len(tokenizer.index_word)

In [None]:
# Dimensions used to build the network.
T = train_sequences.shape[-1]        # padded sequence length
V = len(tokenizer.index_word)        # number of indexed words
# NOTE(review): this rebinds K, which the import cell bound to
# tensorflow.keras.backend; the backend alias is unusable after this line.
K = len(emotions)                    # number of output classes

In [None]:
# One-hot encode the integer labels.
# num_classes is pinned to the number of emotions found in the training data
# so both matrices always have the same width; without it to_categorical
# infers the width from the largest label present in each split, which would
# differ if a split lacked the highest label id.
y_train = to_categorical(y_train, num_classes=len(emotions))
y_test = to_categorical(y_test, num_classes=len(emotions))

In [None]:

# Stop training once the validation loss has not improved for `patience`
# epochs. restore_best_weights=True puts the best-epoch weights back on the
# model, so the model saved after training is not simply whatever the final
# (possibly worse) epoch produced.
earlystopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=5,
    restore_best_weights=True,
)

# Keep on disk only the checkpoint with the lowest validation loss seen so far.
# save_weights_only is left at its default, so despite the file name this
# stores the whole model, not just the weights.
checkpointer = ModelCheckpoint(
    filepath="emodetext_weights.hdf5",
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Build the classifier: embedding -> LSTM -> max-pool over time -> dense head.

# Embedding dimensionality (free choice)
D = 20

# LSTM hidden state dimensionality
M = 15

# The embedding matrix needs V + 1 rows: tokenizer word ids start at 1 and
# 0 is the padding value, so the largest id that can occur is V.
inputs = Input(shape=(T,))
hidden = Embedding(input_dim=V + 1, output_dim=D)(inputs)
# Return the hidden state at every time step so the pooling layer can take
# the per-feature maximum across the whole sequence.
hidden = LSTM(M, return_sequences=True)(hidden)
hidden = GlobalMaxPooling1D()(hidden)
for units in (256, 512, 512):
    hidden = Dense(units, activation='relu')(hidden)
outputs = Dense(K, activation='softmax')(hidden)

model = Model(inputs, outputs)



In [None]:
# Multi-class setup: one-hot targets with categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 20 epochs; the callbacks checkpoint the best model and
# stop early when the validation loss stalls.
# NOTE(review): the test split is passed as validation data, so early stopping
# and checkpoint selection are driven by the test set — consider holding out
# a separate validation split.
history = model.fit(
    train_sequences,
    y_train,
    validation_data=(test_sequences, y_test),
    epochs=20,
    callbacks=[checkpointer, earlystopping],
)

In [None]:
# Persist the trained model to an HDF5 file in the working directory.
model.save(filepath='emodetext_model_1.h5')

In [None]:
# Reload the saved model from disk, replacing the in-memory one.
model = load_model(filepath='emodetext_model_1.h5')

In [None]:
# Class probabilities for every test sentence (one row per sentence).
y_pred = model.predict(x=test_sequences)

In [None]:
# Collapse the per-class probabilities to the most likely label id.
# Guarded on the number of dimensions: y_pred is overwritten in place, so
# re-running this cell unguarded would reduce the 1-D label array to a scalar.
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis=-1)

In [None]:
# Turn the one-hot test targets back into integer label ids.
# Guarded on the number of dimensions: y_test is overwritten in place, so
# re-running this cell unguarded would reduce the 1-D label array to a scalar.
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis=-1)

In [None]:
# Reverse lookup: label id -> emotion name.
# Built from the `emotions` list (label ids are positions in that list)
# rather than by inverting emotions_dico in place, so re-running this cell no
# longer flips the mapping back to name -> id.
emotions_dico = dict(enumerate(emotions))

In [None]:
import random

# Show ten randomly chosen test sentences with predicted and true emotion.
# random.randrange excludes the upper bound; random.randint(0, len(y_test))
# is inclusive and could return len(y_test), raising IndexError.
# The loop variable is not named `i` so it does not overwrite the model's
# Input tensor of that name.
for _sample_no in range(10):
  j = random.randrange(len(y_test))
  print('Text:', test_sentences[j])
  print('prediction: ', emotions_dico[y_pred[j]])
  print('real emotion : ',emotions_dico[y_test[j]])
  print('**********************')

In [None]:
# Encode two hand-written sentences exactly like the training data:
# tokenise with the fitted tokenizer, then right-pad to Max_padd tokens.
my_text = ['i am felling very good', 'i am so loved']
my_seqence = pad_sequences(
    tokenizer.texts_to_sequences(my_text),
    maxlen=Max_padd,
    padding='post',
)

In [None]:
# Most likely label id for each hand-written sentence.
my_pred = model.predict(my_seqence).argmax(axis=-1)

In [None]:
# Translate the predicted label ids back to emotion names.
[emotions_dico[label_id] for label_id in my_pred]