In [1]:
!pip install unidecode



In [2]:
import json
import csv
import nltk
import pandas as pd
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt 
from unidecode import unidecode
import pickle

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout, Bidirectional
from tensorflow.compat.v1.keras.layers import CuDNNLSTM
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
class EmotionNLP():
  def __init__(self, 
               sequence_length=2000, 
               embed_dim=128, 
               batch_size=32, 
               epochs=200,
               num_words=1000,
               oov_token="<UNK>",
               pad_type="post",
               trunc_type="post"):
    


      self.sequence_length = sequence_length
      self.embed_dim = embed_dim
      self.batch_size = batch_size
      self.epochs = epochs

      self.num_words = num_words
      self.oov_token = oov_token
      self.pad_type = pad_type
      self.trunc_type = trunc_type



      print("Loading Data.......")
      df = self.load_data()
      print(df.head())


      print("Formatting Data......")
      sentences = df["sentence"]

      sentences = self.lower_case(sentences)

      sentences_sequences, word_index, maxlen, sentences_padded = self.tokenize_data(sentences)


      print("Data Stats......")
      print("Word index:\n", word_index)
      print("\nTraining sequences:\n", sentences_sequences)
      print("\nPadded training sequences:\n", sentences_padded)
      print("\nPadded training shape:", sentences_padded.shape)

      print("Splitting Data....")
      X_train, X_test, y_train, y_test = self.get_data(df, sentences_padded)

      print(f"X_train:\n {X_train}")
      print(f"X_test:\n {X_test}")
      print(f"y_train:\n {y_train}")
      print(f"y_test:\n {y_test}")
      


      print("Building Model.......")
      model = self.build_model(X_train)
      print("Built Model,")


      print("Training Model.......")
      history = self.train_dataset(model, X_train, y_train, X_test, y_test,)

      print("Model Trained.")

      print("Saving Plots.........")
      self.plot_acc(history)
      self.plot_loss(history)


  def load_data(self):
    with open("friends.json", "r") as f:
      data = json.load(f)

    speakers = []
    phrases = []
    word_emotions = []
    annotations = []
    for block in range(len(data)):
      for speech in range(block):
      
        try:
          speaker = data[block][speech]["speaker"]
          speakers.append(speaker)
        except:
          pass
        
        try:
          phrase = unidecode(data[block][speech]["utterance"])
          phrases.append(phrase)
        except:
          pass
        
        try:
          emotion = data[block][speech]["emotion"]
          word_emotions.append(emotion)
        except:
          pass

        try:
          annotation = data[block][speech]["annotation"]
          annotations.append(annotation)
        except:
          pass
    

    """"
    emotion mappings:
      [neutral, joy, sadness, fear, anger,  surprise, disgust, non-neutral]
    """

    i = 0
    emotions = []
    for emotion in word_emotions:
      i += 1
      if emotion == "neutral":
        emotions.append(0)

      if emotion == "joy":
        emotions.append(1)

      if emotion == "sadness":
        emotions.append(2)

      if emotion == "fear":
        emotions.append(3)

      if emotion == "anger":
        emotions.append(4)

      if emotion == "surprise":
        emotions.append(5)

      if emotion == "disgust":
        emotions.append(6)

      if emotion == "non-neutral":
        emotions.append(7)

    print(len(speakers))
    print(len(emotions))
    print(len(phrases))
    print(len(annotations))


    df = pd.DataFrame({"speaker" : speakers, "sentence" : phrases, "emotion" : emotions, "annotation" : annotations})
    
    with open("friends.json", "r") as f:
      data = json.load(f)

    speakers = []
    phrases = []
    word_emotions = []
    annotations = []
    for block in range(len(data)):
      for speech in range(block):
      
        try:
          speaker = data[block][speech]["speaker"]
          speakers.append(speaker)
        except:
          pass
        
        try:
          phrase = unidecode(data[block][speech]["utterance"])
          phrases.append(phrase)
        except:
          pass
        
        try:
          emotion = data[block][speech]["emotion"]
          word_emotions.append(emotion)
        except:
          pass

        try:
          annotation = data[block][speech]["annotation"]
          annotations.append(annotation)
        except:
          pass
    

    """"
    emotion mappings:
      [neutral, joy, sadness, fear, anger,  surprise, disgust, non-neutral]
    """

    i = 0
    emotions = []
    for emotion in word_emotions:
      i += 1
      if emotion == "neutral":
        emotions.append(0)

      if emotion == "joy":
        emotions.append(1)

      if emotion == "sadness":
        emotions.append(2)

      if emotion == "fear":
        emotions.append(3)

      if emotion == "anger":
        emotions.append(4)

      if emotion == "surprise":
        emotions.append(5)

      if emotion == "disgust":
        emotions.append(6)

      if emotion == "non-neutral":
        emotions.append(7)

    print(len(speakers))
    print(len(emotions))
    print(len(phrases))
    print(len(annotations))

    df.append({"speaker" : speakers, "sentence" : phrases, "emotion" : emotions, "annotation" : annotations}, ignore_index=True)
    df.to_csv("emotion_data.csv", index=False)
    return df

  def lower_case(self, text_list):
    lower_phrases = []
    for phrase in text_list.values:
      lower_phrases.append(phrase.lower())
    
    return lower_phrases
  
  def tokenize_data(self, text_list, num_words=1000, oov_token="<UNK>", pad_type="post", trunc_type="post"):
    # new_text_list = []
    # for text in text_list:
    #   new_text_list.append(text.decode())

    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token, filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n')
    tokenizer.fit_on_texts(text_list)
    word_index = tokenizer.word_index

    train_sequences = tokenizer.texts_to_sequences(text_list)

    tokenizer.fit_on_texts(text_list)

    
    

    maxlen = max([len(x) for x in train_sequences])
    
    train_padded = pad_sequences(train_sequences, padding=pad_type, truncating=trunc_type, maxlen=maxlen)
    print("here")

    with open('emotion_tokenizer.pickle', 'wb') as f:
      pickle.dump(tokenizer, f)


    return train_sequences, word_index, maxlen, train_padded

  
  def get_data(self, df, train_data):
    train_classes = pd.get_dummies(df["emotion"])
                                  
    X, y = train_data, train_classes
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    return X_train, X_test, y_train, y_test

  def build_model(self, X, sequence_length=2000, embed_dim=128):
    model = Sequential()
    model.add(Embedding(sequence_length, embed_dim, input_length=X.shape[1]))
    model.add(SpatialDropout1D(0.4))
    model.add(Bidirectional(CuDNNLSTM(196)))
    model.add(Dropout(0.2))
    model.add(Dense(8, activation="sigmoid"))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    return model 

  def train_dataset(self, model, X_train, y_train, X_test, y_test, batch_size=32):      
    history = model.fit(X_train, y_train, epochs=40, batch_size=batch_size, verbose=2)

    model.save("emotion_analysis.h5")

    score, acc = model.evaluate(X_test, y_test, verbose=2, batch_size=batch_size)

    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

    return history

  # def plot_acc(self, history):
  #   plt.plot(history.history['accuracy'])
  #   plt.title('model accuracy')
  #   plt.ylabel('accuracy')
  #   plt.xlabel('epoch')
  #   plt.legend(['train', 'test'], loc='upper left')
  #   plt.savefig('graphs/acc.png')


  # def plot_loss(self, history):
  #   plt.plot(history.history['loss'])
  #   plt.title('model loss')
  #   plt.ylabel('loss')
  #   plt.xlabel('epoch')
  #   plt.legend(['train', 'test'], loc='upper left')
  #   plt.savefig('graphs/loss.png')

In [None]:
emotionNLP = EmotionNLP()

Loading Data.......
14349
14349
14349
14349
14349
14349
14349
14349
  speaker  ... annotation
0    Joey  ...    5000000
1    Ross  ...    3000011
2     Guy  ...    5000000
3    Ross  ...    0010400
4  Rachel  ...    0000050

[5 rows x 4 columns]
Formatting Data......
here
Data Stats......
Word index:

Training sequences:
 [[28, 1, 158], [102, 102, 59, 47, 2, 262, 3], [24, 22, 1, 64, 17, 308, 933], [3, 100, 12, 41, 5, 83, 20, 3, 203, 100, 934, 33, 109], [11], [2, 34, 124, 99, 35, 1, 207, 2, 393, 54, 49, 87], [8, 74, 15, 1, 5, 1, 154, 186, 29, 404, 1], [81, 136, 7, 119, 31, 806, 78, 7, 131, 31, 240, 6, 483, 133, 3, 7, 2, 116, 25, 484], [32, 19, 79, 120, 16, 1], [32, 19, 79, 301], [28, 11, 73, 3, 874, 5, 25, 49, 4, 534], [2, 43, 38], [40, 30, 42, 40, 875], [14, 13, 6, 520, 1, 1], [81], [28, 198, 60], [257, 136, 7, 2, 100, 6, 309, 1], [36, 66, 148, 580, 19, 272, 41, 96, 16, 1, 310, 54, 499], [1, 1, 750, 1, 1, 1, 1, 215], [24], [2, 17, 603, 81, 264, 622, 35, 3, 1, 37, 113, 1, 707, 18, 1, 5,

Analysis:
 - Notice overfitting around 150 epochs as the loss function first hits the lowest point and then fluxuates.