<a href="https://colab.research.google.com/github/kumar-abhishek/handson-ml2/blob/master/ECPE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Algorithm

1. Take the document, split into clauses
2. Find embeddings of the clauses
3. Feed embeddings of clauses into a Bi-LSTM layer(word-level), followed by attention layer 
4. Output of previous layer gets copied into 2 components.
5. 1 component is for emotion extraction and is a Bi-LSTM layer(clause-level)
6. 2nd compoent is for cause extraction and is a Bi-LSTM layer(clause-level)
7. Loss Lp of the whole model is the weighted sum of two components:
Lp=n*Le+(1-n)*Lc
where n is a hyper-param






In [0]:

import unicodedata
import re
import numpy as np
import os
import io
import time
import keras
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# input is sentence, output is emotion cause pairs
# Determine clauses by splitting on punctuation.

# preprocess

# Converts the unicode file to ascii
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')

def remove_nonascii(w):
  w = unicode_to_ascii(w.lower().strip())

  # creating a space between a word and the punctuation following it
  # eg: "he is a boy." => "he is a boy ."
  # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
  w = re.sub(r"([?.!,¿])", r" \1 ", w)
  w = re.sub(r'[" "]+', " ", w)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

  w = w.strip()
  return w


def preprocess_sentence(w):
  w = remove_nonascii(w)  
  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  w = '<start> ' + w + ' <end>'
  return w

In [0]:
def extract_cause(text):
  cur_cause=''
  try:
    cur_cause = re.findall('<cause>(.*?)<\\\cause>', text)[0]
    # Remove tags from line
    text=re.sub('<cause>', '', text)
    text=re.sub('<\\\cause>', '', text)
  except:
    pass
  #print('here:', text)
  return (cur_cause, text)

In [0]:
def clean_filter_clauses(all_clauses):
  cause = ''
  clauses=[]
  for clause in all_clauses:
    e_cause, e_text = extract_cause(clause)
    if e_cause!='':
      cause = remove_nonascii(e_cause)
    clauses.append(remove_nonascii(e_text))
  return cause, clauses



In [0]:
path_to_file = "data.txt"  
# 1. Remove any accents
# 2. Clean the sentences
# 3. Return word pairs in the format: [document, emotion, cause, clauses list]
document=[]
emotion=[]
cause=[]
clause=[]
def create_dataset(path, num_examples):
  lines = io.open(path, encoding='UTF-8').read().strip().split('\n')
  for i, line in enumerate(lines[:num_examples]):
    cur_emotion = re.findall('<(.*?)>', line)[0]
    # removing emotion tag in document
    text_without_emotion=line[2+len(cur_emotion):len(line)-len(cur_emotion)-3]
    #document.append(text_without_emotion)
    emotion.append(cur_emotion)

    # Determine clauses by splitting on punctuation.
    all_clauses = re.split("[.,!;:\"]+", text_without_emotion)
    filter_cause, filter_clauses = clean_filter_clauses(all_clauses)
    cause.append(filter_cause)
    clause.append([filter_clauses])
    doc = extract_cause(text_without_emotion)[1]
    # clean up document
    clean_doc = preprocess_sentence(doc)
    document.append(clean_doc)

    # clean up clauses
    # TODO
  return [document, emotion, cause, clause]

In [116]:
document, emotion, cause, clause_list = create_dataset(path_to_file, 5)
for i in range(5):
  print(document[i])
  print(emotion[i])
  print(cause[i])
  print(clause_list[i])
  print('\n--------\n')

<start> i suppose i am happy , being so tiny it means i am able to surprise people with what is generally seen as my confident and outgoing personality . <end>
happy
being so tiny
[['i suppose i am happy', 'being so tiny', 'it means i am able to surprise people with what is generally seen as my confident and outgoing personality', '']]

--------

<start> lennox has always truly wanted to fight for the world title and was happy , because he was taking the tough route . <end>
happy
because he was taking the tough route
[['lennox has always truly wanted to fight for the world title and was happy', 'because he was taking the tough route', '']]

--------

<start> he was a professional musician now , still sensitive and happy , doing something he loved . <end>
happy
doing something he loved
[['he was a professional musician now', 'still sensitive and happy', 'doing something he loved', '']]

--------

<start> holmes is happy , because , he has the freedom of the house when we are out . <end>

In [172]:
X=document
#y=list(zip(emotion, cause))
y=emotion
print(y)

['happy', 'happy', 'happy', 'happy', 'happy']


In [0]:
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)

  tensor = lang_tokenizer.texts_to_sequences(lang)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

  return tensor, lang_tokenizer

In [174]:
tokenizer=keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="xxxxxxx")
tokenizer.fit_on_texts(X)
tokenizer.fit_on_texts(X)
X_dict=tokenizer.word_index
print(len(X_dict))
print(X_dict.items())

77
dict_items([('xxxxxxx', 1), ('i', 2), ('start', 3), ('happy', 4), ('to', 5), ('end', 6), ('the', 7), ('was', 8), ('with', 9), ('and', 10), ('he', 11), ('my', 12), ('am', 13), ('so', 14), ('it', 15), ('is', 16), ('has', 17), ('because', 18), ('work', 19), ('suppose', 20), ('being', 21), ('tiny', 22), ('means', 23), ('able', 24), ('surprise', 25), ('people', 26), ('what', 27), ('generally', 28), ('seen', 29), ('as', 30), ('confident', 31), ('outgoing', 32), ('personality', 33), ('lennox', 34), ('always', 35), ('truly', 36), ('wanted', 37), ('fight', 38), ('for', 39), ('world', 40), ('title', 41), ('taking', 42), ('tough', 43), ('route', 44), ('a', 45), ('professional', 46), ('musician', 47), ('now', 48), ('still', 49), ('sensitive', 50), ('doing', 51), ('something', 52), ('loved', 53), ('holmes', 54), ('freedom', 55), ('of', 56), ('house', 57), ('when', 58), ('we', 59), ('are', 60), ('out', 61), ('had', 62), ('problems', 63), ('tutors', 64), ('trying', 65), ('encourage', 66), ('me', 6

In [0]:
X_seq=tokenizer.texts_to_sequences(X)
#X_seq[:2]

In [176]:
X_padded_seq=pad_sequences(X_seq,padding='post',maxlen=40)
X_padded_seq[:3]

array([[ 3,  2, 20,  2, 13,  4, 21, 14, 22, 15, 23,  2, 13, 24,  5, 25,
        26,  9, 27, 16, 28, 29, 30, 12, 31, 10, 32, 33,  6,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 34, 17, 35, 36, 37,  5, 38, 39,  7, 40, 41, 10,  8,  4, 18,
        11,  8, 42,  7, 43, 44,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0],
       [ 3, 11,  8, 45, 46, 47, 48, 49, 50, 10,  4, 51, 52, 11, 53,  6,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0]], dtype=int32)

In [186]:
print(X_padded_seq.shape)
y=np.array(y)

print(y.shape)

(5, 40)
(5,)


In [0]:
vocab_size = len(tokenizer.word_index) + 1

text_model = tf.keras.Sequential([tf.keras.layers.Embedding(input_length=40,input_dim=10000,output_dim=50),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation="relu"),
    #tf.keras.layers.Dense(1, activation="sigmoid"),
    tf.keras.layers.Dense(vocab_size, activation='softmax')

])

In [194]:
text_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

text_model.summary()


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 40, 50)            500000    
_________________________________________________________________
flatten_9 (Flatten)          (None, 2000)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 6)                 12006     
_________________________________________________________________
dense_19 (Dense)             (None, 78)                546       
Total params: 512,552
Trainable params: 512,552
Non-trainable params: 0
_________________________________________________________________


In [193]:
text_model.fit(X_padded_seq, y, epochs=10)

ValueError: ignored