In [1]:
import nltk, json, pandas as pd, numpy as np, pickle
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



Using TensorFlow backend.


In [5]:
def preprocess_text(x):
    for punct in '"!&?.,}-/<>#$%\()*+:;=?@[\\]^_`|\~':
        x = x.replace(punct, ' ')
    
    x = ' '.join(x.split())
    x = x.lower()
    
    return x

In [16]:
def create_utterances(filename, split):
    sentences, act_labels, emotion_labels, speakers, conv_id, utt_id = [], [], [], [], [], []
    
    lengths = []
    with open(filename, 'r') as f:
        for c_id, line in enumerate(f):
            s = eval(line)
            for u_id, item in enumerate(s['dialogue']):
                sentences.append(item['text'])
                act_labels.append(item['act'])
                emotion_labels.append(item['emotion'])
                conv_id.append(split[:2] + '_c' + str(c_id))
                utt_id.append(split[:2] + '_c' + str(c_id) + '_u' + str(u_id))
                speakers.append(str(u_id%2))
                
                # print("sentences: ", sentences)
                # print("act_labels: ", act_labels)
                # print("emotion_labels: ", emotion_labels)
                # print("conv_id: ", conv_id)
                # print("utt_id: ", utt_id)
                # print("speakers: ", speakers)
                
            # break
                # u_id += 1
                
    data = pd.DataFrame(sentences, columns=['sentence'])
    data['sentence'] = data['sentence'].apply(lambda x: preprocess_text(x))
    data['act_label'] = act_labels
    data['emotion_label'] = emotion_labels
    data['speaker'] = speakers
    data['conv_id'] = conv_id
    data['utt_id'] = utt_id
    
    return data

In [17]:

# if __name__ == '__main__':


train_data = create_utterances('./dailydialog/train.json', 'train')
valid_data = create_utterances('./dailydialog/valid.json', 'valid')
test_data = create_utterances('./dailydialog/test.json', 'test')

print(train_data)

# print(train_data["sentence"])
    


                                                sentence   act_label  \
0      say jim how about going for a few beers after ...   directive   
1      you know that is tempting but is really not go...  commissive   
2              what do you mean it will help us to relax    question   
3      do you really think so i don't it will just ma...    question   
4      i guess you are right but what shall we do i d...    question   
5      i suggest a walk over to the gym where we can ...   directive   
6      that's a good idea i hear mary and sally often...  commissive   
7      sounds great to me if they are willing we coul...      inform   
8                                    good let ' s go now   directive   
9                                              all right  commissive   
10                                   can you do push ups    question   
11     of course i can it's a piece of cake believe i...      inform   
12                      really i think that's impossible    ques

In [19]:
all_act_labels, all_emotion_labels = set(train_data['act_label']), set(train_data['emotion_label'])
print("all_act_labels: ", all_act_labels)
print("all_emotion_labels: ", all_emotion_labels)

all_act_labels:  {'question', 'commissive', 'directive', 'inform'}
all_emotion_labels:  {'no_emotion', 'anger', 'disgust', 'surprise', 'sadness', 'happiness', 'fear'}


In [20]:
act_label_encoder, emotion_label_encoder, act_label_decoder, emotion_label_decoder = {}, {}, {}, {}
for i, label in enumerate(all_act_labels):
    act_label_encoder[label] = i
    act_label_decoder[i] = label

for i, label in enumerate(all_emotion_labels):
    emotion_label_encoder[label] = i
    emotion_label_decoder[i] = label
    
print("act_label_encoder: ", act_label_encoder)
print("act_label_decoder: ", act_label_decoder)
print("emotion_label_encoder: ", emotion_label_encoder)
print("emotion_label_decoder: ", emotion_label_decoder)


act_label_encoder:  {'question': 0, 'commissive': 1, 'directive': 2, 'inform': 3}
act_label_decoder:  {0: 'question', 1: 'commissive', 2: 'directive', 3: 'inform'}
emotion_label_encoder:  {'no_emotion': 0, 'anger': 1, 'disgust': 2, 'surprise': 3, 'sadness': 4, 'happiness': 5, 'fear': 6}
emotion_label_decoder:  {0: 'no_emotion', 1: 'anger', 2: 'disgust', 3: 'surprise', 4: 'sadness', 5: 'happiness', 6: 'fear'}


In [21]:
pickle.dump(act_label_encoder, open('./dailydialog/act_label_encoder.pkl', 'wb'))
pickle.dump(act_label_decoder, open('./dailydialog/act_label_decoder.pkl', 'wb'))
pickle.dump(emotion_label_encoder, open('./dailydialog/emotion_label_encoder.pkl', 'wb'))
pickle.dump(emotion_label_decoder, open('./dailydialog/emotion_label_decoder.pkl', 'wb'))


In [22]:
def encode_labels(encoder, l):
    return encoder[l]

In [23]:
train_data['encoded_act_label'] = train_data['act_label'].map(lambda x: encode_labels(act_label_encoder, x))
test_data['encoded_act_label'] = test_data['act_label'].map(lambda x: encode_labels(act_label_encoder, x))
valid_data['encoded_act_label'] = valid_data['act_label'].map(lambda x: encode_labels(act_label_encoder, x))

train_data['encoded_emotion_label'] = train_data['emotion_label'].map(lambda x: encode_labels(emotion_label_encoder, x))
test_data['encoded_emotion_label'] = test_data['emotion_label'].map(lambda x: encode_labels(emotion_label_encoder, x))
valid_data['encoded_emotion_label'] = valid_data['emotion_label'].map(lambda x: encode_labels(emotion_label_encoder, x))

print(train_data)


                                                sentence   act_label  \
0      say jim how about going for a few beers after ...   directive   
1      you know that is tempting but is really not go...  commissive   
2              what do you mean it will help us to relax    question   
3      do you really think so i don't it will just ma...    question   
4      i guess you are right but what shall we do i d...    question   
5      i suggest a walk over to the gym where we can ...   directive   
6      that's a good idea i hear mary and sally often...  commissive   
7      sounds great to me if they are willing we coul...      inform   
8                                    good let ' s go now   directive   
9                                              all right  commissive   
10                                   can you do push ups    question   
11     of course i can it's a piece of cake believe i...      inform   
12                      really i think that's impossible    ques

In [24]:
## tokenize all sentences ##
all_text = list(train_data['sentence'])
tokenizer = Tokenizer()

In [25]:
tokenizer.fit_on_texts(all_text)

In [26]:
print(tokenizer)

<keras_preprocessing.text.Tokenizer object at 0x000000001C92CB70>


In [27]:
pickle.dump(tokenizer, open('./dailydialog/tokenizer.pkl', 'wb'))


In [28]:
## convert the sentences into sequences ##
train_sequence = tokenizer.texts_to_sequences(list(train_data['sentence']))
valid_sequence = tokenizer.texts_to_sequences(list(valid_data['sentence']))
test_sequence = tokenizer.texts_to_sequences(list(test_data['sentence']))

In [31]:
print(type(train_sequence))
print(train_sequence[0])

<class 'list'>
[146, 962, 30, 32, 74, 14, 5, 206, 3294, 159, 294]


In [33]:
print(tokenizer.sequences_to_texts([[146, 962, 30, 32, 74, 14, 5, 206, 3294, 159, 294]]))

['say jim how about going for a few beers after dinner']


In [34]:
train_data['sentence_length'] = [len(item) for item in train_sequence]
valid_data['sentence_length'] = [len(item) for item in valid_sequence]
test_data['sentence_length'] = [len(item) for item in test_sequence]

In [35]:
print(train_data)

                                                sentence   act_label  \
0      say jim how about going for a few beers after ...   directive   
1      you know that is tempting but is really not go...  commissive   
2              what do you mean it will help us to relax    question   
3      do you really think so i don't it will just ma...    question   
4      i guess you are right but what shall we do i d...    question   
5      i suggest a walk over to the gym where we can ...   directive   
6      that's a good idea i hear mary and sally often...  commissive   
7      sounds great to me if they are willing we coul...      inform   
8                                    good let ' s go now   directive   
9                                              all right  commissive   
10                                   can you do push ups    question   
11     of course i can it's a piece of cake believe i...      inform   
12                      really i think that's impossible    ques

In [36]:
max_num_tokens = 250

In [37]:
train_sequence = pad_sequences(train_sequence, maxlen=max_num_tokens, padding='post')
valid_sequence = pad_sequences(valid_sequence, maxlen=max_num_tokens, padding='post')
test_sequence = pad_sequences(test_sequence, maxlen=max_num_tokens, padding='post')

train_data['sequence'] = list(train_sequence)
valid_data['sequence'] = list(valid_sequence)
test_data['sequence'] = list(test_sequence)

In [47]:
print(train_data['sequence'][0])


[ 146  962   30   32   74   14    5  206 3294  159  294    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [46]:
print(tokenizer.sequences_to_texts([train_data['sequence'][0]]))
print(tokenizer.sequences_to_texts([[146, 962, 30, 32, 74, 14, 5, 206, 3294, 159, 294, 0, 0]]))

['say jim how about going for a few beers after dinner']
['say jim how about going for a few beers after dinner the you']


In [49]:
## save the data in pickle format ##
convSpeakers, convInputSequence, convInputMaxSequenceLength, convActLabels, convEmotionLabels = {}, {}, {}, {}, {}
train_conv_ids, test_conv_ids, valid_conv_ids = set(train_data['conv_id']), set(test_data['conv_id']), set(valid_data['conv_id'])
all_data = train_data.append(test_data, ignore_index=True).append(valid_data, ignore_index=True)


In [50]:
print(all_data)


                                                 sentence   act_label  \
0       say jim how about going for a few beers after ...   directive   
1       you know that is tempting but is really not go...  commissive   
2               what do you mean it will help us to relax    question   
3       do you really think so i don't it will just ma...    question   
4       i guess you are right but what shall we do i d...    question   
5       i suggest a walk over to the gym where we can ...   directive   
6       that's a good idea i hear mary and sally often...  commissive   
7       sounds great to me if they are willing we coul...      inform   
8                                     good let ' s go now   directive   
9                                               all right  commissive   
10                                    can you do push ups    question   
11      of course i can it's a piece of cake believe i...      inform   
12                       really i think that's impo

In [51]:
print(train_conv_ids)

{'tr_c9840', 'tr_c3935', 'tr_c8387', 'tr_c7553', 'tr_c9272', 'tr_c6985', 'tr_c4090', 'tr_c5116', 'tr_c281', 'tr_c4062', 'tr_c5578', 'tr_c10844', 'tr_c5170', 'tr_c1795', 'tr_c7534', 'tr_c9572', 'tr_c4432', 'tr_c4277', 'tr_c3136', 'tr_c8316', 'tr_c3365', 'tr_c5454', 'tr_c2907', 'tr_c542', 'tr_c2532', 'tr_c6431', 'tr_c1793', 'tr_c5462', 'tr_c8406', 'tr_c9852', 'tr_c1205', 'tr_c2109', 'tr_c10548', 'tr_c6779', 'tr_c10171', 'tr_c10442', 'tr_c3456', 'tr_c640', 'tr_c2022', 'tr_c1202', 'tr_c3443', 'tr_c8920', 'tr_c931', 'tr_c7640', 'tr_c7426', 'tr_c6554', 'tr_c9567', 'tr_c5709', 'tr_c1208', 'tr_c3187', 'tr_c3946', 'tr_c6722', 'tr_c3960', 'tr_c9241', 'tr_c3491', 'tr_c10563', 'tr_c7757', 'tr_c5383', 'tr_c4326', 'tr_c4524', 'tr_c10207', 'tr_c3045', 'tr_c9856', 'tr_c4555', 'tr_c437', 'tr_c6482', 'tr_c6373', 'tr_c9035', 'tr_c3969', 'tr_c8850', 'tr_c3183', 'tr_c5822', 'tr_c10108', 'tr_c4710', 'tr_c5228', 'tr_c1051', 'tr_c141', 'tr_c1728', 'tr_c3020', 'tr_c7907', 'tr_c9107', 'tr_c5595', 'tr_c5834', 't

In [55]:
print ('Preparing dataset. Hang on...')
for item in list(train_conv_ids) + list(test_conv_ids) + list(valid_conv_ids):

    df = all_data[all_data['conv_id'] == item]
    convSpeakers[item] = list(df['speaker'])
    convInputSequence[item] = list(df['sequence'])
    convInputMaxSequenceLength[item] = max(list(df['sentence_length']))
    convActLabels[item] = list(df['encoded_act_label'])
    convEmotionLabels[item] = list(df['encoded_emotion_label'])
    
    # print("df: ", df)
    # print("convSpeakers: ", convSpeakers)
    # print("convInputSequence: ", convInputSequence)
    # print("convInputMaxSequenceLength: ", convInputMaxSequenceLength)
    # print("convActLabels: ", convActLabels)
    # print("convEmotionLabels: ", convEmotionLabels)
    # 
    # break

Preparing dataset. Hang on...


In [56]:
pickle.dump([convSpeakers, convInputSequence, convInputMaxSequenceLength, convActLabels, convEmotionLabels,
             train_conv_ids, test_conv_ids, valid_conv_ids], open('./dailydialog/daily_dialogue.pkl', 'wb'))

In [57]:
word_index = tokenizer.word_index
print(word_index)



In [58]:
print(tokenizer.__dict__)



In [60]:
def load_pretrained_glove():
    print("Loading GloVe model, this can take some time...")
    glv_vector = {}
    f = open('./nlp-source/glove.840B.300d.txt', encoding='utf-8')

    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float')
            glv_vector[word] = coefs
        except ValueError:
            continue
    f.close()
    print("Completed loading pretrained GloVe model.")
    return glv_vector

In [61]:
## save pretrained embedding matrix ##
glv_vector = load_pretrained_glove()
word_vector_length = len(glv_vector['the'])
word_index = tokenizer.word_index
inv_word_index = {v: k for k, v in word_index.items()}
num_unique_words = len(word_index)
glv_embedding_matrix = np.zeros((num_unique_words+1, word_vector_length))

for j in range(1, num_unique_words+1):
    try:
        glv_embedding_matrix[j] = glv_vector[inv_word_index[j]]
    except KeyError:
        glv_embedding_matrix[j] = np.random.randn(word_vector_length)/200

np.ndarray.dump(glv_embedding_matrix, open('./dailydialog/glv_embedding_matrix', 'wb'))
print ('Done. Completed preprocessing.')

Loading GloVe model, this can take some time...
Completed loading pretrained GloVe model.
Done. Completed preprocessing.


In [None]:
print(inv_word_index)