# Data preparation

## Load packages

In [None]:
import sys
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

## Load Data

In [None]:
# Read the raw emotion dataset from a hard-coded absolute path and give its
# columns fixed names so later cells can refer to `text` and `emotion`.
# NOTE(review): assumes the file parses into exactly three columns — confirm.
data_df = pd.read_csv('/emotion.data')
data_df.columns = ['id', 'text', 'emotion']

In [None]:
# Show how many rows carry each emotion label.
data_df.emotion.value_counts()

## Load embeddings

In [None]:
def load_embeddings(file):
    """
    Read a text embedding file into a word -> vector dictionary.

    Each usable line holds a word followed by its coefficients, all
    separated by single spaces.

    input: path to the embeddings file
    output: embedding index (dict mapping each word to a float32 numpy array)
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    embeddings_index = {}
    # The context manager closes the handle even if parsing fails part-way.
    # The encoding is kept unchanged so the resulting keys stay the same.
    with open(file, encoding='latin') as handle:
        for line in handle:
            fields = line.rstrip("\n").split(" ")
            # Skip blank or vector-less lines instead of storing a junk key.
            if len(fields) < 2:
                continue
            word, coefs = get_coefs(*fields)
            embeddings_index[word] = coefs
    return embeddings_index

In [None]:
# Assemble the path of the GloVe text file and load it into a word -> vector dict.
# NOTE(review): the folder is a hard-coded absolute path — adjust it for your environment.
emb_folder = "/glove6b50dtxt/"
emb_file_name = 'glove.6B.50d.txt'
emb_path = os.path.join(emb_folder, emb_file_name)
emb_glove = load_embeddings(emb_path)

# Data exploration

In [None]:
# Preview the first rows of the loaded data.
data_df.head()

In [None]:
def build_vocabulary(texts):
    """
    Count how often each whitespace-separated token occurs in the texts.

    input: collection of strings exposing .apply(...).values (e.g. a dataframe text column)
    output: dictionary mapping every word to its number of occurrences
    """
    print('build vocabulary')
    tokenized = texts.apply(lambda x: x.split()).values
    counts = {}
    for tokens in tqdm(tokenized):
        for token in tokens:
            # First sighting starts from zero, later ones increment the tally.
            counts[token] = counts.get(token, 0) + 1
    return counts

import operator
def check_coverage(vocab, embeddings_index):
    '''
    Measure how much of the vocabulary is covered by the embeddings.

    input: vocabulary (word -> count), embedding index (word -> vector)
    output: list of (word, count) pairs for the unknown words, most frequent first;
            also prints the vocabulary coverage of embeddings and the % of all
            text (word occurrences) covered by the embeddings
    '''
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in tqdm(vocab.keys()):
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except KeyError:
            # Only a missing word counts as "unknown"; any other error is a
            # real problem and must not be silently swallowed.
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
    # Guard the ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    nb_all_words = nb_known_words + nb_unknown_words
    vocab_share = len(known_words) / len(vocab) if len(vocab) else 0.0
    text_share = nb_known_words / nb_all_words if nb_all_words else 0.0
    print('Found embeddings for {:.3%} of vocabulary'.format(vocab_share))
    print('Found embeddings for {:.3%} of all text'.format(text_share))
    # Ascending sort followed by a reversal (rather than reverse=True) keeps
    # the ordering of equally frequent words exactly as before.
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]
    return unknown_words

## Build vocabulary

In [None]:
# Count word occurrences over the raw text column and report the vocabulary size.
vocabulary = build_vocabulary(data_df['text'])
print(f'Vocabulary size: {len(vocabulary)}')

In [None]:
# Measure embedding coverage of the raw vocabulary, then show the 20 most
# frequent words that have no embedding.
oof_emb = check_coverage(vocabulary, emb_glove)
oof_emb[0:20]

## Improve vocabulary coverage 

In [None]:
# Replacement table used by clean_contractions: each key is a token to look up
# and each value is the text substituted for it. Besides English contractions
# it also holds spelling variants and common misspellings.
# NOTE(review): the key 'youtu ' ends with a space, so it cannot match the
# space-split tokens produced in clean_contractions — confirm whether 'youtu' was meant.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

In [None]:
def known_contractions(embed):
    '''
    List the contractions that already have an entry in the embeddings.

    input: embedding index (any container supporting `in`, e.g. a word -> vector dict)
    output: known contractions (keys of contraction_mapping found in the embeddings)
    '''
    # `tqdm_notebook` is never imported in this notebook, so calling it raised
    # NameError; use the `tqdm` progress bar the other helpers already rely on.
    known = []
    for contract in tqdm(contraction_mapping):
        if contract in embed:
            known.append(contract)
    return known

def clean_contractions(text, mapping=contraction_mapping):
    '''
    Expand contractions (and any other mapped tokens) in a single text.

    input: current text, contraction mappings
    output: the text with apostrophe look-alikes normalised to "'" and every
            space-separated token found in the mapping replaced by its value
    '''
    # One translation pass turns every apostrophe variant into a plain quote.
    apostrophe_table = {ord(variant): "'" for variant in "’‘´`"}
    normalised = text.translate(apostrophe_table)
    # Tokens without a mapping entry are passed through untouched.
    return ' '.join(mapping.get(token, token) for token in normalised.split(" "))

In [None]:
# Expand contractions in every text using the default mapping; the `text`
# column is overwritten in place.
data_df['text'] = data_df['text'].apply(lambda x: clean_contractions(x))

In [None]:
# Rebuild the vocabulary from the current contents of the text column.
vocabulary = build_vocabulary(data_df['text'])
print(f'Vocabulary size: {len(vocabulary)}')

In [None]:
# Re-measure embedding coverage on the rebuilt vocabulary.
oof_emb = check_coverage(vocabulary, emb_glove)

In [None]:
# Extend the base table instead of repeating it: every entry of
# contraction_mapping is kept, in the same order, followed by a few extra
# tokens (apostrophe-less contractions and what look like HTML leftovers).
new_contraction_mapping = {
    **contraction_mapping,
    "href": " ", "shouldnt": "should not", "hadnt": "had not", "werent": "were not", "itll": "it will",
    "nofollow": "no follow", "theyve": "they have", "everyones": "everyone", "theyll": "they will",
}


In [None]:
# Apply the extended contraction mapping to the texts
data_df['text'] = data_df['text'].apply(lambda x: clean_contractions(x,new_contraction_mapping))

# Rebuild the vocabulary from the cleaned texts
vocabulary = build_vocabulary(data_df['text'])
print(f'Vocabulary size: {len(vocabulary)}')

# Re-check how much of the vocabulary the embeddings cover
oof_emb = check_coverage(vocabulary, emb_glove)

In [None]:
# Report how many distinct words still lack an embedding, with the 20 most frequent ones.
print(f'Not covered vocabulary: {len(oof_emb)}; first 20 most frequent: {list(oof_emb[0:20])}')

# Model

In [None]:
import numpy as np
from sklearn import preprocessing
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, SpatialDropout1D, Embedding, add
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import pickle

## Model parameters

In [None]:
EMBED_SIZE = 50  # width of each embedding vector (columns of the embedding matrix)
MAX_FEATURES = 200000  # passed to the Tokenizer as num_words
MAXLEN = 50  # maxlen given to pad_sequences and the model's input length

NO_EPOCHS = 25  # epochs argument of fit()
VERBOSE = 1  # verbosity flag handed to the Keras callbacks and fit()
PATIENCE = 5  # patience argument of EarlyStopping
BATCH_SIZE = 128  # batch_size argument of fit()
LSTM_UNITS = 128  # units of the LSTM wrapped by Bidirectional
DENSE_HIDDEN_UNITS = 256  # units of the hidden Dense layer that is added to the pooled LSTM output

## Tokenize

In [None]:
def tokenize(texts):
    """
    Fit a Keras Tokenizer (capped at MAX_FEATURES words) on the given texts.

    input: the texts to fit on
    output: (word index built by the tokenizer, the fitted tokenizer)
    """
    fitted = Tokenizer(num_words=MAX_FEATURES)
    fitted.fit_on_texts(texts)
    print(f"Found {len(fitted.word_index)} unique tokens.")
    return fitted.word_index, fitted

## Build embedding matrix

In [None]:
def build_embedding_matrix(word_index, emb_glove):
    """
    Assemble the weight matrix for the embedding layer.

    input: word index (word -> row number), embedding index (word -> vector)
    output: array of shape (len(word_index) + 1, EMBED_SIZE) whose row i holds
            the vector of the word with index i
    """
    print('build embedding matrix')
    matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
    for token, row in tqdm(word_index.items()):
        vector = emb_glove.get(token)
        if vector is None:
            # Tokens missing from the embeddings keep their all-zero row.
            continue
        matrix[row] = vector
    return matrix

## Model preparation

In [None]:
def prepare_model(data_df, tokenizer):
    """
    Turn the processed texts and their labels into train/validation arrays.

    input: dataframe with `text_proc` and `emotion` columns, fitted tokenizer
    output: x_train, x_val, y_train, y_val (80/20 split, random_state 42)

    NOTE(review): to_categorical is applied to `emotion` directly, so the column
    presumably has to hold integer class ids already — confirm with the caller.
    """
    print("build model")

    print("label encoding")

    # Texts -> integer sequences -> fixed-length rows of MAXLEN entries.
    padded = pad_sequences(tokenizer.texts_to_sequences(data_df['text_proc']), maxlen=MAXLEN)
    # Labels -> one-hot rows.
    one_hot = to_categorical(np.asarray(data_df.emotion.values))

    x_train, x_val, y_train, y_val = train_test_split(padded, one_hot, test_size = 0.2, random_state = 42)

    print('Shape of data tensor:', padded.shape)
    print('Shape of label tensor:', one_hot.shape)
    print(f'Data train/valid: {x_train.shape}/{x_val.shape} Label train/valid:  {y_train.shape}/{y_val.shape}')
    return x_train, x_val, y_train, y_val


## Build the model and train

In [None]:
def build_train_model(word_index, embedding_matrix, x_train, x_val, y_train, y_val):
    """
    Build the bidirectional-LSTM classifier on frozen embeddings and train it.

    input: word index, embedding matrix, train/validation sequences and one-hot labels
    output: the trained Keras model; during training the checkpoint callback
            writes the weights with the best validation accuracy to 'best_model.b5'
    """
    # One output unit per label column.
    DENSE_OUTPUT = y_val.shape[1]
    earlystopper = EarlyStopping(monitor='val_loss', patience=PATIENCE, verbose=VERBOSE)
    checkpointer = ModelCheckpoint('best_model.b5',
                                    monitor='val_acc',
                                    verbose=VERBOSE,
                                    save_best_only=True,
                                    save_weights_only=True)

    # Pre-trained vectors are loaded as fixed (non-trainable) weights.
    embedding_layer = Embedding(len(word_index) + 1,
                                EMBED_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAXLEN,
                                trainable=False)

    sequence_input = Input(shape=(MAXLEN,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    x = SpatialDropout1D(0.4)(embedded_sequences)
    x = Bidirectional(LSTM(LSTM_UNITS, return_sequences=True))(x)
    hidden = GlobalMaxPooling1D()(x)
    # Residual connection: both operands of add() must have the same width,
    # so DENSE_HIDDEN_UNITS has to match the pooled LSTM output
    # (presumably 2 * LSTM_UNITS — confirm before changing either constant).
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    # Softmax, not sigmoid: the labels are one-hot and the loss is categorical
    # cross-entropy, which expects the outputs to form a probability
    # distribution over the classes.
    result = Dense(DENSE_OUTPUT, activation='softmax')(hidden)

    model = Model(sequence_input, result)

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['acc'])
    print("run model - run and save the model pipeline")
    # validation_data is passed as a tuple, the form documented for fit().
    model.fit(x_train, y_train, validation_data=(x_val, y_val),
              epochs=NO_EPOCHS,
              batch_size=BATCH_SIZE,
              callbacks=[earlystopper, checkpointer], verbose=VERBOSE)
    print("run model - predict validation set")
    return model


## Validation

In [None]:
def validation_model(model, x_val, y_val):
    """
    Evaluate the model on the validation set and persist the best version.

    Prints loss/accuracy for the weights the model currently holds, then for
    the checkpointed weights in 'best_model.b5', prints a classification
    report and finally pickles the model to './emotion.model'.

    input: trained model, validation sequences, validation one-hot labels
    output: None
    """
    print("run model - predict validation set")
    score = model.evaluate(x_val, y_val, verbose=0)
    print(f'Last validation loss: {score[0]}, accuracy: {score[1]}')
    # load saved optimal model
    # `model_optimal` is the same object as `model`, so after this call the
    # caller's model also carries the checkpointed weights.
    model_optimal = model
    model_optimal.load_weights('best_model.b5')
    score = model_optimal.evaluate(x_val, y_val, verbose=0)
    print(f'Best validation loss: {score[0]}, accuracy: {score[1]}')

    print("run model - check prediction accuracy | precision | recall | F1-score")
    y_pred = model_optimal.predict(x_val)
    # Collapse one-hot labels / per-class scores to class indices.
    true_val = np.argmax(y_val, axis=1)
    pred_val = np.argmax(y_pred, axis=1)
    print(classification_report(true_val, pred_val))
    print("run model - completed validation")
    # Write through a context manager so the file is flushed and closed even
    # if pickling fails.
    # NOTE(review): pickling a Keras model may not work on every Keras
    # version; the format is kept here because readers of 'emotion.model'
    # may rely on it.
    with open(os.path.join('.', "emotion.model"), 'wb') as model_file:
        pickle.dump(model_optimal, model_file)
    print("best model saved")

In [None]:
def build_run_model(data_df, embeddings):
    """
    Run the whole pipeline: tokenize, build the embedding matrix, split,
    train and validate.

    input: dataframe with a `text_proc` column (plus whatever prepare_model reads), embedding index
    output: None
    """
    vocab_index, fitted_tokenizer = tokenize(data_df['text_proc'])
    weights = build_embedding_matrix(vocab_index, embeddings)
    # splits is (x_train, x_val, y_train, y_val), in that order.
    splits = prepare_model(data_df, fitted_tokenizer)
    trained = build_train_model(vocab_index, weights, *splits)
    validation_model(trained, splits[1], splits[3])

In [None]:
from sklearn.preprocessing import LabelEncoder
# `train` is another name for data_df, not a copy: the assignments below
# change data_df as well.
train = data_df
# The model code reads its input from the `text_proc` column.
train['text_proc'] = train['text']
train_y = train.emotion.values
# Replace the emotion labels with the codes produced by the LabelEncoder.
le = LabelEncoder()
le.fit(train_y)
train.emotion = le.transform(train_y)
# Lookup table from encoded value back to the label it was fitted on.
# NOTE(review): re-running this cell fits the encoder on the already-encoded
# column, so the table would then no longer contain the original label names.
le_name_mapping = dict(zip(le.transform(le.classes_), le.classes_))

In [None]:
import warnings
# Silence all Python warnings from here on.
# NOTE(review): this also hides any deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Run the pipeline step by step (the same calls build_run_model makes) so the
# intermediate objects stay available to the following cell.
word_index, tokenizer = tokenize(data_df['text_proc'])
embedding_matrix = build_embedding_matrix(word_index, emb_glove)
x_train, x_val, y_train, y_val = prepare_model(data_df, tokenizer)
model = build_train_model(word_index, embedding_matrix, x_train, x_val, y_train, y_val)

In [None]:
# Evaluate on the validation split, print the classification report and save the model.
validation_model(model, x_val, y_val)