In [14]:
pip install --upgrade joblib

Collecting joblib
  Obtaining dependency information for joblib from https://files.pythonhosted.org/packages/10/40/d551139c85db202f1f384ba8bcf96aca2f329440a844f924c8a0040b6d02/joblib-1.3.2-py3-none-any.whl.metadata
  Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)
Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
   ---------------------------------------- 302.2/302.2 kB 6.2 MB/s eta 0:00:00
Installing collected packages: joblib
  Attempting uninstall: joblib
    Found existing installation: joblib 0.9.4
    Uninstalling joblib-0.9.4:
      Successfully uninstalled joblib-0.9.4
Successfully installed joblib-1.3.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
from keras.models import load_model
import joblib
from keras.layers import Embedding

In [3]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import numpy as np
import pandas as pd

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
def preprocess_and_tokenize(data):
    """Clean a raw text, tokenize it with NLTK and stem every token.

    The cleaning steps run in a fixed order: HTML-like tags, URLs, hashtags
    and @names are deleted, then every non-word character and every digit is
    replaced by a space.

    Parameters
    ----------
    data : str
        Raw input text.

    Returns
    -------
    str
        The string form (``str(list)``) of the list of Porter stems, not the
        list itself.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        ("(<.*?>)", ""),           # html markup
        (r'http\S+', ''),          # urls
        (r"(#[\d\w\.]+)", ''),     # hashtags
        (r"(@[\d\w\.]+)", ''),     # @names
        ("(\\W|\\d)", " "),        # non-word characters and digits -> space
    )
    text = data
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Drop leading/trailing whitespace, then tokenize with nltk.
    tokens = word_tokenize(text.strip())

    # Stem with nltk's Porter stemmer.
    stemmer = PorterStemmer()
    stems = list(map(stemmer.stem, tokens))

    return str(stems)

In [28]:
# Load the four pre-trained classifiers from disk: two Keras models (.h5)
# and two objects serialised with joblib (.sav).
CNN = load_model('Saved_Models/CNN_w2v.h5')
biLSTM = load_model('Saved_Models/biLSTM_w2v.h5')
# joblib.load unpickles the file, which can run arbitrary code: only load trusted artifacts.
SVM = joblib.load('Saved_Models/tfidf_svm.sav')
# NOTE(review): this is the only model read from 'Final_Models/' instead of
# 'Saved_Models/' - confirm the directory is intentional.
RF = joblib.load('Final_Models/tfidf_rf.sav')

In [42]:
# Sample input used by every model below.
# `Message` is a one-element list (the form passed to the Keras tokenizer);
# `Message1` is the same text as a bare string.
Message = [' Very funny . Whats wrong with you today You are my secretary and you are not supposed to talk to me in that tone of voice . Do you know that']
Message1 = Message[0]

In [30]:
import pandas as pd

In [31]:
# Read the train and test splits and stack them into a single frame.
# NOTE(review): pd.concat keeps each frame's own index here, so index labels repeat
# (the printed frame ends at label 3392 although it has 11327 rows). Pass
# ignore_index=True if unique labels are ever needed.
data_train = pd.read_csv('data/data_train.csv', encoding='utf-8')
data_test = pd.read_csv('data/data_test.csv', encoding='utf-8')
data = pd.concat([data_train, data_test])
print(data)

      Emotion                                               Text
0     neutral   There are tons of other paintings that I thin...
1     sadness  Yet the dog had grown old and less capable , a...
2        fear  When I get into the tube or the train without ...
3        fear  This last may be a source of considerable disq...
4       anger  She disliked the intimacy he showed towards so...
...       ...                                                ...
3388  sadness  My sweetheart left me, or rather we decided to...
3389  sadness  Well , it's too bad that we like different kin...
3390  neutral                                      It sure is . 
3391  sadness  He ’ s got laid off again . I do feel sorry fo...
3392    anger  When stupid people push me during rush time in...

[11327 rows x 2 columns]


In [32]:
def clean_text(data):
    """Strip hashtags and @usernames from ``data`` and tokenize the rest.

    Parameters
    ----------
    data : str
        Raw input text.

    Returns
    -------
    The tokens produced by nltk's ``word_tokenize`` for the stripped text.
    """
    # Hashtags first, then @usernames (same order as before).
    without_hashtags = re.sub(r"(#[\d\w\.]+)", '', data)
    without_mentions = re.sub(r"(@[\d\w\.]+)", '', without_hashtags)

    # Tokenization using nltk.
    return word_tokenize(without_mentions)

In [33]:
import urllib.request
import zipfile
import os

# Path of the pre-trained vector file read by create_embedding_matrix below.
fname = 'embeddings/wiki-news-300d-1M.vec'

# Download and unpack the vectors only when they are not on disk yet, so that
# re-running the notebook does not fetch the archive again.
if not os.path.isfile(fname):
    archive_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip'
    archive_path = 'wiki-news-300d-1M.vec.zip'
    try:
        print('Downloading word vectors...')
        urllib.request.urlretrieve(archive_url, archive_path)
        print('Unzipping...')
        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            zip_ref.extractall('embeddings')
        print('done.')
    finally:
        # Remove the archive even when the download or extraction failed, so a
        # partial or corrupt zip is never left behind. The existence check keeps
        # a failed download from being masked by a FileNotFoundError.
        if os.path.exists(archive_path):
            os.remove(archive_path)

In [34]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    """Build an embedding weight matrix from a text file of word vectors.

    Every data line of ``filepath`` is expected to hold a word followed by its
    whitespace-separated vector components.

    Parameters
    ----------
    filepath : str
        Path of the vector file; it is read as UTF-8.
    word_index : dict
        Maps each word to the row it occupies in the matrix.
    embedding_dim : int
        Number of leading vector components kept for every word.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, embedding_dim)``. Rows whose
        word does not occur in the file stay all-zero.

    Notes
    -----
    Lines with fewer than ``embedding_dim`` components are skipped. This
    covers blank lines, truncated rows and, for ``embedding_dim >= 2``, the
    ``<count> <dim>`` header that fastText ``.vec`` files start with. Without
    the check a blank line raised ``ValueError`` and a header whose first
    token happened to be a vocabulary word was broadcast into a full row.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if len(parts) <= embedding_dim:
                # Not a complete "<word> <v1> ... <vN>" row.
                continue
            idx = word_index.get(parts[0])
            if idx is not None:
                # Keep only the first embedding_dim components.
                embedding_matrix[idx] = np.array(
                    parts[1:embedding_dim + 1], dtype=np.float32)
    return embedding_matrix

In [35]:
# Clean every text and re-join its tokens with single spaces, giving one
# whitespace-separated string per row for the Keras tokenizer.
texts = [' '.join(clean_text(text)) for text in data.Text]
# Preview a few entries only: printing the whole corpus floods the notebook output.
print(texts[:5])



In [36]:
# Build the word -> index vocabulary from the cleaned corpus.
# NOTE(review): the tokenizer is re-fitted here rather than loaded from disk. Its
# indices only match what the loaded networks expect if training used the same
# texts and the same preprocessing - verify.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# Show the vocabulary size and the first few entries instead of dumping the whole mapping.
print(len(tokenizer.word_index), list(tokenizer.word_index.items())[:10])



In [37]:
# word -> integer index mapping learned by the tokenizer
index_of_words = tokenizer.word_index
# number of components kept from each pre-trained vector
embed_num_dims = 300
# +1 because index 0 is reserved
vocab_size = len(index_of_words) + 1
# one row of pre-trained weights per vocabulary index
embedd_matrix = create_embedding_matrix(fname, index_of_words, embed_num_dims)
embedd_matrix.shape

(12088, 300)

In [38]:
# Equals the number of rows of embedd_matrix (len(word_index) + 1).
print(vocab_size)

12088


In [39]:
# Embedding layer that precedes the actual BiLSTM: non-trainable, initialised
# with the pre-trained vectors, for inputs of length 500 (the same length the
# sequences are padded to before prediction).
# NOTE(review): nothing in the visible cells uses embedd_layer - the networks are
# loaded from .h5 files - so confirm it is still needed.
embedd_layer = Embedding(vocab_size,
                         embed_num_dims,
                         input_length = 500,
                         weights = [embedd_matrix],
                         trainable=False)

In [43]:
# Input for the SVM / RF models: the stemmed string. It is always derived from the
# untouched `Message`, so re-running this cell no longer preprocesses a string
# that was already preprocessed.
Message1 = preprocess_and_tokenize(Message[0])
print(Message)
# Input for the neural networks: apply the same clean_text + join step the
# tokenizer was fitted on, so that e.g. contractions and hashtags are split or
# removed exactly as they were when the vocabulary was built.
Message_clean = [' '.join(clean_text(text)) for text in Message]
seq = tokenizer.texts_to_sequences(Message_clean)
print(seq)
# Pad to length 500, the input length used for the embedding layer.
pad = pad_sequences(seq, maxlen=500)
print(pad)

[' Very funny . Whats wrong with you today You are my secretary and you are not supposed to talk to me in that tone of voice . Do you know that']
[[26, 1141, 10249, 325, 18, 15, 282, 15, 51, 7, 3299, 5, 15, 51, 19, 690, 4, 298, 4, 13, 9, 11, 2088, 8, 532, 29, 15, 71, 11]]
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0

Traditional ML models (TF-IDF features with SVM and Random Forest)

In [44]:
# Message1 is the stemmed string returned by preprocess_and_tokenize, wrapped in a
# list so that predict receives a batch of one sample.
# presumably the .sav file holds a TF-IDF + SVM pipeline that accepts raw strings - verify
SVM.predict([Message1])

array(['anger'], dtype=object)

In [46]:
# Same single-sample batch as for the SVM: the stemmed string wrapped in a list.
# presumably the .sav file holds a TF-IDF + random-forest pipeline - verify
RF.predict([Message1])

array(['anger'], dtype=object)

CNN

In [47]:
# Label for each position of the networks' output vector.
# NOTE(review): the order must match the label encoding used at training time,
# which is not visible in this notebook - verify before trusting the printed labels.
class_names = ['joy', 'fear', 'anger', 'sadness', 'neutral']

In [24]:
# One row of class scores per padded input sequence.
pred = CNN.predict(pad)
print(pred)
# Take the argmax of the first (and only) row. np.argmax over the whole 2-D array
# returns an index into the flattened array, which is only a valid class index
# while the batch holds exactly one sample.
print('predicted: {}'.format(class_names[np.argmax(pred[0])]))

[[0.75919294 0.05924973 0.08518337 0.09234122 0.00403282]]
predicted: joy


biLSTM

In [25]:
# NOTE(review): `pred` was last assigned by the CNN cell, so this prints the CNN
# scores again, not the biLSTM's. The biLSTM prediction is only computed in the
# following cell.
print(pred)

[[0.75919294 0.05924973 0.08518337 0.09234122 0.00403282]]


In [26]:
# One row of class scores per padded input sequence.
pred = biLSTM.predict(pad)
# Display the biLSTM scores themselves, mirroring what the CNN cell prints.
print(pred)
# Argmax of the first (and only) row. np.argmax over the whole 2-D array would
# return an index into the flattened array.
print('predicted: {}'.format(class_names[np.argmax(pred[0])]))

predicted: fear
