In [78]:
import pandas as pd
import numpy as np
import random
from tensorflow.keras import models, Sequential, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import gensim.downloader as api
import pickle
import gensim.downloader as api


In [3]:
df = pd.read_csv('/content/drive/My Drive/data/CNN_NLP/steeve_cleaned_data.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,content,sentiment
0,0,i didnt feel humiliated,sadness
1,1,i can go from feeling so hopeless to so damned...,sadness
2,2,im grabbing a minute to post i feel greedy wrong,anger
3,3,i am ever feeling nostalgic about the fireplac...,love
4,4,i am feeling grouchy,anger


In [5]:
df = df.rename(columns={'content': 'text', 'sentiment': 'emotion','Unnamed: 0': 'index'}).set_index('index')

In [6]:
df.head()

Unnamed: 0_level_0,text,emotion
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [7]:
df.emotion.value_counts().count()

6

In [8]:
df.shape

(58137, 2)

In [9]:
X = df.text
df.emotion = pd.Categorical(df.emotion)

In [10]:
df['code'] = df['emotion'].cat.codes

In [11]:
y = to_categorical(df['code'].values)

In [12]:
df.head()

Unnamed: 0_level_0,text,emotion,code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,i didnt feel humiliated,sadness,5
1,i can go from feeling so hopeless to so damned...,sadness,5
2,im grabbing a minute to post i feel greedy wrong,anger,0
3,i am ever feeling nostalgic about the fireplac...,love,3
4,i am feeling grouchy,anger,0


In [13]:
df.groupby(['emotion', 'code']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
emotion,code,Unnamed: 2_level_1
anger,0,4426.0
anger,1,
anger,2,
anger,3,
anger,4,
anger,5,
fear,0,
fear,1,11106.0
fear,2,
fear,3,


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [15]:
y_train

array([[0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

In [16]:
df.head()

Unnamed: 0_level_0,text,emotion,code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,i didnt feel humiliated,sadness,5
1,i can go from feeling so hopeless to so damned...,sadness,5
2,im grabbing a minute to post i feel greedy wrong,anger,0
3,i am ever feeling nostalgic about the fireplac...,love,3
4,i am feeling grouchy,anger,0


In [17]:
df.emotion.value_counts()

happiness    16298
sadness      11427
fear         11106
neutral       9398
love          5482
anger         4426
Name: emotion, dtype: int64

In [18]:
### Let's tokenize the vocabulary 
tk = Tokenizer()
tk.fit_on_texts(X)
vocab_size = len(tk.word_index) + 1
print(f'There are {vocab_size} different words in your corpus')
X_train_token = tk.texts_to_sequences(X_train)
X_test_token = tk.texts_to_sequences(X_test)


### Pad your inputs
X_train_pad = pad_sequences(X_train_token, maxlen=66, dtype='float32', padding='post' )
X_test_pad = pad_sequences(X_test_token, maxlen=66, dtype='float32', padding='post')

There are 40795 different words in your corpus


In [19]:
X_train_pad.shape

(40695, 66)

In [20]:
X_test_pad.shape

(17442, 66)

In [21]:
vocab_size

40795

In [80]:
print(list(api.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [None]:
glove_gensim = api.load('glove-wiki-gigaword-100')

In [84]:
glove_gensim_200  = api.load('glove-wiki-gigaword-200') #100 dimension

In [83]:
glove_twitter_200 = api.load('glove-twitter-200')



In [23]:
# Vectorize
vector_size = 100
gensim_weight_matrix = np.zeros((vocab_size ,vector_size))
gensim_weight_matrix.shape
for word, index in tk.word_index.items():
    if index < vocab_size: # since index starts with zero 
        if word in glove_gensim.wv.vocab:
            gensim_weight_matrix[index] = glove_gensim[word]
        else:
            gensim_weight_matrix[index] = np.zeros(100)

  import sys


In [24]:
def create_model(EMBEDDING_DIM = 100): # this means the embedding layer will create  a vector in 100 dimension
  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(input_dim = vocab_size,# the whole vocabulary size 
                            output_dim = EMBEDDING_DIM, # vector space dimension
                            input_length= X_train_pad.shape[1], # max_len of text sequence
                            weights = [gensim_weight_matrix],
                            trainable = False,
                            mask_zero=True))
  model_cnn.add(layers.Conv1D(10, kernel_size=3))
  model_cnn.add(layers.Flatten())
  #model_cnn.add(layers.Dense(60, activation="relu"))
  model_cnn.add(layers.Dense(30, activation="relu"))
  model_cnn.add(layers.Dense(6, activation="softmax"))
  model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
  return model_cnn

In [25]:
model = create_model()

In [26]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 66, 100)           4079500   
_________________________________________________________________
conv1d (Conv1D)              (None, 64, 10)            3010      
_________________________________________________________________
flatten (Flatten)            (None, 640)               0         
_________________________________________________________________
dense (Dense)                (None, 30)                19230     
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 186       
Total params: 4,101,926
Trainable params: 22,426
Non-trainable params: 4,079,500
_________________________________________________________________


In [27]:
es = callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [28]:
history = model.fit(X_train_pad, y_train, batch_size=8, epochs=100, verbose=1, validation_split=0.3, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [29]:
model.evaluate(X_test_pad, y_test)



[1.4092276096343994, 0.42970988154411316]

In [30]:
def create_model2(EMBEDDING_DIM = 100): # this means the embedding layer will create  a vector in 100 dimension
  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(input_dim = vocab_size,# the whole vocabulary size 
                            output_dim = EMBEDDING_DIM, # vector space dimension
                            input_length= X_train_pad.shape[1], # max_len of text sequence
                            weights = [gensim_weight_matrix],
                            trainable = False,
                            mask_zero=True))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=True)))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=True)))
  model_cnn.add(layers.Dropout(.2,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=False)))
  #model_cnn.add(layers.Dense(60, activation="relu"))
  model_cnn.add(layers.Dense(30, activation="relu"))
  model_cnn.add(layers.Dense(6, activation="softmax"))
  model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
  return model_cnn

In [31]:
model2 = create_model2()

In [32]:
es = callbacks.EarlyStopping(patience=3, restore_best_weights=True)

In [33]:
history2 = model2.fit(X_train_pad, y_train, batch_size=32, epochs=10, verbose=1, validation_split=0.3, callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


In [34]:
model2.evaluate(X_test_pad, y_test)



[1.038488507270813, 0.5892673134803772]

In [68]:
def create_model3(EMBEDDING_DIM = 100): # this means the embedding layer will create  a vector in 100 dimension
  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(input_dim = vocab_size,# the whole vocabulary size 
                            output_dim = EMBEDDING_DIM, # vector space dimension
                            input_length= X_train_pad.shape[1], # max_len of text sequence
                            weights = [gensim_weight_matrix],
                            trainable = False,
                            mask_zero=True))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=True)))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(200,return_sequences=True)))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=False)))
  #model_cnn.add(layers.Dense(60, activation="relu"))
  model_cnn.add(layers.Dense(30, activation="relu"))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Dense(6, activation="softmax"))
  model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
  return model_cnn

In [69]:
model3 = create_model3()

In [70]:
es = callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [71]:
history3 = model3.fit(X_train_pad, y_train, batch_size=32, epochs=100, verbose=1, validation_split=0.3, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100


In [66]:
model3.evaluate(X_test_pad, y_test)



[1.030755639076233, 0.5921912789344788]

In [67]:
models.save_model(
    model3, filepath="/content/drive/My Drive/Colab Notebooks/CNN NLP Model"
)



INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/CNN NLP Model/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/CNN NLP Model/assets


Model 4

Using more vectors and different word 2 vec models

In [87]:
# Vectorize
vector_size = 200
gensim_weight_matrix = np.zeros((vocab_size ,vector_size))
gensim_weight_matrix.shape
for word, index in tk.word_index.items():
    if index < vocab_size: # since index starts with zero 
        if word in glove_gensim_200.wv.vocab:
            gensim_weight_matrix[index] = glove_gensim_200[word]
        else:
            gensim_weight_matrix[index] = np.zeros(200)

  import sys


In [89]:
def create_model4(EMBEDDING_DIM = 200): # this means the embedding layer will create  a vector in 100 dimension
  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(input_dim = vocab_size,# the whole vocabulary size 
                            output_dim = EMBEDDING_DIM, # vector space dimension
                            input_length= X_train_pad.shape[1], # max_len of text sequence
                            weights = [gensim_weight_matrix],
                            trainable = False,
                            mask_zero=True))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=True)))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(200,return_sequences=True)))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(200,return_sequences=False)))
  #model_cnn.add(layers.Dense(60, activation="relu"))
  model_cnn.add(layers.Dense(30, activation="relu"))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Dense(6, activation="softmax"))
  model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
  return model_cnn

In [90]:
model4 = create_model4()

In [91]:
history4 = model4.fit(X_train_pad, y_train, batch_size=32, epochs=100, verbose=1, validation_split=0.3, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100


In [93]:
model4.evaluate(X_test_pad, y_test)



[0.9905635714530945, 0.6056071519851685]

Model 5

Using the Twitter Gensim

In [94]:
# Vectorize
vector_size = 200
gensim_weight_matrix = np.zeros((vocab_size ,vector_size))
gensim_weight_matrix.shape
for word, index in tk.word_index.items():
    if index < vocab_size: # since index starts with zero 
        if word in glove_twitter_200.wv.vocab:
            gensim_weight_matrix[index] = glove_twitter_200[word]
        else:
            gensim_weight_matrix[index] = np.zeros(200)

  import sys


In [96]:
def create_model5(EMBEDDING_DIM = 200): # this means the embedding layer will create  a vector in 100 dimension
  model_cnn = Sequential()
  model_cnn.add(layers.Embedding(input_dim = vocab_size,# the whole vocabulary size 
                            output_dim = EMBEDDING_DIM, # vector space dimension
                            input_length= X_train_pad.shape[1], # max_len of text sequence
                            weights = [gensim_weight_matrix],
                            trainable = False,
                            mask_zero=True))
  model_cnn.add(layers.Dropout(.5,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(100,return_sequences=True)))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(200,return_sequences=True)))
  model_cnn.add(layers.Dropout(.3,))
  model_cnn.add(layers.Bidirectional(layers.LSTM(200,return_sequences=False)))
  #model_cnn.add(layers.Dense(60, activation="relu"))
  model_cnn.add(layers.Dense(30, activation="relu"))
  model_cnn.add(layers.Dropout(.5,))
  model_cnn.add(layers.Dense(6, activation="softmax"))
  model_cnn.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
  return model_cnn

In [97]:
model5 = create_model5()

In [98]:
history5 = model5.fit(X_train_pad, y_train, batch_size=32, epochs=100, verbose=1, validation_split=0.3, callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100


In [99]:
model5.evaluate(X_test_pad, y_test)



[0.9587792158126831, 0.613232433795929]

In [100]:
models.save_model(
    model5, filepath="/content/drive/My Drive/Colab Notebooks/CNN NLP Model/model_7"
)



INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/CNN NLP Model/model_7/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/Colab Notebooks/CNN NLP Model/model_7/assets


In [168]:
def predict():
    sentence = str(input('Enter a sentence : '))
    sentence_lst=[]
    sentence_lst.append(sentence)
    sentence_seq=tk.texts_to_sequences(sentence_lst)
    sentence_padded=pad_sequences(sentence_seq,maxlen=1000,padding='post')
    ans=model5.predict(sentence_padded)
    labels = ['Anger', 'Fear', 'Happiness', 'Love','Neutral','Sadness']
    ans_dict = dict(zip(labels, ans[0]))
    return {k: float(v) for k, v in ans_dict.items()}


In [177]:
predict()

Enter a sentence : October arrived, spreading a damp chill over the grounds and into the castle. Madam Pomfrey, the nurse, was kept busy by a sudden spate of colds among the staff and students. Her Pepperup potion worked instantly, though it left the drinker smoking at the ears for several hours afterward. Ginny Weasley, who had been looking pale, was bullied into taking some by Percy. The steam pouring from under her vivid hair gave the impression that her whole head was on fire.  Raindrops the size of bullets thundered on the castle windows for days on end; the lake rose, the flower beds turned into muddy streams, and Hagrid's pumpkins swelled to the size of garden sheds. Oliver Wood's enthusiasm for regular training sessions, however, was not dampened, which was why Harry was to be found, late one stormy Saturday afternoon a few days before Halloween, returning to Gryffindor Tower, drenched to the skin and splattered with mud.  Even aside from the rain and wind it hadn't been a happ

{'Anger': 0.12196821719408035,
 'Fear': 0.12557420134544373,
 'Happiness': 0.08757888525724411,
 'Love': 0.009088932536542416,
 'Neutral': 0.00750353466719389,
 'Sadness': 0.6482862234115601}

In [198]:
def get_mood():
    sentence = str(input('Enter a sentence : '))
    sentence_lst=[]
    sentence_lst.append(sentence)
    sentence_seq=tk.texts_to_sequences(sentence_lst)
    sentence_padded=pad_sequences(sentence_seq,maxlen=1000,padding='post')
    ans=model5.predict(sentence_padded)
    labels = ['Anger', 'Fear', 'Happiness', 'Love','Neutral','Sadness']
    ans_dict = dict(zip(labels, ans[0]))
    for key,val in ans_dict.items():
      if (str(val)!=max(str(val))):
        return key


In [200]:
get_mood()

Enter a sentence : I hate you


'Anger'

In [None]:
open('tokenizer.pickle', 'wb')

In [None]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tk, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [165]:
# class Sentence():

#   def get_key(value):
#     dictionary={'anger':0,'fear':1,'happy':2,'love':3,'neutral':4,'sadness':5}
#     for key,val in dictionary.items():
#       if (val==value):
#         return key
  
#   def get_scores():
#     sentence = str(input('Enter a sentence : '))
#     sentence_lst=[]
#     sentence_lst.append(sentence)
#     sentence_seq=tk.texts_to_sequences(sentence_lst)
#     sentence_padded=pad_sequences(sentence_seq,maxlen=1000,padding='post')
#     ans=model5.predict(sentence_padded)
#     labels = ['Anger', 'Fear', 'Happiness', 'Love','Neutral','Sadness']
#     ans_dict = dict(zip(labels, ans[0]))
#     return {k: float(v) for k, v in ans_dict.items()}

#   def get_final_feeling():
#     pass

In [166]:
# new_text = Sentence

In [167]:
# new_text.get_scores()

Enter a sentence : Ebu likes cheese


{'Anger': 0.004795673303306103,
 'Fear': 0.10971111804246902,
 'Happiness': 0.2665919363498688,
 'Love': 0.11712934821844101,
 'Neutral': 0.4704040288925171,
 'Sadness': 0.03136785328388214}

In [None]:
# bi directional lstm
# dropout layers
# model.add(Bidirectional(LSTM(100,return_sequences=True)))