In [1]:
from google.colab import drive
drive.mount('/content/driver')

Mounted at /content/driver


In [2]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.1


In [19]:
import os
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from category_encoders import OneHotEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
BASE_DIR = '/content/driver/MyDrive/Colab Notebooks/NLP/Tweet'

In [22]:
DATA_DIR = os.path.join(BASE_DIR, 'merged_training.pkl')

In [23]:
df = pd.read_pickle(DATA_DIR)
df.head()

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


In [6]:
df.reset_index(drop=True, inplace=True)

In [7]:
# Remove duplicated tweets
df.drop_duplicates(inplace=True)

In [8]:
# features
X = df['text']
# target
y = df['emotions']

train_dataset, tdataset, train_labels, tlabels= train_test_split(X, y, test_size=0.3, random_state=42)
val_dataset, test_dataset, val_labels, test_labels = train_test_split(tdataset, tlabels, test_size=.4, random_state=42)

In [9]:
train_datasets = train_dataset.to_list()
val_datasets = val_dataset.to_list()
test_datasets = test_dataset.to_list()

In [10]:
# parameters
vocab_size = 10000
max_length = 70
oov_token = '<OOV>'
padding_type = 'post'
trunc_type = 'post'

# Instiantiate the tokenizer
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_datasets)

# get word index.
word_index = tokenizer.word_index

# generate and pad the sequences for training dataset.
sequences = tokenizer.texts_to_sequences(train_datasets)
train_padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# generate and padded the sequences for validation dataset.
val_sequences = tokenizer.texts_to_sequences(val_datasets)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# generate and padded the sequences for testing dataset.
test_sequences = tokenizer.texts_to_sequences(test_datasets)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
embedding_dim = 16


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])

# model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 16)            160000    
                                                                 
 bidirectional (Bidirectiona  (None, 70, 64)           12544     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               24832     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 16)                1040      
                                                                 
 dense_1 (Dense)             (None, 6)                 102       
                                                                 
Total params: 198,518
Trainable params: 198,518
Non-trai

In [12]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
ohe = OneHotEncoder(use_cat_names=True)
train_labels_encoded = ohe.fit_transform(train_labels)
val_labels_encoded = ohe.transform(val_labels)

In [14]:
train_labels = np.array(train_labels_encoded)
val_labels = np.array(val_labels_encoded)

In [15]:
# fit the model
history = model.fit(
    train_padded,
    train_labels,
    epochs=5,
    batch_size=128,
    validation_data=(val_padded, val_labels)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Save the Tokenizer

In [45]:
TOKENIZER_DIR = os.path.join(BASE_DIR, 'tokenizer_data.json')

# tokenzier parameters
tokenizer_params = {
    "num_words": tokenizer.num_words,
    "oov_token": tokenizer.oov_token,
    "max_length": max_length,
    "padding_type": padding_type,
    "trunc_type": trunc_type
}

# save the tokenizer
save_data = {
    "tokenizer_params": tokenizer_params,
    "word_index": tokenizer.word_index
}

with open(TOKENIZER_DIR, 'w') as f:
  json.dump(save_data, f)

### Save the Model

In [25]:
MODEL_DIR = os.path.join(BASE_DIR, 'tweet_model.h5')
model.save(MODEL_DIR)

### Load the Tokenizer

In [46]:
with open(TOKENIZER_DIR, 'r') as f:
  tokenizer_data = json.load(f)

tokenizer_params = tokenizer_data['tokenizer_params']
word_index = tokenizer_data['word_index']


In [35]:
tweet = "i feel awful about it too because it s my job to get him in a position to succeed and it just didn t happen here"

a = tokenizer.texts_to_sequences([tweet])
pad_sequences(a, maxlen=100, padding='post', truncating='post')

array([[   2,    3,  467,   28,   13,   94,   37,   13,   84,   11,  332,
           5,   55,   82,   12,    7, 1132,    5, 2947,    4,   13,   32,
         259,   43,  541,  136,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)

In [47]:
token = Tokenizer(num_words=vocab_size, oov_token=oov_token)
token.word_index = word_index

b = token.texts_to_sequences([tweet])
pad_sequences(b, maxlen=100, padding='post', truncating='post')

array([[   2,    3,  467,   28,   13,   94,   37,   13,   84,   11,  332,
           5,   55,   82,   12,    7, 1132,    5, 2947,    4,   13,   32,
         259,   43,  541,  136,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0]], dtype=int32)