In [1]:
from google.colab import drive
# Mount Google Drive at /content/driver; the project paths defined later in
# this notebook are rooted at this mount point.
drive.mount('/content/driver')

Mounted at /content/driver


In [28]:
!pip install category_encoders



In [29]:
import os
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from category_encoders import OneHotEncoder

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Project folder on the mounted Drive; the data and artifact paths below are
# all built relative to it.
BASE_DIR = (
    '/content/driver'
    '/MyDrive/Colab Notebooks/NLP/Tweet'
)

### Load the Data

In [31]:
# Path of the pickled training frame (a file, despite the *_DIR name).
DATA_DIR = os.path.join(BASE_DIR, 'merged_training.pkl')

# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load pickles that come from a trusted source.
df = pd.read_pickle(DATA_DIR)

# Preview the first five rows.
df.head(5)

Unnamed: 0,text,emotions
27383,i feel awful about it too because it s my job ...,sadness
110083,im alone i feel awful,sadness
140764,ive probably mentioned this before but i reall...,joy
100071,i was feeling a little low few days back,sadness
2837,i beleive that i am much more sensitive to oth...,love


In [32]:
# Replace the original row labels with a fresh 0..n-1 index; the old index is
# discarded rather than kept as a column.
df = df.reset_index(drop=True)

In [33]:
# Drop rows that exactly repeat an earlier row (all columns equal), keeping
# the first occurrence of each.
df = df.drop_duplicates()

### Split the Data

In [34]:
# Model input (tweet text) and prediction target (emotion label).
X, y = df['text'], df['emotions']

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
train_dataset, tdataset, train_labels, tlabels = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Stage 2: split the hold-out 60/40 into validation and test, i.e. roughly
# 18% and 12% of all rows.
val_dataset, test_dataset, val_labels, test_labels = train_test_split(
    tdataset, tlabels, test_size=0.4, random_state=42
)

In [35]:
# Convert each split from a pandas Series to a plain Python list of strings
# before handing it to the tokenizer.
train_datasets, val_datasets, test_datasets = (
    split.to_list() for split in (train_dataset, val_dataset, test_dataset)
)

### Tokenizer

In [36]:
# Tokenizer / padding settings
vocab_size = 10000       # num_words handed to the Tokenizer
max_length = 70          # every sequence is padded or cut to this many tokens
oov_token = '<OOV>'      # placeholder for words outside the vocabulary
padding_type = 'post'    # pad at the end of a sequence
trunc_type = 'post'      # cut from the end of a sequence

# Instantiate the tokenizer and build its vocabulary from the training tweets only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_datasets)

# Word -> integer index mapping learned during fitting.
word_index = tokenizer.word_index


def _tokenize_and_pad(texts):
    """Return (integer sequences, padded array) for a list of texts.

    Uses the fitted module-level ``tokenizer`` and the padding settings above.
    """
    token_ids = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(
        token_ids,
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )
    return token_ids, padded


# Encode the training, validation and test splits with the same tokenizer.
sequences, train_padded = _tokenize_and_pad(train_datasets)
val_sequences, val_padded = _tokenize_and_pad(val_datasets)
test_sequences, test_padded = _tokenize_and_pad(test_datasets)

### Build the Model

In [37]:
embedding_dim = 16

# Seed TensorFlow's global random generator before any layer is created so
# that weight initialisation (and other TensorFlow-level randomness) starts
# from the same state on every run. 42 matches the random_state used for the
# train/validation/test splits.
tf.random.set_seed(42)

# Architecture: token embedding -> two stacked bidirectional LSTMs -> small
# dense layer -> softmax over the six emotion classes.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    # return_sequences=True keeps the per-token outputs for the next LSTM.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax')
])

# model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 70, 16)            160000    
                                                                 
 bidirectional_2 (Bidirectio  (None, 70, 128)          41472     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 16)                1040      
                                                                 
 dense_3 (Dense)             (None, 6)                 102       
                                                                 
Total params: 243,830
Trainable params: 243,830
Non-tr

In [38]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# targets are supplied as one-hot rows) and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

### Labeling

In [39]:
# One-hot encode the emotion labels. The encoder learns its categories from
# the training labels only and is then reused for the validation labels, so
# both frames share the same columns.
ohe = OneHotEncoder(use_cat_names=True)
ohe.fit(train_labels)
train_labels_encoded = ohe.transform(train_labels)
val_labels_encoded = ohe.transform(val_labels)

In [40]:
# Convert the encoded label frames to NumPy arrays for training.
# Note: this rebinds train_labels / val_labels, replacing the label Series
# produced by the split.
train_labels, val_labels = (
    np.array(train_labels_encoded),
    np.array(val_labels_encoded),
)

### Save the Labels

In [65]:
# Map each output index (column position of the one-hot frame) to a display
# name for its emotion. Column names are expected to look like
# "<feature>_<category>"; splitting only on the FIRST underscore keeps a
# category that itself contains underscores intact, whereas split("_")[1]
# would silently truncate it.
label = {
    i: column.split("_", 1)[1].capitalize()
    for i, column in enumerate(train_labels_encoded.columns)
}
label

{0: 'Surprise', 1: 'Fear', 2: 'Joy', 3: 'Anger', 4: 'Sadness', 5: 'Love'}

In [67]:
# Persist the index -> emotion map as JSON. JSON object keys are always
# strings, so the integer keys are written as "0", "1", ... and must be
# converted back by whoever reads the file.
LABELS_DIR = os.path.join(BASE_DIR, 'Labels.json')

with open(LABELS_DIR, mode='w') as labels_file:
  json.dump(label, labels_file)

### Train the Model

In [41]:
# Train for 10 epochs in mini-batches of 128, evaluating on the validation
# split after every epoch. The return value of fit() is kept in `history`.
history = model.fit(
    x=train_padded,
    y=train_labels,
    batch_size=128,
    epochs=10,
    validation_data=(val_padded, val_labels),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Save the Tokenizer

In [68]:
TOKENIZER_DIR = os.path.join(BASE_DIR, 'tokenizer_data.json')

# Tokenizer and padding settings that were used to encode the text.
tokenizer_params = dict(
    num_words=tokenizer.num_words,
    oov_token=tokenizer.oov_token,
    max_length=max_length,
    padding_type=padding_type,
    trunc_type=trunc_type,
)

# Bundle the settings with the learned vocabulary into one JSON document.
save_data = dict(
    tokenizer_params=tokenizer_params,
    word_index=tokenizer.word_index,
)

with open(TOKENIZER_DIR, mode='w') as tokenizer_file:
  json.dump(save_data, tokenizer_file)

### Save the Model

In [69]:
# Write the trained model to the project folder; the .h5 extension selects
# the HDF5 file format.
MODEL_DIR = os.path.join(BASE_DIR, 'tweet_model.h5')
model.save(filepath=MODEL_DIR)

### Load the Tokenizer

In [None]:
# Read the saved tokenizer bundle back from disk and unpack its two parts.
# This rebinds tokenizer_params and word_index to the values stored in the file.
with open(TOKENIZER_DIR, mode='r') as tokenizer_file:
  tokenizer_data = json.load(tokenizer_file)

tokenizer_params, word_index = (
    tokenizer_data['tokenizer_params'],
    tokenizer_data['word_index'],
)


### Save a Sample of the Test Set

In [76]:
TEST_DIR = os.path.join(BASE_DIR, 'test.csv')

# Export 20 tweets drawn from the test split (text only; the labels are not
# written). random_state is pinned to the same seed as the train/validation/
# test splits so that re-running the notebook produces the same file instead
# of a different random sample each time.
test_dataset.sample(n=20, random_state=42).to_csv(TEST_DIR, index=False)