# Emotion Classification in short texts with BERT



In [None]:
# Mount Google Drive; the dataset and model paths used below live under this mount point
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# install ktrain on Google Colab
!pip3 install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/4c/88/10d29578f47d0d140bf669d5598e9f5a50465ddc423b32031c65e840d003/ktrain-0.26.3.tar.gz (25.3MB)
[K     |████████████████████████████████| 25.3MB 119kB/s 
[?25hCollecting scikit-learn==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/f4/cb/64623369f348e9bfb29ff898a57ac7c91ed4921f228e9726546614d63ccb/scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 46.2MB/s 
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/0e/72/a3add0e4eec4eb9e2569554f7c70f4a3c27712f40e3284d483e88094cc0e/langdetect-1.0.9.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 48.4MB/s 
Collecting cchardet
[?25l  Downloading https://files.pythonhosted.org/packages/80/72/a4fba7559978de00cf44081c548c5d294bf00ac7dcda2db405d2baa8c67a/cchardet-2.1.7-cp37-cp37m-manylinux2010_x86_64.whl (263kB)
[K     |██████████████████████████

In [None]:
import pandas as pd
import numpy as np

import ktrain
from ktrain import text

In [None]:
# Import data
# Both splits are read from the same Drive folder; data_test is later used as
# the validation set during training.
DATA_DIR = '/content/drive/MyDrive/Capstone/data'

data_train = pd.read_csv(DATA_DIR + '/data_train.csv', encoding='utf-8')
data_test = pd.read_csv(DATA_DIR + '/data_test.csv', encoding='utf-8')

# Raw texts and string labels as plain Python lists
X_train = data_train.Text.tolist()
X_test = data_test.Text.tolist()

y_train = data_train.Emotion.tolist()
y_test = data_test.Emotion.tolist()

# Combined frame, used only for the overall class distribution and preview.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
data = pd.concat([data_train, data_test], ignore_index=True)

class_names = ['joy', 'sadness', 'fear', 'anger', 'neutral']

print('size of training set: %s' % (len(data_train['Text'])))
print('size of validation set: %s' % (len(data_test['Text'])))
print(data.Emotion.value_counts())

data.head(10)

size of training set: 7934
size of validation set: 3393
joy        2326
sadness    2317
anger      2259
neutral    2254
fear       2171
Name: Emotion, dtype: int64


Unnamed: 0,Emotion,Text
0,neutral,There are tons of other paintings that I thin...
1,sadness,"Yet the dog had grown old and less capable , a..."
2,fear,When I get into the tube or the train without ...
3,fear,This last may be a source of considerable disq...
4,anger,She disliked the intimacy he showed towards so...
5,sadness,When my family heard that my Mother's cousin w...
6,joy,Finding out I am chosen to collect norms for C...
7,anger,A spokesperson said : ` Glen is furious that t...
8,neutral,Yes .
9,sadness,"When I see people with burns I feel sad, actua..."


In [None]:
# Categorizing the labels
# Integer id for each emotion; the ids follow the index order of class_names.
encoding = {
    'joy': 0,
    'sadness': 1,
    'fear': 2,
    'anger': 3,
    'neutral': 4
}

# Integer values for each class.
# Read the string labels straight from the source frames instead of from
# y_train / y_test: those names are overwritten here (and again by the
# preprocessing cell), so encoding them in place would raise KeyError as soon
# as this cell is run a second time.
y_train = [encoding[label] for label in data_train.Emotion]
y_test = [encoding[label] for label in data_test.Emotion]

In [None]:
# Run ktrain's BERT preprocessing over the raw texts and integer labels.
# NOTE: this rebinds y_train / y_test to the preprocessed labels returned by ktrain.
train_split, test_split, preproc = text.texts_from_array(
    x_train=X_train,
    y_train=y_train,
    x_test=X_test,
    y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000,
)
x_train, y_train = train_split
x_test, y_test = test_split

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


task: text classification


## 2. Training and validation


In [None]:
# Build the BERT classifier, configured from the preprocessed training data
model = text.text_classifier(
    'bert',
    train_data=(x_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 350
done.


In [None]:
# Wrap the model and both data splits in a ktrain learner; the test split serves as validation data
BATCH_SIZE = 6
learner = ktrain.get_learner(
    model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=BATCH_SIZE,
)

In [None]:
# Train with the one-cycle policy: maximum learning rate 2e-5, 3 epochs
max_lr = 2e-5
n_epochs = 3
learner.fit_onecycle(max_lr, n_epochs)



begin training using onecycle policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f6dafd8b350>

### Validation

In [None]:
# Print the per-class precision/recall/F1 report on the held-out split;
# the returned confusion matrix is displayed as the cell output
learner.validate(
    val_data=(x_test, y_test),
    class_names=class_names,
)

              precision    recall  f1-score   support

         joy       0.88      0.84      0.86       707
     sadness       0.81      0.82      0.81       676
        fear       0.88      0.84      0.86       679
       anger       0.78      0.79      0.78       693
     neutral       0.77      0.83      0.80       638

    accuracy                           0.82      3393
   macro avg       0.82      0.82      0.82      3393
weighted avg       0.83      0.82      0.82      3393



array([[596,  13,  15,  14,  69],
       [ 13, 552,  23,  57,  31],
       [ 16,  33, 568,  48,  14],
       [ 18,  60,  27, 546,  42],
       [ 35,  26,  11,  34, 532]])

## 3. Testing with other inputs

In [None]:
# Pair the trained model with its preprocessor so raw strings can be classified directly
trained_model = learner.model
predictor = ktrain.get_predictor(trained_model, preproc)

# Display the class labels known to the predictor
predictor.get_classes()

['joy', 'sadness', 'fear', 'anger', 'neutral']

In [None]:
import time

# Classify a single ad-hoc sentence and report how long inference took
message = 'I just broke up with my boyfriend'

# perf_counter is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
prediction = predictor.predict(message)
elapsed_seconds = time.perf_counter() - start_time

# Label the number as a duration so it is not mistaken for a confidence score
print('predicted: {} (in {:.2f}s)'.format(prediction, elapsed_seconds))

predicted: sadness (0.10)


## 4. Saving the BERT model


In [None]:
# Persist the predictor to Google Drive so it can be reused in a later session
MODEL_PATH = "/content/drive/MyDrive/Capstone/models/bert_model"
predictor.save(MODEL_PATH)

