<a href="https://colab.research.google.com/github/laume/nlp_emotions/blob/master/emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


## Sentiment Analysis: Emotion in Text

In a variation on the popular task of sentiment analysis, this dataset contains labels for the emotional content (such as happiness, sadness, and anger) of texts. Hundreds to thousands of examples across 13 labels.

https://www.figure-eight.com/data-for-everyone/


## Prepare environment

In [65]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [0]:
DATA_DIR = 'data/emotions'
SETUP = True

In [0]:
import os

In [0]:
if os.path.isdir(DATA_DIR):
    SETUP = False

### Install libraries and perform setup

In [6]:
if SETUP:
    !pip install --upgrade --quiet dlai
    !pip install -q -U toai==0.2.*
    !pip install -q -U nb_black
    !pip install -q -U tensorflow-datasets
    !pip install -q -U --no-deps tensorflow-addons~=0.6
    !pip install -q -U tensorflow_hub
    !pip install -q -U git+https://github.com/huggingface/transformers
    print(__import__("toai").__version__)
    print(f'dlai version: {__import__("dlai").__version__}, tf version: {__import__("tensorflow").__version__}')

[K     |████████████████████████████████| 380.8MB 43kB/s 
[K     |████████████████████████████████| 450kB 45.4MB/s 
[K     |████████████████████████████████| 3.8MB 49.6MB/s 
[K     |████████████████████████████████| 81kB 10.1MB/s 
[?25h  Building wheel for dlai (setup.py) ... [?25l[?25hdone
[31mERROR: tensorflow 1.15.0 has requirement tensorboard<1.16.0,>=1.15.0, but you'll have tensorboard 2.0.2 which is incompatible.[0m
[31mERROR: tensorflow 1.15.0 has requirement tensorflow-estimator==1.15.1, but you'll have tensorflow-estimator 2.0.1 which is incompatible.[0m
[31mERROR: tensorboard 2.0.2 has requirement grpcio>=1.24.3, but you'll have grpcio 1.15.0 which is incompatible.[0m
[31mERROR: google-colab 1.0.0 has requirement google-auth~=1.4.0, but you'll have google-auth 1.7.1 which is incompatible.[0m
[K     |████████████████████████████████| 153kB 9.8MB/s 
[K     |████████████████████████████████| 81kB 9.8MB/s 
[K     |████████████████████████████████| 552kB 50.3MB/s

In [7]:
import dlai
from dlai.imports import *
from toai.imports import *
from toai.data import DataBundle, DataParams, DataContainer
from toai.metrics import sparse_top_2_categorical_accuracy
from toai.utils import save_file, load_file
from toai.models import save_keras_model, load_keras_model
import tensorflow as tf
from tensorflow import keras
import tensorflow_addons as tfa
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import transformers



In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
%matplotlib inline

In [0]:
if SETUP:
    DATA_DIR = Path(DATA_DIR)
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    !cp '/content/gdrive/My Drive/text_emotion.csv' {DATA_DIR}

In [11]:
!ls {DATA_DIR}

text_emotion.csv


In [0]:
df = pd.read_csv(DATA_DIR/'text_emotion.csv')

In [56]:
df.head().T

Unnamed: 0,0,1,2,3,4
tweet_id,1956967341,1956967666,1956967696,1956967789,1956968416
sentiment,empty,sadness,sadness,enthusiasm,neutral
author,xoshayzers,wannamama,coolfunky,czareaquino,xkilljoyx
content,@tiffanylue i know i was listenin to bad habi...,Layin n bed with a headache ughhhh...waitin o...,Funeral ceremony...gloomy friday...,wants to hang out with friends SOON!,@dannycastillo We want to trade with someone w...


In [14]:
df.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
tweet_id,40000,,,,1845180000.0,118858000.0,1693960000.0,1751430000.0,1855440000.0,1962780000.0,1966440000.0
sentiment,40000,13.0,neutral,8638.0,,,,,,,
author,40000,33871.0,MissxMarisa,23.0,,,,,,,
content,40000,39827.0,I just received a mothers day card from my lov...,14.0,,,,,,,


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 4 columns):
tweet_id     40000 non-null int64
sentiment    40000 non-null object
author       40000 non-null object
content      40000 non-null object
dtypes: int64(1), object(3)
memory usage: 1.2+ MB


In [0]:
df = df[['content', 'sentiment']]

In [15]:
df.sentiment.value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [64]:
df[df.sentiment == 'neutral'][:10].content.values

array(['@dannycastillo We want to trade with someone who has Houston tickets, but no one will.',
       'cant fall asleep',
       'No Topic Maps talks at the Balisage Markup Conference 2009   Program online at http://tr.im/mL6Z (via @bobdc) #topicmaps',
       '@cynthia_123 i cant sleep', 'I missed the bl***y bus!!!!!!!!',
       'feels strong contractions but wants to go out.  http://plurk.com/p/wxidk',
       'SoCal!  stoked. or maybe not.. tomorrow',
       'Screw you @davidbrussee! I only have 3 weeks...',
       'has work this afternoon',
       '@GABBYiSACTiVE Aw you would not unfollow me would you? Then I would cry'],
      dtype=object)

In [0]:
def keep_values(df, col_name, values):
    return df.loc[df[col_name].isin(values), :].reset_index(drop=True)

In [0]:
df = keep_values(
    df, "sentiment", df["sentiment"].value_counts()[:5].index
)

In [0]:
# neutral      8638
# worry        8459
# happiness    5209
# sadness      5165
# love         3842
# surprise     2187
# fun          1776
# relief       1526
# hate         1323
# Name: sentiment, dtype: int64

In [20]:
df.sentiment.value_counts()

neutral      8638
worry        8459
happiness    5209
sadness      5165
love         3842
Name: sentiment, dtype: int64

In [0]:
def make_category_map(labels):
    return {x: i for i, x in enumerate(sorted(set(labels)))}

In [0]:
def init_category_map(filename, labels):
    try:
        category_map = load_file(filename)
    except:
        category_map = make_category_map(labels)
        save_file(category_map, filename)
    return category_map

In [23]:
category_map = init_category_map(
    DATA_DIR / "category_map.pickle", df["sentiment"].values
)
category_map

{'happiness': 0, 'love': 1, 'neutral': 2, 'sadness': 3, 'worry': 4}

In [24]:
n_categories = len(category_map)
n_categories

5

In [0]:
df.sentiment = df.sentiment.map(category_map)

In [0]:
data_container = DataContainer(
    *DataBundle.split(
        data_bundle=DataBundle.from_dataframe(
            dataframe=df, x_col="content", y_col="sentiment"
        ),
        fracs=[0.8, 0.1, 0.1],
    )
)

In [27]:
len(data_container.train), len(data_container.validation), len(data_container.test)

(25051, 3132, 3130)

In [0]:
class_weights = dict(
    enumerate(
        sk.utils.class_weight.compute_class_weight(
            "balanced", np.unique(data_container.train.y), data_container.train.y
        )
    )
)

In [29]:
class_weights

{0: 1.223790913531998,
 1: 1.6357166176950702,
 2: 0.7234945848375451,
 3: 1.2006230529595014,
 4: 0.7372277810476751}

In [0]:
def make_dataset_from_data_bundle(data_bundle):
    return tf.data.Dataset.from_tensor_slices((data_bundle.x, data_bundle.y))

In [0]:
data_container.train.dataset = make_dataset_from_data_bundle(data_container.train)
data_container.validation.dataset = make_dataset_from_data_bundle(
    data_container.validation
)
data_container.test.dataset = make_dataset_from_data_bundle(data_container.test)

In [32]:
data_container.train.x[0]

"i am the only arabic girl who's online  every one is  a sleep .."

In [33]:
data_container.train.y[0]

3

In [0]:
def make_sentence_length_limiter(limit):
    def inner(x, y):
        return tf.strings.substr(x, 0, limit), y

    return inner

In [0]:
length_limiter = make_sentence_length_limiter(500)

In [0]:
BATCH_SIZE = 8
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [0]:
train_dataset = (
    data_container.train.dataset.repeat()
    .shuffle(len(data_container.train))
    .batch(BATCH_SIZE)
    .map(length_limiter)
    .prefetch(AUTOTUNE)
)

In [38]:
for x, y in train_dataset.take(1):
    print(x.shape)
    print(y.shape)
    print(x[0])
    print(y[0])

(8,)
(8,)
tf.Tensor(b"I never order chips any more due to how unhealthy they are, but getting a burrito from Chipotle or Qdoba doesn't feel right without em", shape=(), dtype=string)
tf.Tensor(4, shape=(), dtype=int64)


In [0]:
validation_dataset = (
    data_container.validation.dataset.batch(BATCH_SIZE)
    .map(length_limiter)
    .prefetch(AUTOTUNE)
)

In [40]:
for x, y in validation_dataset.take(1):
    print(x.shape)
    print(y.shape)
    print(x[0])
    print(y[0])

(8,)
(8,)
tf.Tensor(b"@BB517  not even a little bit biKnightual  (ha ha... tweedeck doesn't see the spelling problem with BK....ha!) but love it anyway.", shape=(), dtype=string)
tf.Tensor(1, shape=(), dtype=int64)


In [0]:
train_dataset_steps = math.ceil(len(data_container.train) / BATCH_SIZE)

In [0]:
def train_model(
    model,
    epochs,
    lrs=None,
    optimizers=None,
    patience=5,
    verbose=1,
    log_dir=str(DATA_DIR / "logs"),
):
    if optimizers is None:
        optimizers = [keras.optimizers.Adam(lr) for lr in lrs]
    model.layers[0].trainable = False
    model.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=optimizers[0],
        metrics=[
            keras.metrics.sparse_categorical_accuracy,
            sparse_top_2_categorical_accuracy,
        ],
    )
    model.fit(
        train_dataset,
        steps_per_epoch=train_dataset_steps,
        validation_data=validation_dataset,
        epochs=epochs[0],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.3),
            keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
        ],
        class_weight=class_weights,
        verbose=verbose,
    )
    model.layers[0].trainable = True
    model.compile(
        loss=keras.losses.sparse_categorical_crossentropy,
        optimizer=optimizers[1],
        metrics=[
            keras.metrics.sparse_categorical_accuracy,
            sparse_top_2_categorical_accuracy,
        ],
    )
    model.fit(
        train_dataset,
        steps_per_epoch=train_dataset_steps,
        validation_data=validation_dataset,
        epochs=epochs[1],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=patience // 2, factor=0.3),
            keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            #             keras.callbacks.TensorBoard(log_dir=log_dir),
        ],
        class_weight=class_weights,
        verbose=verbose,
    )

In [0]:
def make_hub_model(url):
    return keras.Sequential(
        [
            hub.KerasLayer(url, dtype=tf.string, input_shape=[]),
            keras.layers.Dropout(0.5),
            keras.layers.Dense(n_categories, activation=keras.activations.softmax),
        ]
    )

In [0]:
def run_models(urls):
    for url in urls:
        model = make_hub_model(url)
        model_name = f"{url.split('/')[4]}"
        print(f" {model_name} ".center(80, "="))
        shutil.rmtree(str(DATA_DIR / model_name), ignore_errors=True)
        train_model(
            model=model,
            epochs=[2, 3],
            optimizers=[keras.optimizers.Adam(lr=1e-4), keras.optimizers.Adam(lr=3e-5)],
            patience=2,
            verbose=2,
            log_dir=str(DATA_DIR / model_name),
        )
        model.save(f"{DATA_DIR / model_name}.h5")
        save_keras_model(
            model,
            str(DATA_DIR / model_name / "architecture"),
            str(DATA_DIR / model_name / "weights"),
        )
        keras.backend.clear_session()
        del model
        keras.backend.clear_session()

In [0]:
model_urls = (
    "https://tfhub.dev/google/Wiki-words-250-with-normalization/2",
    # "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2",
)

In [47]:
# run_models(model_urls)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/Wiki-words-250-with-normalization/2'.
INFO:absl:Downloading https://tfhub.dev/google/Wiki-words-250-with-normalization/2: 880.00MB
INFO:absl:Downloaded https://tfhub.dev/google/Wiki-words-250-with-normalization/2, Total size: 970.91MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/Wiki-words-250-with-normalization/2'.


Train for 3813 steps, validate for 477 steps
Epoch 1/2
3813/3813 - 17s - loss: 2.0476 - sparse_categorical_accuracy: 0.2331 - sparse_top_2_categorical_accuracy: 0.4200 - val_loss: 2.0016 - val_sparse_categorical_accuracy: 0.2523 - val_sparse_top_2_categorical_accuracy: 0.4616
Epoch 2/2
3813/3813 - 16s - loss: 2.0060 - sparse_categorical_accuracy: 0.2564 - sparse_top_2_categorical_accuracy: 0.4521 - val_loss: 1.9830 - val_sparse_categorical_accuracy: 0.2662 - val_sparse_top_2_categorical_accuracy: 0.4723
Train for 3813 steps, validate for 477 steps
Epoch 1/3
3813/3813 - 260s - loss: 1.9789 - sparse_categorical_accuracy: 0.2780 - sparse_top_2_categorical_accuracy: 0.4753 - val_loss: 1.9534 - val_sparse_categorical_accuracy: 0.2927 - val_sparse_top_2_categorical_accuracy: 0.4980
Epoch 2/3
3813/3813 - 259s - loss: 1.9410 - sparse_categorical_accuracy: 0.2952 - sparse_top_2_categorical_accuracy: 0.5033 - val_loss: 1.9283 - val_sparse_categorical_accuracy: 0.3013 - val_sparse_top_2_categoric

INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2'.
INFO:absl:Downloaded https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2, Total size: 483.55MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/nnlm-en-dim128-with-normalization/2'.


Train for 3813 steps, validate for 477 steps
Epoch 1/2
3813/3813 - 15s - loss: 2.0754 - sparse_categorical_accuracy: 0.2250 - sparse_top_2_categorical_accuracy: 0.3987 - val_loss: 2.0067 - val_sparse_categorical_accuracy: 0.2609 - val_sparse_top_2_categorical_accuracy: 0.4626
Epoch 2/2
3813/3813 - 14s - loss: 2.0045 - sparse_categorical_accuracy: 0.2564 - sparse_top_2_categorical_accuracy: 0.4510 - val_loss: 1.9737 - val_sparse_categorical_accuracy: 0.2819 - val_sparse_top_2_categorical_accuracy: 0.4812
Train for 3813 steps, validate for 477 steps
Epoch 1/3
3813/3813 - 138s - loss: 1.9773 - sparse_categorical_accuracy: 0.2751 - sparse_top_2_categorical_accuracy: 0.4742 - val_loss: 1.9516 - val_sparse_categorical_accuracy: 0.2911 - val_sparse_top_2_categorical_accuracy: 0.5001
Epoch 2/3
3813/3813 - 137s - loss: 1.9472 - sparse_categorical_accuracy: 0.2983 - sparse_top_2_categorical_accuracy: 0.4997 - val_loss: 1.9305 - val_sparse_categorical_accuracy: 0.3040 - val_sparse_top_2_categoric

In [0]:
def evaluate_models(urls, versions):
    reports = {}
    for url in urls:
        for version in versions:
            model_name = f"{url.split('/')[4]}"
            print(f" {model_name} ".center(80, "="))
            try:
                model = keras.model.load_model(
                    f"{DATA_DIR / model_name}.h5",
                    custom_objects={"KerasLayer": hub.KerasLayer},
                )
            except:
                print(f"Loading architecture & weights separately")
                model = load_keras_model(
                    str(DATA_DIR / model_name / "architecture"),
                    str(DATA_DIR / model_name / "weights"),
                    custom_objects={"KerasLayer": hub.KerasLayer},
                )
            reports[model_name] = classification_report(
                data_container.validation.y,
                model.predict(validation_dataset).argmax(axis=1),
            )
            del model
    return reports

In [0]:
version_model_map = {"base": make_hub_model}

In [61]:
# reports = evaluate_models(model_urls, version_model_map.keys())

Loading architecture & weights separately


  'precision', 'predicted', average, warn_for)


Loading architecture & weights separately




















































In [62]:
# for model_name, report in reports.items():
#     print(f" {model_name} ".center(80, "="))
#     print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       185
           1       0.38      0.11      0.17       515
           2       0.00      0.00      0.00       117
           3       0.42      0.21      0.28       373
           4       0.33      0.51      0.40       915
           5       0.00      0.00      0.00       161
           6       0.33      0.00      0.00       508
           7       0.00      0.00      0.00       209
           8       0.29      0.72      0.41       830

    accuracy                           0.31      3813
   macro avg       0.19      0.17      0.14      3813
weighted avg       0.28      0.31      0.24      3813

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       185
           1       0.36      0.17      0.24       515
           2       0.00      0.00      0.00       117
           3       0.49      0.23      0.31       373
           4       0.37 

In [0]:
# ====================== Wiki-words-250-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       185
#            1       0.38      0.11      0.17       515
#            2       0.00      0.00      0.00       117
#            3       0.42      0.21      0.28       373
#            4       0.33      0.51      0.40       915
#            5       0.00      0.00      0.00       161
#            6       0.33      0.00      0.00       508
#            7       0.00      0.00      0.00       209
#            8       0.29      0.72      0.41       830

#     accuracy                           0.31      3813
#    macro avg       0.19      0.17      0.14      3813
# weighted avg       0.28      0.31      0.24      3813

# ====================== nnlm-en-dim128-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.00      0.00      0.00       185
#            1       0.36      0.17      0.24       515
#            2       0.00      0.00      0.00       117
#            3       0.49      0.23      0.31       373
#            4       0.37      0.46      0.41       915
#            5       0.00      0.00      0.00       161
#            6       0.00      0.00      0.00       508
#            7       0.00      0.00      0.00       209
#            8       0.28      0.75      0.41       830

#     accuracy                           0.32      3813
#    macro avg       0.17      0.18      0.15      3813
# weighted avg       0.25      0.32      0.25      3813


In [65]:
data_container.train.value_counts()

{0: 1408,
 1: 4125,
 2: 1077,
 3: 3105,
 4: 6869,
 5: 1215,
 6: 4149,
 7: 1790,
 8: 6762}

## Balance data for better accuracy

In [0]:
train_bundle = DataBundle.from_unbalanced(
    data_container.train,
)

In [48]:
train_bundle.value_counts()

{0: 4094, 1: 6126, 2: 6925, 3: 4173, 4: 6796}

In [0]:
data_container.train.dataset = make_dataset_from_data_bundle(train_bundle)

In [0]:
train_dataset = (
    data_container.train.dataset.repeat()
    .shuffle(len(train_bundle))
    .batch(BATCH_SIZE)
    .map(length_limiter)
    .prefetch(AUTOTUNE)
)

In [51]:
for x, y in train_dataset.take(1):
    print(x.shape)
    print(y.shape)
    print(x[0])
    print(y[0])

(8,)
(8,)
tf.Tensor(b'omg supernatural is on after good news week', shape=(), dtype=string)
tf.Tensor(0, shape=(), dtype=int64)


In [0]:
train_dataset_steps = math.ceil(len(train_bundle) / BATCH_SIZE)

In [53]:
train_dataset_steps

3515

After epochs=[2, 5] resulsts was better, decided to train more.

In [0]:
def run_models(urls, epochs):
    for url in urls:
        model = make_hub_model(url)
        model_name = f"{url.split('/')[4]}"
        print(f" {model_name} ".center(80, "="))
        shutil.rmtree(str(DATA_DIR / model_name), ignore_errors=True)
        train_model(
            model=model,
            epochs=epochs,
            optimizers=[keras.optimizers.Adam(lr=1e-4), keras.optimizers.Adam(lr=3e-5)],
            patience=2,
            verbose=2,
            log_dir=str(DATA_DIR / model_name),
        )
        model.save(f"{DATA_DIR / model_name}.h5")
        save_keras_model(
            model,
            str(DATA_DIR / model_name / "architecture"),
            str(DATA_DIR / model_name / "weights"),
        )
        keras.backend.clear_session()
        del model
        keras.backend.clear_session()

In [91]:
# run_models(model_urls)

Train for 6457 steps, validate for 477 steps
Epoch 1/2
6457/6457 - 23s - loss: 2.1567 - sparse_categorical_accuracy: 0.1796 - sparse_top_2_categorical_accuracy: 0.3196 - val_loss: 2.1879 - val_sparse_categorical_accuracy: 0.1259 - val_sparse_top_2_categorical_accuracy: 0.2814
Epoch 2/2
6457/6457 - 22s - loss: 2.1309 - sparse_categorical_accuracy: 0.2019 - sparse_top_2_categorical_accuracy: 0.3505 - val_loss: 2.1536 - val_sparse_categorical_accuracy: 0.1883 - val_sparse_top_2_categorical_accuracy: 0.3622
Train for 6457 steps, validate for 477 steps
Epoch 1/5
6457/6457 - 430s - loss: 2.0845 - sparse_categorical_accuracy: 0.2452 - sparse_top_2_categorical_accuracy: 0.3975 - val_loss: 2.1473 - val_sparse_categorical_accuracy: 0.1686 - val_sparse_top_2_categorical_accuracy: 0.3145
Epoch 2/5
6457/6457 - 429s - loss: 2.0273 - sparse_categorical_accuracy: 0.2810 - sparse_top_2_categorical_accuracy: 0.4358 - val_loss: 2.0899 - val_sparse_categorical_accuracy: 0.2046 - val_sparse_top_2_categoric

In [92]:
reports = evaluate_models(model_urls, version_model_map.keys())

Loading architecture & weights separately
Loading architecture & weights separately


In [93]:
for model_name, report in reports.items():
    print(f" {model_name} ".center(80, "="))
    print(report)

              precision    recall  f1-score   support

           0       0.17      0.10      0.13       185
           1       0.11      0.00      0.00       515
           2       0.12      0.44      0.19       117
           3       0.29      0.57      0.39       373
           4       0.40      0.36      0.38       915
           5       0.09      0.45      0.15       161
           6       0.41      0.03      0.05       508
           7       0.12      0.08      0.09       209
           8       0.39      0.34      0.36       830

    accuracy                           0.26      3813
   macro avg       0.23      0.26      0.19      3813
weighted avg       0.30      0.26      0.24      3813

              precision    recall  f1-score   support

           0       0.16      0.18      0.17       185
           1       0.22      0.00      0.01       515
           2       0.10      0.55      0.16       117
           3       0.27      0.60      0.37       373
           4       0.45 

In [0]:
# epochs=[2, 5]
# ====================== Wiki-words-250-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.17      0.10      0.13       185
#            1       0.11      0.00      0.00       515
#            2       0.12      0.44      0.19       117
#            3       0.29      0.57      0.39       373
#            4       0.40      0.36      0.38       915
#            5       0.09      0.45      0.15       161
#            6       0.41      0.03      0.05       508
#            7       0.12      0.08      0.09       209
#            8       0.39      0.34      0.36       830

#     accuracy                           0.26      3813
#    macro avg       0.23      0.26      0.19      3813
# weighted avg       0.30      0.26      0.24      3813

# ====================== nnlm-en-dim128-with-normalization =======================
#               precision    recall  f1-score   support

#            0       0.16      0.18      0.17       185
#            1       0.22      0.00      0.01       515
#            2       0.10      0.55      0.16       117
#            3       0.27      0.60      0.37       373
#            4       0.45      0.29      0.35       915
#            5       0.08      0.36      0.13       161
#            6       0.38      0.01      0.02       508
#            7       0.21      0.04      0.06       209
#            8       0.37      0.33      0.35       830

#     accuracy                           0.24      3813
#    macro avg       0.25      0.26      0.18      3813
# weighted avg       0.32      0.24      0.22      3813


In [51]:
run_models(model_urls, epochs=[2, 15])

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/Wiki-words-250-with-normalization/2'.
INFO:absl:Downloading https://tfhub.dev/google/Wiki-words-250-with-normalization/2: 790.00MB
INFO:absl:Downloaded https://tfhub.dev/google/Wiki-words-250-with-normalization/2, Total size: 970.91MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/Wiki-words-250-with-normalization/2'.


Train for 6434 steps, validate for 477 steps
Epoch 1/2
6434/6434 - 34s - loss: 2.1524 - sparse_categorical_accuracy: 0.1806 - sparse_top_2_categorical_accuracy: 0.3250 - val_loss: 2.1841 - val_sparse_categorical_accuracy: 0.1340 - val_sparse_top_2_categorical_accuracy: 0.2985
Epoch 2/2
6434/6434 - 32s - loss: 2.1314 - sparse_categorical_accuracy: 0.2017 - sparse_top_2_categorical_accuracy: 0.3510 - val_loss: 2.1454 - val_sparse_categorical_accuracy: 0.1886 - val_sparse_top_2_categorical_accuracy: 0.3669
Train for 6434 steps, validate for 477 steps
Epoch 1/15
6434/6434 - 1290s - loss: 2.0850 - sparse_categorical_accuracy: 0.2430 - sparse_top_2_categorical_accuracy: 0.3954 - val_loss: 2.1423 - val_sparse_categorical_accuracy: 0.1778 - val_sparse_top_2_categorical_accuracy: 0.3331
Epoch 2/15
6434/6434 - 1292s - loss: 2.0234 - sparse_categorical_accuracy: 0.2807 - sparse_top_2_categorical_accuracy: 0.4400 - val_loss: 2.0771 - val_sparse_categorical_accuracy: 0.2001 - val_sparse_top_2_categ







KeyboardInterrupt: ignored

In [64]:
reports = evaluate_models(model_urls, version_model_map.keys())

Loading architecture & weights separately


FileNotFoundError: ignored

In [0]:
for model_name, report in reports.items():
    print(f" {model_name} ".center(80, "="))
    print(report)

## Use Bert

In [36]:
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-cased")

100%|██████████| 213450/213450 [00:00<00:00, 395599.89B/s]


In [0]:
def tokenize_and_pad(arr, tokenizer, sequence_length):
    return np.array(
        [
            np.pad(
                tokenizer.encode(x, add_special_tokens=True),
                (0, sequence_length),
                "constant",
                constant_values=tokenizer.pad_token_id,
            )[:sequence_length]
            for x in arr
        ]
    )

In [0]:
data_container.train.x = tokenize_and_pad(train_bundle.x, tokenizer, 64)
data_container.validation.x = tokenize_and_pad(data_container.validation.x, tokenizer, 64)
data_container.test.x = tokenize_and_pad(data_container.test.x, tokenizer, 64)

In [39]:
len(data_container.train.x)

51690

In [0]:
def make_bert_dataset(data_bundle, tokenizer):
    features = tf.data.Dataset.from_tensor_slices(data_bundle.x)
    labels = tf.data.Dataset.from_tensor_slices(data_bundle.y)
    dataset = tf.data.Dataset.zip((features, labels)).map(
        lambda x, y: (
            {
                "input_ids": x,
                "attention_mask": int(x != tokenizer.pad_token_id),
                "token_type_ids": tf.zeros_like(x),
            },
            y,
        )
    )
    return dataset

In [0]:
base_bert_dataset = make_bert_dataset(data_container.train, tokenizer)

In [0]:
train_bert_dataset = (
    base_bert_dataset.cache()
    .repeat()
    .shuffle(len(data_container.train.x))
    .batch(BATCH_SIZE)
    .prefetch(AUTOTUNE)
)

In [0]:
validation_bert_dataset = (
    make_bert_dataset(data_container.validation, tokenizer).batch(BATCH_SIZE).prefetch(AUTOTUNE)
)

In [0]:
test_bert_dataset = (
    make_bert_dataset(data_container.test, tokenizer).batch(BATCH_SIZE).prefetch(AUTOTUNE)
)

In [0]:
def init_label_map(filename, data_bundle):
    try:
        label_map = load_file(filename)
    except:
        label_map = data_bundle.make_label_map()
        save_file(label_map, filename)
    return label_map

In [0]:
label_map = init_label_map(DATA_DIR / "label_map.pickle", data_container.train)

In [0]:
!pip install -q -U toai

In [98]:
from toai.data import DataBundle, DataParams, DataContainer



In [0]:
data_container = DataContainer(
    base=base_bert_dataset,
    train=train_bert_dataset,
    train_steps=len(data_container.train.x) // BATCH_SIZE,
    validation=validation_bert_dataset,
    test=test_bert_dataset,
    label_map=label_map,
)

In [53]:
data_container.n_classes

9

In [0]:
def train_model(
    model,
    data_container,
    epochs,
    lrs=None,
    optimizers=None,
    patience=5,
    verbose=1,
    class_weights=None,
    log_dir=str(DATA_DIR / "logs"),
):
    model.layers[0].trainable = False
    if optimizers is None:
        optimizers = [keras.optimizers.Adam(lr) for lr in lrs]
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizers[0],
        metrics=[
            keras.metrics.sparse_categorical_accuracy,
            sparse_top_2_categorical_accuracy,
        ],
    )
    model.fit(
        data_container.train,
        steps_per_epoch=data_container.train_steps,
        validation_data=data_container.validation,
        epochs=epochs[0],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.3),
            keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
        ],
        class_weight=class_weights,
        verbose=verbose,
    )
    model.layers[0].trainable = True
    model.compile(
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizers[1],
        metrics=[
            keras.metrics.sparse_categorical_accuracy,
            sparse_top_2_categorical_accuracy,
        ],
    )
    model.fit(
        data_container.train,
        steps_per_epoch=data_container.train_steps,
        validation_data=data_container.validation,
        epochs=epochs[1],
        callbacks=[
            keras.callbacks.ReduceLROnPlateau(patience=patience // 2, factor=0.3),
            keras.callbacks.EarlyStopping(patience=patience, restore_best_weights=True),
            keras.callbacks.TensorBoard(log_dir=log_dir),
        ],
        class_weight=class_weights,
        verbose=verbose,
    )

In [56]:
config = transformers.BertConfig.from_pretrained(
    "bert-base-cased", num_labels=data_container.n_classes
)

100%|██████████| 313/313 [00:00<00:00, 62156.96B/s]


In [0]:
class TFBertForSequenceClassification(transformers.TFBertPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super(TFBertForSequenceClassification, self).__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.bert = transformers.TFBertMainLayer(config, name="bert")
        self.dropout1 = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.hidden1 = tf.keras.layers.Dense(1024, activation=tf.keras.activations.relu)
        self.dropout2 = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels,
            kernel_initializer=transformers.modeling_tf_utils.get_initializer(
                config.initializer_range
            ),
            name="classifier",
        )

    def call(self, inputs, **kwargs):
        outputs = self.bert(inputs, **kwargs)

        pooled_output = outputs[1]

        pooled_output = self.dropout1(
            pooled_output, training=kwargs.get("training", False)
        )
        hidden = self.dropout2(
            self.hidden1(pooled_output), training=kwargs.get("training", False)
        )
        logits = self.classifier(hidden)

        outputs = (logits,)

        return outputs


In [58]:
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-cased", config=config
)

100%|██████████| 526681800/526681800 [00:44<00:00, 11892963.36B/s]


In [59]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
dense (Dense)                multiple                  787456    
_________________________________________________________________
dropout_38 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  9225      
Total params: 109,106,953
Trainable params: 109,106,953
Non-trainable params: 0
_________________________________________________________________


In [0]:
class_weights = dict(
    enumerate(
        sk.utils.class_weight.compute_class_weight(
            "balanced", np.unique(train_bundle.y), train_bundle.y
        )
    )
)

In [64]:
train_model(
    model,
    data_container,
    [0, 5],
    [3e-6, 1e-6],
    class_weights=class_weights,
    patience=2,
)

Train for 6461 steps, validate for 477 steps
Train for 6461 steps, validate for 477 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5


In [65]:
print(
    classification_report(
        [label.numpy() for _, label in data_container.validation.unbatch()],
        model.predict(data_container.validation).argmax(axis=1),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       160
           1       0.00      0.00      0.00       573
           2       0.00      0.00      0.00       143
           3       0.00      0.00      0.00       351
           4       0.26      0.56      0.35       863
           5       0.00      0.00      0.00       163
           6       0.00      0.00      0.00       528
           7       0.00      0.00      0.00       190
           8       0.25      0.58      0.35       842

    accuracy                           0.25      3813
   macro avg       0.06      0.13      0.08      3813
weighted avg       0.11      0.25      0.16      3813



  'precision', 'predicted', average, warn_for)
