# Textbook Name Extraction
### by Jai Smith (Jai.K.Smith.22@dartmouth.edu)

In [3]:
import json
import os
import shutil
from bs4 import BeautifulSoup
import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_addons.text as text
# import tensorflow_addons.optimizers as optimizers

import constants
from modules import loader, preprocessing

ModuleNotFoundError: No module named 'tensorflow_hub'

## Download source data

**Note:** `books.csv` dataset cannot be downloaded automatically, please download and save to `data/raw/books.csv`.

In [16]:
os.makedirs('data/raw')

FILENAMES = [constants.TIMETABLE_FILENAME, constants.ORC_FILENAME]
for filename in FILENAMES:
    if not os.path.isfile('data/raw/' + filename):
        loader.fetch_source(constants.DATA_SOURCE, filename)

## Prepare dataset for training

Before running this step, please create the `data/processed` folder containing `positive/` and `negative/` subfolders.

In [2]:
# clear and recreate processed dir (empty)
if os.path.isdir('data/processed'):
    shutil.rmtree('data/processed', ignore_errors=True)
os.makedirs('data/processed/train/positive')
os.mkdir('data/processed/train/negative')
os.makedirs('data/processed/test/positive')
os.mkdir('data/processed/test/negative')

# generate dataset
preprocessing.generate_positive()
preprocessing.generate_negative()

## Configure tf dataset

In [17]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# training set
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'data/processed/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)

class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# validation set
val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'data/processed/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)

val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# testing set
test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'data/processed/test',
    batch_size=batch_size
)

test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 18914 files belonging to 2 classes.
Using 15132 files for training.
Found 18914 files belonging to 2 classes.
Using 3782 files for validation.
Found 3340 files belonging to 2 classes.


## Select BERT model

[see options here](https://www.tensorflow.org/text/tutorials/classify_text_with_bert#choose_a_bert_model_to_fine-tune)

In [20]:
# # use ALBERT
# tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/albert_en_base/2'
# tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/albert_en_preprocess/3'

# use small bert
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1'
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

AttributeError: module 'tensorflow_addons.text' has no attribute 'normalize_utf8'

## Define model

Based off binary text classification example [here](https://www.tensorflow.org/text/tutorials/classify_text_with_bert)

In [None]:
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

classifier_model = build_classifier_model()

## Define loss function

In [None]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

## Define optimizer

In [None]:
epochs = 5
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

## Load BERT

In [None]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

## Train

In [None]:
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

## Test

In [None]:
loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')

## Training stats

In [None]:
history_dict = history.history
print(history_dict.keys())

acc = history_dict['binary_accuracy']
val_acc = history_dict['val_binary_accuracy']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc) + 1)
fig = plt.figure(figsize=(10, 6))
fig.tight_layout()

plt.subplot(2, 1, 1)
# "bo" is for "blue dot"
plt.plot(epochs, loss, 'r', label='Training loss')
# b is for "solid blue line"
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
# plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(2, 1, 2)
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')

## Export

In [None]:
dataset_name = 'textbook_name_recognition'
saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

classifier_model.save(saved_model_path, include_optimizer=False)

# !zip -r /content/textbook_name_recognition_bert.zip /content/textbook_name_recognition_bert

## Import

In [22]:
reloaded_classifier_model = tf.saved_model.load('textbook_name_recognition_bert')

FileNotFoundError: Op type not registered 'CaseFoldUTF8' in binary running on Jais-MacBook-Pro.local. Make sure the Op and Kernel are registered in the binary running in this process. Note that if you are loading a saved graph which used ops from tf.contrib, accessing (e.g.) `tf.contrib.resampler` should be done before importing the graph, as contrib ops are lazily registered when the module is first accessed.
 If trying to load on a different device from the computational device, consider using setting the `experimental_io_device` option on tf.saved_model.LoadOptions to the io_device such as '/job:localhost'.

## Sample usage

In [23]:
def print_examples(inputs, results):
  for i in range(len(inputs)):
    print('input: {}\n\tscore: {:.3f}%'.format(inputs[i], results[i][0] * 100))

examples = [
  'harry potter by jk. rowling',
  'Intro to Data Structures, Cormen et. al, 2020',
  'You will be required to purchase these textbooks',
  'required readings:',
  'harry potter, 339 pages',
  'What is gender',
  'Al-kitaab fii Ta\'allum al-\'Arabiyya, with DVDs, a Textbook for Intermediate Arabic, Part two, third Edition Georgetown University Press, Washington',
  'intro to gender studies',
  'intro to chemistry',
  'Embodied psychologies'
]

results = tf.sigmoid(classifier_model(tf.constant(examples)))

print_examples(examples, results)

NameError: name 'classifier_model' is not defined