# DAG - Movie Review Sentiment Classifier Trainer 

## Summary

The present document shows how the model must be created and trained, and evaluates how well it performs on some anecdotal test scenarios. Once the training logic is ready, the code presented here must become an Airflow Operator that will then be referenced in the Model Training DAG.

## Settings

### Hyper Parameters

In [None]:
# Length every tokenised review is padded/truncated to.
seq_len = 500
# Vocabulary cap handed to the tokenizer and the embedding layer.
vocab_size = 10000
# Number of samples per training batch.
batch_size = 50
# Number of passes over the training split.
epochs = 5
# Dimensionality of the embedding vectors.
emb_dims = 64
# Number of units in the LSTM layer.
lstm_units = 128

### Storage

In [None]:
# Zip archive from which the training CSV is read.
imdb_sentiment_path = "data/raw/imdb-sentiment.zip"

In [None]:
# Location the trained model is saved to; the tokenizer JSON is written
# under its assets/ sub-folder.
output_model_path = "model/movie-sentiment-classifier"

## Environment

### Dependencies

In [None]:
!pip install -U numpy
!pip install -U pandas
!pip install -U tensorflow
!pip install -U scikit-learn
!pip install -U matplotlib

### Imports

In [None]:
import json
import os
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

## Data

### Loading

In [None]:
# Read the training split straight out of the zip archive. Both the archive
# and the member handle are closed as soon as the frame has been loaded.
with zipfile.ZipFile(imdb_sentiment_path) as zip_file:
    with zip_file.open('train.csv') as csv_file:
        df = pd.read_csv(
            csv_file,
            header=0,
            # `error_bad_lines` was deprecated in pandas 1.3 and removed in
            # 2.0; 'warn' keeps the same outcome (malformed rows are skipped
            # and reported) using the supported option.
            on_bad_lines='warn')
df.head()

### Tokenizer

In [None]:
def create_tokenizer(texts, vocab_size):
    """Build a Keras ``Tokenizer`` and fit it on ``texts``.

    Args:
        texts: Collection of raw text documents the tokenizer is fitted on.
        vocab_size: Value forwarded to the tokenizer as ``num_words``.

    Returns:
        The fitted ``tf.keras.preprocessing.text.Tokenizer``.
    """
    # Characters passed as the tokenizer's ``filters`` argument.
    filtered_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    options = dict(
        num_words=vocab_size,
        filters=filtered_chars,
        lower=True,
        split=' ',
        oov_token='<oov>',
        document_count=0,
    )
    fitted = tf.keras.preprocessing.text.Tokenizer(**options)
    fitted.fit_on_texts(texts)
    return fitted

In [None]:
# Fit the tokenizer on the review texts of the training frame.
tokenizer = create_tokenizer(texts=df.text, vocab_size=vocab_size)

In [None]:
# Sanity check: the configured vocabulary cap, the index assigned to the word
# 'something', and the word stored at index 139.
(
    tokenizer.num_words,
    tokenizer.word_index['something'],
    tokenizer.index_word[139])

In [None]:
# Display the tokenizer's word -> index mapping as a table (one row per word).
pd.DataFrame.from_dict(tokenizer.word_index, orient='index')

### Input and Target

In [None]:
def extract_input_and_target(df, tokenizer):
    """Split a frame into tokenised inputs and their sentiment labels.

    Args:
        df: Object exposing ``text`` and ``sentiment`` attributes
            (the training DataFrame in this notebook).
        tokenizer: Object providing ``texts_to_sequences``.

    Returns:
        A ``(sequences, labels)`` tuple: the result of
        ``tokenizer.texts_to_sequences(df.text)`` and ``df.sentiment``
        materialised as a list.
    """
    sequences = tokenizer.texts_to_sequences(df.text)
    labels = list(df.sentiment)
    return sequences, labels

In [None]:
# Tokenise the reviews and pull out the sentiment labels.
x_raw, y_raw = extract_input_and_target(df=df, tokenizer=tokenizer)

In [None]:
# Report the number of samples. For x_raw the second figure is only the
# length of the first sequence (len(x_raw[0])), not a true second dimension.
print(f'x_raw.shape: {(len(x_raw), len(x_raw[0]))}')
print(f'y_raw.shape: {(len(y_raw),)}')

### Input Padding

In [None]:
def pad_input(x, seq_len):
    """Pad the token sequences in ``x`` to a common length.

    Args:
        x: Sequences to pad (lists of token ids).
        seq_len: Value forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences``: ``int32`` data, padded at the end
        of each sequence with the value 0.
    """
    pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
    padding_options = {
        'maxlen': seq_len,
        'dtype': 'int32',
        'padding': 'post',
        'value': 0,
    }
    return pad_sequences(x, **padding_options)

In [None]:
# Bring every tokenised review to the configured sequence length.
x_padded = pad_input(x=x_raw, seq_len=seq_len)

In [None]:
# Display the shape of the padded input.
x_padded.shape

In [None]:
# Display the padded token ids as a table.
pd.DataFrame(x_padded)

### Target Categorical

In [None]:
def categorise_target(y):
    """Convert the labels to an array of shape ``(len(y), 1, 1)``.

    Args:
        y: Sequence of labels.

    Returns:
        A NumPy array holding the same values, reshaped so that each label
        sits in its own ``1 x 1`` cell.
    """
    labels = np.array(y)
    sample_count = labels.shape[0]
    return np.reshape(labels, (sample_count, 1, 1))

In [None]:
# Reshape the labels into the array layout used for training.
y_categorical = categorise_target(y=y_raw)

In [None]:
# Display the shape of the reshaped targets.
y_categorical.shape

### Splits

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 1% of the samples for validation. The fixed seed makes the split
# identical on every run, so validation figures are comparable between runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_padded, y_categorical, train_size=0.99, random_state=42)

In [None]:
# Display the shapes of the four arrays produced by the split.
[ x_train.shape, y_train.shape, x_valid.shape, y_valid.shape]

## Model

### Creating


In [None]:
def create_model(vocab_size, seq_len, emb_dims, lstm_units):
    """Assemble and compile the Embedding -> LSTM -> Dense classifier.

    Args:
        vocab_size: ``input_dim`` of the embedding layer.
        seq_len: ``input_length`` of the embedding layer.
        emb_dims: ``output_dim`` of the embedding layer.
        lstm_units: Number of units of the LSTM layer.

    Returns:
        The compiled ``tf.keras`` ``Sequential`` model. Its summary is
        printed as a side effect.
    """
    layers = tf.keras.layers
    embedding = layers.Embedding(
        input_dim=vocab_size,
        output_dim=emb_dims,
        input_length=seq_len,
        mask_zero=True,
    )
    recurrent = layers.LSTM(lstm_units, dropout=0.5, recurrent_dropout=0.5)
    # Two softmax outputs, paired with the sparse categorical loss below.
    classifier = layers.Dense(2, activation='softmax')

    model = tf.keras.models.Sequential([embedding, recurrent, classifier])
    model.compile(
        optimizer='rmsprop',
        loss='sparse_categorical_crossentropy',
        metrics=['acc'],
    )
    model.summary()
    return model

In [None]:
# Instantiate the classifier from the hyper-parameters defined in Settings.
model = create_model(
    vocab_size=vocab_size, seq_len=seq_len,
    emb_dims=emb_dims, lstm_units=lstm_units,
)

### Training

In [None]:
def train_model(model, train, valid, batch_size=32, epochs=3):
    """Fit ``model`` on the training pair and return the result of ``fit``.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        train: Pair whose first item is the input and second the target.
        valid: Passed unchanged to ``fit`` as ``validation_data``.
        batch_size: Passed to ``fit`` as ``batch_size``.
        epochs: Passed to ``fit`` as ``epochs``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    features, labels = train[0], train[1]
    fit_options = {
        'batch_size': batch_size,
        'epochs': epochs,
        'validation_data': valid,
        'shuffle': True,
    }
    return model.fit(x=features, y=labels, **fit_options)

In [None]:
# Fit on the training split while monitoring the validation split.
train_result = train_model(
    model, (x_train, y_train), (x_valid, y_valid),
    batch_size=batch_size, epochs=epochs,
)

### Assessing

In [None]:
def plot_metric(history, metric):
    """Plot the training and validation curves of one metric per epoch.

    Args:
        history: Mapping from metric name to a per-epoch sequence of values;
            it must contain both ``metric`` and ``f'val_{metric}'``.
        metric: Name of the metric to plot, e.g. ``'acc'`` or ``'loss'``.

    Raises:
        KeyError: If ``metric`` or its ``val_`` counterpart is missing.
    """
    train = history[metric]
    valid = history[f'val_{metric}']
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(train) + 1)
    plt.plot(epochs, train, 'b', label=f'{metric} (train)')
    plt.plot(epochs, valid, 'g', label=f'{metric} (valid)')
    # Must be an f-string, otherwise the literal "{metric}" is displayed.
    plt.title(f'{metric}: training vs validation')
    plt.xlabel('epochs')
    # The metric value belongs on the y axis; the x axis keeps 'epochs'.
    plt.ylabel(metric)
    plt.legend()
    plt.show()

In [None]:
# Accuracy curves for the training and validation splits.
plot_metric(history=train_result.history, metric='acc')

In [None]:
# Loss curves for the training and validation splits.
plot_metric(history=train_result.history, metric='loss')

In [None]:
# Persist the trained model at the configured output location.
model.save(filepath=output_model_path)

In [None]:
# Store the fitted tokenizer alongside the saved model. The assets folder is
# created explicitly rather than relying on model.save() having produced it.
os.makedirs(f'{output_model_path}/assets', exist_ok=True)
# Plain write mode is enough: the file is only written, never read back here.
with open(f'{output_model_path}/assets/tokenizer.json', 'w', encoding='utf-8') as file:
    file.write(tokenizer.to_json())

## Conclusions

**LSTM**
```
seq_len    = 500
vocab_size = 10000
batch_size = 50
epochs     = 5
emb_dims   = 64
lstm_units = 128

Epoch 1/5 - loss: 0.4595 - acc: 0.7863 - val_loss: 0.2429 - val_acc: 0.9120
Epoch 2/5 - loss: 0.3095 - acc: 0.8804 - val_loss: 0.2876 - val_acc: 0.8920
Epoch 3/5 - loss: 0.2568 - acc: 0.9016 - val_loss: 0.4429 - val_acc: 0.8360
Epoch 4/5 - loss: 0.2358 - acc: 0.9108 - val_loss: 0.2617 - val_acc: 0.9120
Epoch 5/5 - loss: 0.2121 - acc: 0.9207 - val_loss: 0.2542 - val_acc: 0.9120
```

**BiLSTM**
```
seq_len    = 500
vocab_size = 10000
batch_size = 50
epochs     = 5
emb_dims   = 128
lstm_units = 128

Epoch 1/5 - loss: 0.4488 - acc: 0.7933 - val_loss: 0.2628 - val_acc: 0.9040
Epoch 2/5 - loss: 0.2688 - acc: 0.8933 - val_loss: 0.3313 - val_acc: 0.8960
Epoch 3/5 - loss: 0.2201 - acc: 0.9145 - val_loss: 0.2992 - val_acc: 0.8920
Epoch 4/5 - loss: 0.1888 - acc: 0.9289 - val_loss: 0.2390 - val_acc: 0.8680
Epoch 5/5 - loss: 0.1633 - acc: 0.9374 - val_loss: 0.2607 - val_acc: 0.8640
```