In [2]:
# Training notebook for easy vs. standard German classifier 
# Authors: Hadi Asghari & Freya Hewett
# Version: 2023.02

# Note: for copyright reasons we can only release a subset of our dataset; if you re-run this
# notebook, your results may therefore differ.

import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from keras.callbacks import ModelCheckpoint

In [2]:
# Limit GPU memory use by TF: allocate GPU memory on demand instead of
# reserving all of it up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    print(gpus)
    try:
        # Memory growth has to be configured identically on every visible GPU,
        # so enable it for all devices rather than only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialized (e.g. this cell is
        # re-run in a live kernel); report it instead of aborting the notebook.
        print(err)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [12]:
# LOAD DATASETS (tab-separated files; the printed samples show `source`, `text`, `label` columns)
train_pds = pd.read_csv("./data/dataset-train.csv", sep="\t")
print("train:", len(train_pds), "\n", train_pds.head(1))
# NOTE(review): "datset" (sic) presumably matches the file name on disk -- verify before renaming.
eval_pds = pd.read_csv("./data/datset-test.csv", sep="\t")
print("validation:" , len(eval_pds), "\n", eval_pds.head(1))

# TURN DATA INTO TF DATASETS
minibatchsize = 32
train_x = train_pds.text  # raw training texts; the keras tokenizer is adapted on these later

# Pair every text with its label, then group the pairs into minibatches
# (batching is what makes the datasets iterable by the later cells).
train_ds = tf.data.Dataset.from_tensor_slices(
    (train_x, train_pds.label)
).batch(minibatchsize)
eval_ds = tf.data.Dataset.from_tensor_slices(
    (eval_pds.text, eval_pds.label)
).batch(minibatchsize)

train: 10164 
   source                                               text  label
0  klexi  Das Musical [ˈmju:zikəl] ist eine in der Regel...      0
validation: 2530 
   source                                               text  label
0  klexi  Der Leopard (Panthera pardus) ist eine Art aus...      0


In [None]:
# Test of some basic metrics to make sure our training set is in order.
print("*** balance:")
print(train_pds.groupby(['source', 'label']).count())

print("\n*** check training sample in validation: ")
# Hash the training texts once so that each validation sample is a constant-time
# lookup instead of a full scan of the training frame.
train_texts = set(train_pds.text)
for i, s in enumerate(eval_pds.text):
    if s in train_texts:
        print('repetitive val. sample:', i)
        print(s, "\n")

print("\n*** basic complexity:")
# nltk is only needed for this optional diagnostic cell, so it is imported here.
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer, sent_tokenize
tkz = RegexpTokenizer(r'\w+')
# words: number of \w+ tokens per text
train_pds["words"] = train_pds['text'].apply(lambda x: len(tkz.tokenize(x)))
# wordlen: characters of the raw text (spaces and punctuation included) per word
train_pds["wordlen"] = train_pds['text'].str.len() / train_pds["words"]
# sentlen: words per sentence. The corpus is German, so use the German Punkt
# model; sent_tokenize would otherwise fall back to its English default.
train_pds["sentlen"] = train_pds["words"] / train_pds['text'].apply(
    lambda x: len(sent_tokenize(x, language='german')))

# Average only the numeric complexity columns: the frame still contains the
# string column `text`, which makes a plain .mean() raise on pandas >= 2.0.
train_pds.groupby(['source', 'label'])[["words", "wordlen", "sentlen"]].mean()

In [5]:
# preprocessing / tokenizer for BoW model
# The vocabulary is capped at `max_tokens` entries and every text is encoded
# as a multi-hot vector over that vocabulary.
max_tokens = 20000
text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="multi_hot")
text_vectorization.adapt(train_x)  # vocabulary is learned from the training texts only


def _vectorize_batch(texts, labels):
    """Replace a batch of raw texts by its multi-hot encoding; labels pass through unchanged."""
    return text_vectorization(texts), labels


train_v = train_ds.map(_vectorize_batch, num_parallel_calls=4)
eval_v = eval_ds.map(_vectorize_batch, num_parallel_calls=4)

2022-06-14 12:04:50.678505: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-06-14 12:04:50.730990: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


In [6]:
# Inspect the learned vocabulary: its size, then the first and the last ten entries.
vocab = text_vectorization.get_vocabulary()
vocab_size = len(vocab)
print(vocab_size)  # we have 20k words
for vocab_slice in (vocab[:10], vocab[-10:]):  # words are in order of commonality
    print(vocab_slice)

20000
['[UNK]', 'die', 'der', 'in', 'und', 'das', 'den', 'ist', 'im', 'von']
['unangenehm', 'umzugehen', 'umweltminister', 'umweltaktivistin', 'umweg', 'umrunden', 'umkreis', 'umgeknickte', 'umgegangen', 'umgebenden']


In [16]:
# 2-layers fully connected architecture for BoW

def get_model(max_tokens=max_tokens, hidden_dim=16, dropout_rate=0.5):
    """Build and compile the bag-of-words binary classifier.

    Architecture: input -> Dense(hidden_dim, relu) -> Dropout -> Dense(1, sigmoid).

    Args:
        max_tokens: Width of the input vector, i.e. one feature per vocabulary
            entry. Defaults to the module-level `max_tokens` at definition time.
        hidden_dim: Number of units in the hidden dense layer.
        dropout_rate: Fraction of hidden activations dropped during training.
            The default of 0.5 is the value that was previously hard-coded.

    Returns:
        A compiled `keras.Model` (rmsprop optimizer, binary cross-entropy loss,
        accuracy metric) that outputs one sigmoid score per input row.
    """
    inputs = keras.Input(shape=(max_tokens,))
    x = layers.Dense(hidden_dim, activation="relu")(inputs)
    x = layers.Dropout(dropout_rate)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(optimizer="rmsprop",
                  loss="binary_crossentropy",
                  metrics=["accuracy"])
    return model

# Instantiate the classifier with its default settings and show the layer overview.
model = get_model()
model.summary()

# Train model
# The checkpoint callback writes the model to 'modelcheckpoint' and, with
# save_best_only=True, only overwrites it when the monitored quantity improves.
checkpoint_cb = ModelCheckpoint('modelcheckpoint', save_best_only=True)
model.fit(train_v, validation_data=eval_v, epochs=10, callbacks=[checkpoint_cb])


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320,033
Trainable params: 320,033
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
  9/321 [..............................] - ETA: 2s - loss: 0.3929 - accuracy: 0.7604  

2022-06-14 13:26:45.584220: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-06-14 13:26:47.989713: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2821861f0>

In [8]:
# Test model: reload the checkpoint written during training and score it on the
# vectorized evaluation set. Note accuracy is .99
model = keras.models.load_model('modelcheckpoint')
eval_metrics = model.evaluate(eval_v)  # [loss, accuracy], as configured in compile()
print(eval_metrics)



2022-06-14 13:01:33.160126: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


[0.027651270851492882, 0.9914330840110779]


In [7]:
# SAVE MODEL
model.save("mbow-alldata")

# Persist the text vectorizer next to the model: its config and its weights are
# stored together in one pickle file.
vectorizer_state = {'config': text_vectorization.get_config(),
                    'weights': text_vectorization.get_weights()}
# A context manager guarantees the file is flushed and closed, even on error.
with open("textvectorizer.pickle", "wb") as f:
    pickle.dump(vectorizer_state, f)

2022-05-10 11:10:32.054712: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: model-nnbow-bigger/assets


**Additional notes**

- We experimented with more complex models as well, but the accuracy of the model in this notebook was sufficient (.99 on test set)
- The model was tested on LS pages from states: 286 (78%) out of 365 pages received a prediction of >0.7 