# NER Model
---

## Data load

In [1]:
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv("../data/features.csv")
data.head()


Unnamed: 0,title,brand,color,size
0,Becotray 51 X 40 X 17 Cm Azul Oferta,BECO,azul,17;17 cm;51 x 40 x 17 cm;40;51
1,Becotray 51 X 40 X 17 Cm Marrón Oferta,BECO,marrón,17;17 cm;51 x 40 x 17 cm;40;51
2,"Heno Home Friends Menta/Escaramujo 0,5Kg Oferta",COMINTER,menta,"0,5kg;0,"
3,Seresto Collar Antiparasitario Perro - 8Kg / 38Cm,BAYER,,38cm;8kg
4,Seresto Collar Antiparasitario Perro + 8 Kg / ...,BAYER,,8 kg;70cm


## Create corpus

In [3]:
from utils.create_corpus import parse_corpus


In [4]:
corpus = parse_corpus(data)
print(corpus[2])


{'Heno': 'O', 'Home': 'O', 'Friends': 'O', 'Menta': 'B-color', '/': 'O', 'Escaramujo': 'O', '0,5Kg': 'O', 'Oferta': 'O'}


## Get model inputs and labels

In [5]:
from utils.inputs import get_vectors, flatten, get_inputs_and_labels


In [6]:
a, b = get_inputs_and_labels(corpus)

In [7]:
import json

In [8]:
with open("../data/inputs.jsonl", "w") as f:
    for _a, _b in zip(a, b):
        f.write(json.dumps({"tokens": _a, "tags": _b}) + "\n")


In [10]:
X, Y, _, num_tags = get_vectors(corpus)


In [11]:
print(X[0], Y[0])
print(num_tags)

[786, 7, 1712, 799, 1295, 365, 222, 1952] [4, 4, 4, 4, 4, 4, 5, 4]
6


### Calculate vocab size and save

In [1]:
vocab = flatten(corpus)
vocab_size = len(vocab)
print(vocab_size)


NameError: name 'flatten' is not defined

In [None]:
np.savetxt("../data/vocab.txt", vocab, fmt="'%s'")


UnicodeEncodeError: 'charmap' codec can't encode character '\u0445' in position 4: character maps to <undefined>

## Split data

In [None]:
from sklearn.model_selection import train_test_split


KeyboardInterrupt: 

In [None]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=1)
print(len(X_train), len(X_val))


1487 372


## Create model

In [None]:
from utils.model import NERModel


2022-09-22 13:54:40.019222: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-22 13:54:47.574423: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-22 13:54:47.574879: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model = NERModel(tags=num_tags, vocab_size=vocab_size)
# print(model)


## Data batch

In [None]:
from utils.batch import batch_generator
from trax.data.inputs import add_loss_weights


In [None]:
mask_value = -1

In [None]:
train_batch_gen = batch_generator(X_train, Y_train, pad=mask_value)
val_batch_gen = batch_generator(X_val, Y_val, pad=mask_value)


In [None]:
train_gen = add_loss_weights(train_batch_gen, id_to_mask=mask_value)
val_gen = add_loss_weights(val_batch_gen, id_to_mask=mask_value)


## Train model

In [None]:
from trax.supervised import training
from trax import optimizers as opts
from trax import layers as tl


In [None]:
def train_model(
    model, train_generator, val_generator, train_steps=1, output_dir="model"
):
    train_task = training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=opts.Adam(0.01),
        n_steps_per_checkpoint=10,
    )

    eval_task = training.EvalTask(
        labeled_data=val_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=10,
    )

    training_loop = training.Loop(
        model,
        tasks=[train_task],
        eval_tasks=[eval_task],
        output_dir=output_dir,
    )

    training_loop.run(n_steps=train_steps)
    return training_loop


In [None]:
epochs = 1000
output_dir = "../models"

training_loop = train_model(model, train_gen, val_gen, epochs, output_dir)


2022-09-22 13:58:58.578240: E external/org_tensorflow/tensorflow/compiler/xla/service/slow_operation_alarm.cc:65] 
********************************
[Compiling module jit_single_device_update_fn.98] Very slow compile?  If you want to file a bug, run with envvar XLA_FLAGS=--xla_dump_to=/tmp/foo and attach the results.
********************************
2022-09-22 14:12:37.804781: E external/org_tensorflow/tensorflow/compiler/xla/service/slow_operation_alarm.cc:133] The operation took 15m39.226732796s

********************************
[Compiling module jit_single_device_update_fn.98] Very slow compile?  If you want to file a bug, run with envvar XLA_FLAGS=--xla_dump_to=/tmp/foo and attach the results.
********************************
  with gzip.GzipFile(fileobj=f, compresslevel=compresslevel) as gzipf:



Step      1: Total number of trainable weights: 55166769
Step      1: Ran 1 train steps in 981.87 secs
Step      1: train CrossEntropyLoss |  3.00129676


  with gzip_lib.GzipFile(fileobj=f, compresslevel=2) as gzipf:


Step      1: eval  CrossEntropyLoss |  3.19672992
Step      1: eval          Accuracy |  0.71199360

Step     10: Ran 9 train steps in 21.69 secs
Step     10: train CrossEntropyLoss |  5.36177540
Step     10: eval  CrossEntropyLoss |  4.82323470
Step     10: eval          Accuracy |  0.70426232

Step     20: Ran 10 train steps in 22.28 secs
Step     20: train CrossEntropyLoss |  3.33057261
Step     20: eval  CrossEntropyLoss |  4.04009326
Step     20: eval          Accuracy |  0.71858896

Step     30: Ran 10 train steps in 22.64 secs
Step     30: train CrossEntropyLoss |  2.95643616
Step     30: eval  CrossEntropyLoss |  1.96336987
Step     30: eval          Accuracy |  0.12207776

Step     40: Ran 10 train steps in 22.80 secs
Step     40: train CrossEntropyLoss |  1.40213001
Step     40: eval  CrossEntropyLoss |  0.59534968
Step     40: eval          Accuracy |  0.69862046

Step     50: Ran 10 train steps in 22.48 secs
Step     50: train CrossEntropyLoss |  0.74997789
Step     50: eva

  return [f for f in flat if f is not None and f is not ()]  # pylint: disable=literal-comparison
  return [f for f in flat if f is not None and f is not ()]  # pylint: disable=literal-comparison
  return [f for f in flat if f is not None and f is not ()]  # pylint: disable=literal-comparison
  return [f for f in flat if f is not None and f is not ()]  # pylint: disable=literal-comparison
  return [f for f in flat if f is not None and f is not ()]  # pylint: disable=literal-comparison
  return [f for f in flat if f is not None and f is not ()]  # pylint: disable=literal-comparison


StopIteration: 