# NER Model
---

In [103]:
import trax
from trax import layers as tl


In [104]:
import os
import numpy as np
import pandas as pd
import random as rnd


## Data load

In [105]:
data = pd.read_csv("../data/features.csv")
data.head()


Unnamed: 0,title,color,size
0,Zapatillas La Sportiva Ultra Raptor Hombre Neg...,Negro/Verde,46
1,Calcetines running Compressport Run High V3 Bl...,Blanco/Rosa,45 - 48
2,ZAPATILLAS CHIRUCA TASMANIA 10 GORE-TEX GRIS 4...,Gris,41
3,ZAPATILLAS CHIRUCA TASMANIA 10 GORE-TEX GRIS 4...,Gris,40
4,Botas Boreal APACHE GRIS 47 Gris,Gris,47


## Create corpus

In [106]:
from utils.create_corpus import parse_corpus


In [107]:
corpus = parse_corpus(data)
print(corpus[0])


{'Zapatillas': 'O', 'La': 'O', 'Sportiva': 'O', 'Ultra': 'O', 'Raptor': 'O', 'Hombre': 'O', 'Negro': 'B-color', 'Verde': 'B-color', '46': 'B-size', 'Negro/Verde': 'O'}


## Get model inputs and labels

In [108]:
def flatten(*lists, unique=False):
    flattened = [item for l in lists for sublist in l for item in sublist]
    return list(set(flattened)) if unique else flattened


def get_len(*lists) -> int:
    return sum(len(row) for l in lists for row in l)


In [109]:
def get_inputs_and_labels(corpus):
    X, Y = [], []

    for line in corpus:
        x = list(line.keys())
        y = list(line.values())
        X.append(x)
        Y.append(y)

    def get_vector(x, surplus=0):
        flattened = flatten(x, unique=True)
        idx = {word: i + surplus for i, word in enumerate(flattened)}
        vector = [[idx[word] for word in row] for row in x]
        return vector, len(flattened)

    vector_X, num_x = get_vector(X, surplus=1)
    vector_Y, num_y = get_vector(Y)

    return vector_X, vector_Y, num_x, num_y


In [110]:
X, Y, _, num_tags = get_inputs_and_labels(corpus)
print(num_tags)

5


In [111]:
print(X[0], Y[0])


[1226, 916, 447, 954, 85, 1275, 121, 1268, 24, 133] [0, 0, 0, 0, 0, 0, 2, 2, 4, 0]


### Calculate vocab size

In [112]:
vocab_size = get_len(X)
print(vocab_size)


17411


## Split data

In [113]:
from sklearn.model_selection import train_test_split


In [124]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=1)
print(len(X_train), len(X_val))


1487 372


## Data batch

In [115]:
def batch_generator(X, Y, batch_size=4, pad=None, shuffle=True):
    """Generate batches of data.
    Args:
        X (list): List of sentences.
        Y (list): List of labels.
        pad_value: Value use for padding. Will be removed on trainning. Necessary for regular shaped batches.
        batch_size (int, optional): Size of the batch. Defaults to 4.
        shuffle (bool, optional): Shuffle data. Defaults to True.
    Yields:
        tuple: Batch of data.
    """
    if shuffle:
        data = list(zip(X, Y))
        rnd.shuffle(data)
        X, Y = zip(*data)

    max_input_size = max(len(x) for x in X)

    def set(batch_idx, global_idx, batch, data):
        item = data[global_idx]
        batch[i, : len(item)] = item

    data_idx = 0
    while data_idx < len(X):
        x = np.full((batch_size, max_input_size), fill_value=pad)
        y = x.copy()

        for i in range(batch_size):
            set(i, data_idx, x, X)
            set(i, data_idx, y, Y)

            data_idx += 1
            if data_idx >= len(X):
                break

        yield x, y


## Create model

In [116]:
from trax.models import reformer


In [117]:
def NERModel(tags, vocab_size, d_model=50):
    model = tl.Serial(
        reformer.Reformer(vocab_size, d_model, ff_activation=tl.LogSoftmax),
        tl.Dense(tags),
        tl.LogSoftmax(),
    )
    return model


In [118]:
model = NERModel(tags=num_tags, vocab_size=vocab_size)
# print(model)


## Train model

In [119]:
from trax.supervised import training
from trax.data.inputs import add_loss_weights
from trax import optimizers as opts


In [120]:
train_batch_gen = batch_generator(X_train, Y_train, pad=vocab_size)
val_batch_gen = batch_generator(X_val, Y_val, pad=vocab_size)


In [121]:
train_gen = add_loss_weights(train_batch_gen, id_to_mask=vocab_size)
val_gen = add_loss_weights(val_batch_gen, id_to_mask=vocab_size)


In [122]:
def train_model(
    model, train_generator, val_generator, train_steps=1, output_dir="model"
):
    train_task = training.TrainTask(
        train_generator,
        loss_layer=tl.CrossEntropyLoss(),
        optimizer=opts.Adam(0.01),
        n_steps_per_checkpoint=10,
    )

    eval_task = training.EvalTask(
        labeled_data=val_generator,
        metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
        n_eval_batches=10,
    )

    training_loop = training.Loop(
        model,
        train_task,
        eval_tasks=[eval_task],
        output_dir=output_dir,
    )

    training_loop.run(n_steps=train_steps)
    return training_loop


In [123]:
epochs = 1
output_dir = "../models"

training_loop = train_model(model, train_gen, val_gen, epochs, output_dir)


4


TypeError: object of type 'int' has no len()
  In call to configurable 'EvalTask' (<class 'trax.supervised.training.EvalTask'>)