## Librerías

In [1]:
import random
import gzip
import json
import mlflow
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import IterableDataset, DataLoader
from tqdm.notebook import tqdm, trange

## Parámetros

In [2]:
_train_data = "./data/meli-challenge-2019/spanish.train.jsonl.gz"
_validation_data = "./data/meli-challenge-2019/spanish.validation.jsonl.gz"
_test_data = "./data/meli-challenge-2019/spanish.test.jsonl.gz"
_token_to_index = "./data/meli-challenge-2019/spanish_token_to_index.json.gz"
_pretrained_embeddings = "./data/SBW-vectors-300-min5.txt.gz"
_language = "spanish"
_embeddings_size = 300
_hidden_layers = [256, 128]
_dropout = 0.3
_epochs = 3

_CNN_filters_count = 100
_CNN_filters_length = [2, 3, 4]

## Dataset

In [3]:
class MeliChallengeDataset(IterableDataset):
    """Stream ``{"data", "target"}`` records from a gzipped JSON-lines file.

    The number of labels and the dataset size are taken from the ``n_labels``
    and ``size`` fields of the first record in the file.

    Parameters
    ----------
    dataset_path:
        Path of the gzip-compressed file holding one JSON object per line.
    random_buffer_size:
        Number of records gathered before they are shuffled and emitted.
        A value of 1 disables shuffling and yields records in file order.
    """

    def __init__(self,
                 dataset_path,
                 random_buffer_size=2048):
        assert random_buffer_size > 0
        self.dataset_path = dataset_path
        self.random_buffer_size = random_buffer_size

        # Peek at the first record only to pick up the dataset-level metadata.
        with gzip.open(self.dataset_path, "rt") as handle:
            first_record = json.loads(next(handle).strip())
            self.n_labels = first_record["n_labels"]
            self.dataset_size = first_record["size"]

    def __len__(self):
        """Return the size announced by the first record of the file."""
        return self.dataset_size

    def __iter__(self):
        """Yield every record, shuffled in chunks of ``random_buffer_size``."""
        try:
            with gzip.open(self.dataset_path, "rt") as handle:
                pending = []

                for raw_line in handle:
                    record = json.loads(raw_line.strip())
                    # Keep only the two fields consumed downstream.
                    sample = {
                        "data": record["data"],
                        "target": record["target"]
                    }

                    if self.random_buffer_size == 1:
                        # No shuffling requested: emit straight away.
                        yield sample
                        continue

                    pending.append(sample)
                    if len(pending) == self.random_buffer_size:
                        random.shuffle(pending)
                        yield from pending
                        pending = []

                # Flush whatever did not fill a complete buffer.
                if pending:
                    random.shuffle(pending)
                    yield from pending
        except GeneratorExit:
            # The consumer stopped early; end the generator quietly.
            return

## Collation function

In [4]:
class PadSequences:
    """Collate callable that pads (and optionally truncates) token sequences.

    Parameters
    ----------
    pad_value:
        Value appended to sequences shorter than the batch width.
    max_length:
        When truthy, every sequence is cut/padded to exactly this length.
        Otherwise the batch width is the longest sequence in the batch.
    min_length:
        Lower bound for the batch width when ``max_length`` is not set.
    """

    def __init__(self, pad_value=0, max_length=None, min_length=1):
        assert max_length is None or min_length <= max_length
        self.pad_value = pad_value
        self.max_length = max_length
        self.min_length = min_length

    def __call__(self, items):
        """Turn a list of ``{"data", "target"}`` dicts into two LongTensors."""
        sequences, labels = zip(
            *((item["data"], item["target"]) for item in items)
        )
        kept = [len(seq) for seq in sequences]

        if self.max_length:
            # Fixed width: longer sequences are truncated.
            width = self.max_length
            kept = [min(width, length) for length in kept]
        else:
            # Dynamic width: longest sequence, but never below min_length.
            width = max(self.min_length, max(kept))

        padded = []
        for seq, length in zip(sequences, kept):
            filler = [self.pad_value] * (width - length)
            padded.append(seq[:length] + filler)

        return {
            "data": torch.LongTensor(padded),
            "target": torch.LongTensor(labels)
        }

## Lectura de datos

Una cuestión importante: las redes convolucionales sobre texto esperan que todas las secuencias tengan, como mínimo, el tamaño del filtro de convolución más grande (en caso contrario ocurrirá un error, ya que no se puede aplicar la convolución sobre una secuencia más corta que el filtro). Es por eso que esta vez utilizamos el parámetro `min_length`.

In [5]:
# Collate function shared by all loaders: pads with index 0 and makes every
# batch at least as long as the widest convolution filter.
pad_sequences = PadSequences(
    min_length=max(_CNN_filters_length),
    max_length=None,
    pad_value=0,
)

In [6]:
# Training split: the dataset shuffles records itself, 2048 at a time.
train_dataset = MeliChallengeDataset(
    _train_data,
    random_buffer_size=2048,  # This can be a hyperparameter
)

# Ordering is handled by the dataset's shuffle buffer, so the loader itself
# does not shuffle.
train_loader = DataLoader(
    dataset=train_dataset,
    collate_fn=pad_sequences,
    batch_size=128,  # This can be a hyperparameter
    shuffle=False,
    drop_last=False,
)

In [7]:
# Validation split: a buffer of size 1 makes the dataset yield records in
# file order (no shuffling).
validation_dataset = MeliChallengeDataset(_validation_data, random_buffer_size=1)
validation_loader = DataLoader(
    dataset=validation_dataset,
    collate_fn=pad_sequences,
    batch_size=128,
    shuffle=False,
    drop_last=False,
)

In [8]:
# Test split: a buffer of size 1 makes the dataset yield records in file
# order (no shuffling).
test_dataset = MeliChallengeDataset(_test_data, random_buffer_size=1)
test_loader = DataLoader(
    dataset=test_dataset,
    collate_fn=pad_sequences,
    batch_size=128,
    shuffle=False,
    drop_last=False,
)


## Modelo de clasificación CNN

In [9]:
class CNNClassifier(nn.Module):
    """Text classifier: embeddings -> parallel Conv1d + global max pool -> MLP.

    Parameters
    ----------
    pretrained_embeddings_path:
        Gzip-compressed text file with one ``<word> <v1> ... <vN>`` entry per
        line. The first line is skipped (assumed to be a header).
    token_to_index:
        Path to a gzip-compressed JSON file mapping token -> row index.
    n_labels:
        Number of output classes.
    vector_size:
        Dimensionality of the embedding vectors.
    freeze_embedings:
        When True the embedding weights are not updated during training.
    filters_count:
        Output channels of every convolution. Defaults to the notebook-level
        ``_CNN_filters_count``.
    filters_length:
        Iterable of kernel sizes, one convolution per entry. Defaults to the
        notebook-level ``_CNN_filters_length``.
    fc_size:
        Width of the hidden fully connected layer.

    ``forward`` returns raw, unnormalized logits of shape
    ``(batch, n_labels)``, which is what ``nn.CrossEntropyLoss`` expects.
    """

    def __init__(self,
                 pretrained_embeddings_path,
                 token_to_index,
                 n_labels,
                 vector_size,
                 freeze_embedings,
                 filters_count=None,
                 filters_length=None,
                 fc_size=128):
        super().__init__()
        # Fall back to the notebook-level configuration when not given.
        if filters_count is None:
            filters_count = _CNN_filters_count
        if filters_length is None:
            filters_length = _CNN_filters_length
        filters_length = list(filters_length)

        with gzip.open(token_to_index, "rt") as fh:
            token_to_index = json.load(fh)

        # Tokens missing from the pretrained file keep this random init;
        # row 0 is zeroed because it is used as the padding index below.
        embeddings_matrix = torch.randn(len(token_to_index), vector_size)
        embeddings_matrix[0] = torch.zeros(vector_size)
        with gzip.open(pretrained_embeddings_path, "rt") as fh:
            next(fh)  # skip the first line of the embeddings file
            for line in fh:
                word, vector = line.strip().split(None, 1)
                if word in token_to_index:
                    embeddings_matrix[token_to_index[word]] =\
                        torch.FloatTensor([float(n) for n in vector.split()])
        self.embeddings = nn.Embedding.from_pretrained(embeddings_matrix,
                                                       freeze=freeze_embedings,
                                                       padding_idx=0)

        # One convolution per kernel size, all reading the embedding channels.
        self.convs = nn.ModuleList([
            nn.Conv1d(vector_size, filters_count, filter_length)
            for filter_length in filters_length
        ])
        self.fc = nn.Linear(filters_count * len(filters_length), fc_size)
        self.output = nn.Linear(fc_size, n_labels)
        self.vector_size = vector_size

    @staticmethod
    def conv_global_max_pool(x, conv):
        """Apply ``conv`` to ``x``, max-pool over the sequence axis, then ReLU."""
        # conv(x) is (batch, channels, positions); reduce over positions.
        return F.relu(conv(x).max(dim=2)[0])

    def forward(self, x):
        """Map a batch of token indices to per-class logits."""
        x = self.embeddings(x).transpose(1, 2)  # Conv1d takes (batch, channel, seq_len)
        x = [self.conv_global_max_pool(x, conv) for conv in self.convs]
        x = torch.cat(x, dim=1)
        x = F.relu(self.fc(x))
        # Return raw logits: CrossEntropyLoss applies log-softmax itself, so
        # squashing the scores through a sigmoid first would distort the loss.
        return self.output(x)
    
    

In [10]:
# Instantiate the classifier with frozen pretrained embeddings; the number of
# output classes comes from the training split's metadata.
model = CNNClassifier(
    n_labels=train_dataset.n_labels,
    vector_size=_embeddings_size,
    token_to_index=_token_to_index,
    pretrained_embeddings_path=_pretrained_embeddings,
    freeze_embedings=True,
)

In [11]:
# Show the layer structure of the classifier built in the previous cell.
print(model)

CNNClassifier(
  (embeddings): Embedding(50002, 300, padding_idx=0)
  (convs): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(2,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
  )
  (fc): Linear(in_features=300, out_features=128, bias=True)
  (output): Linear(in_features=128, out_features=632, bias=True)
)


## Experimento de MLFlow

In [12]:
# Report whether PyTorch can see a CUDA device; the training cell uses the
# same check to choose between "cuda" and "cpu".
torch.cuda.is_available()

True

In [13]:
def _evaluate(model, data_loader, loss_fn, device):
    """Run ``model`` over ``data_loader`` in eval mode without gradients.

    Returns a ``(batch_losses, targets, predictions)`` tuple: the per-batch
    loss values, the gold labels and the argmax predictions. All three are
    empty lists when the loader yields no batches.
    """
    model.eval()
    batch_losses = []
    targets = []
    predictions = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            data = batch["data"].to(device)
            target = batch["target"].to(device)
            output = model(data)
            batch_losses.append(loss_fn(output, target).item())
            targets.extend(batch["target"].numpy())
            predictions.extend(output.argmax(dim=1).cpu().numpy())
    return batch_losses, targets, predictions


mlflow.set_experiment("Práctico CNN")

with mlflow.start_run():
    # Log all relevant hyperparameters
    # NOTE(review): "dropout" is logged as configured, but the CNNClassifier
    # defined in this notebook has no dropout layer - confirm it is intended.
    mlflow.log_params({
        "model_type": "Convolutional Network",
        "embeddings": _pretrained_embeddings,
        "dropout": _dropout,
        "embeddings_size": _embeddings_size,
        "freeze_embedding": True,
        "epochs": _epochs,
        "filters_count": _CNN_filters_count,
        "filters_length": _CNN_filters_length,
        "fc_size": 128
    })
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    model = model.to(device)
    loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        model.parameters(),
        lr=1e-3,  # This can be a hyperparameter
        weight_decay=1e-5  # This can be a hyperparameter
    )

    for epoch in trange(_epochs):
        model.train()
        running_loss = []
        for batch in tqdm(train_loader):
            optimizer.zero_grad()
            data = batch["data"].to(device)
            target = batch["target"].to(device)
            output = model(data)
            loss_value = loss(output, target)
            loss_value.backward()
            optimizer.step()
            running_loss.append(loss_value.item())
        if running_loss:  # avoid dividing by zero on an empty loader
            mlflow.log_metric("train_loss", sum(running_loss) / len(running_loss), step=epoch)

        if validation_dataset is not None:
            running_loss, targets, predictions = _evaluate(model, validation_loader, loss, device)
            if running_loss:
                mlflow.log_metric("validation_loss", sum(running_loss) / len(running_loss), step=epoch)
                mlflow.log_metric("validation_bacc", balanced_accuracy_score(targets, predictions), step=epoch)

    if test_dataset is not None:
        # Test metrics are recorded at the step of the last trained epoch; this
        # is computed from _epochs so it is defined even if the loop never ran.
        final_step = max(_epochs - 1, 0)
        running_loss, targets, predictions = _evaluate(model, test_loader, loss, device)
        if running_loss:
            mlflow.log_metric("test_loss", sum(running_loss) / len(running_loss), step=final_step)
            mlflow.log_metric("test_bacc", balanced_accuracy_score(targets, predictions), step=final_step)


INFO: 'Práctico CNN' does not exist. Creating a new experiment


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38245.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9562.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38245.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9562.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38245.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9562.0), HTML(value='')))





HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=498.0), HTML(value='')))


