# Fase 1: Importar las dependencias.

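Artículo original: Kalchbrenner, Grefenstette y Blunsom (2014), «A Convolutional Neural Network for Modelling Sentences», disponible en https://arxiv.org/pdf/1404.2188.pdf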

In [None]:
import numpy as np
import math
import random
import re
import pandas as pd
from bs4 import BeautifulSoup

from google.colab import drive

In [None]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow.keras import layers
import tensorflow_datasets as tfds

# Fase 2: Preprocesado de Datos

## Carga de Ficheros

In [None]:
# Mount Google Drive at /content/drive so the dataset and checkpoint paths
# used in later cells resolve. Colab asks for an authorization code when
# this cell runs.
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Column names for the CSV, which has no header row of its own.
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Load the training file from the mounted Drive folder, decoding it as
# latin1 and using the pure-Python parser engine.
data = pd.read_csv("/content/drive/My Drive/Curso de NLP/CNN/data/train.csv",
                   names=cols,
                   header=None,
                   encoding="latin1",
                   engine="python")

## Preprocesado

### Limpieza

In [None]:
# Keep only the columns used downstream (`sentiment` and `text`).
# The frame is rebound instead of mutated in place, and errors="ignore" makes
# the cell safe to re-run: a second execution is a no-op instead of a KeyError
# on columns that have already been removed.
data = data.drop(columns=["id", "date", "query", "user"], errors="ignore")

In [None]:
def clean_tweet(tweet):
    """Normalize a raw tweet into plain text of letters and basic punctuation.

    Steps, in order: strip HTML markup and decode entities, remove @mentions,
    remove http(s) URLs, replace every character other than ASCII letters and
    ``. ! ? '`` with a space, collapse runs of spaces, and trim both ends.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text, or an empty string when nothing survives cleaning
        (including when the input itself is empty).
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Remove @mentions. The underscore is part of the character class so a
    # handle such as "@user_name" is removed whole instead of leaving "name".
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Remove URL links.
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    # Keep only letters and a few punctuation marks; everything else
    # (digits, other symbols, tabs, newlines) becomes a space.
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Collapse repeated spaces.
    tweet = re.sub(r" +", ' ', tweet)
    # After the collapse there is at most one space at each end. strip() also
    # handles the empty string, which indexing tweet[0] would crash on.
    return tweet.strip(' ')

In [None]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

# Relabel the value 4 as 1 and leave every other label untouched.
# np.where builds a new array, so the DataFrame column is not modified through
# the `.values` view; this also works when pandas hands back a read-only array.
data_labels = np.where(data.sentiment.values == 4, 1, data.sentiment.values)

### Tokenización

In [None]:
# The subword encoder is exposed as `tfds.deprecated.text` in newer
# TensorFlow Datasets releases and as `tfds.features.text` in older ones.
# Try the new location first and fall back, so the cell runs on both.
try:
    SubwordTextEncoder = tfds.deprecated.text.SubwordTextEncoder
except AttributeError:
    SubwordTextEncoder = tfds.features.text.SubwordTextEncoder

# Learn a subword vocabulary (target size 2**16) from the cleaned tweets.
tokenizer = SubwordTextEncoder.build_from_corpus(
    data_clean, target_vocab_size=2**16
)

# Encode every cleaned tweet as a list of subword ids.
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [None]:
# Attach the label and the token count to every encoded tweet.
data_with_len = [[encoded, data_labels[pos], len(encoded)]
                 for pos, encoded in enumerate(data_inputs)]

# Shuffle first, then sort by length. Python's sort is stable, so tweets of
# equal length keep the random order produced by the shuffle.
random.shuffle(data_with_len)
data_with_len.sort(key=lambda entry: entry[2])

# Keep (tokens, label) pairs for tweets with more than two tokens.
sorted_all = [(encoded, label)
              for encoded, label, length in data_with_len
              if length > 2]

In [None]:
# Number of examples left after discarding encodings of two tokens or fewer.
len(sorted_all)

1566799

In [None]:
# Expose the in-memory list of (token ids, label) pairs as a tf.data dataset;
# both components are declared as int32.
all_dataset = tf.data.Dataset.from_generator(lambda: sorted_all,
                                             output_types=(tf.int32, tf.int32))

In [None]:
BATCH_SIZE = 32
# Batch the examples, padding the token-id component ([None]) within each
# batch; the label component is a scalar (()) and needs no padding.
# The original line was missing the closing parenthesis of the call.
all_batched = all_dataset.padded_batch(BATCH_SIZE, padded_shapes=([None], ()))

In [None]:
NB_BATCHES = len(sorted_all) // BATCH_SIZE
NB_BATCHES_TEST = NB_BATCHES // 10
SHUFFLE_SEED = 42

# Dataset transformations return a new dataset, so the shuffled result has to
# be kept; calling shuffle() and discarding the result leaves the batches in
# length-sorted order, and the test split is then just the shortest tweets.
# A fixed seed with reshuffle_each_iteration=False keeps the batch order
# identical on every pass, so take() and skip() always select disjoint
# batches and no test batch leaks into training on later epochs.
all_shuffled = all_batched.shuffle(NB_BATCHES,
                                   seed=SHUFFLE_SEED,
                                   reshuffle_each_iteration=False)
test_dataset = all_shuffled.take(NB_BATCHES_TEST)
train_dataset = all_shuffled.skip(NB_BATCHES_TEST)

In [None]:
# Show the concrete dataset class produced by the split.
type(train_dataset)

tensorflow.python.data.ops.dataset_ops.SkipDataset

# Fase 3: Construcción del Modelo

## Capas

In [None]:
# Tensor layout used by the layers below: (batches, seq_len, d_model, channels)


class Widening(layers.Layer):
    """Pads one axis symmetrically with a constant (padding for a wide 2D convolution).

    Args:
        padding_size: Number of entries added before and after along `axis`.
        padding_value: Constant written into the padded positions.
        axis: Index of the dimension to pad; all other dimensions are untouched.
    """

    def __init__(self, padding_size, padding_value=0., axis=1):
        super(Widening, self).__init__()
        self.axis = axis
        self.padding_value = padding_value
        self.padding_size = padding_size

    def build(self, input_shape):
        # One [before, after] row per input dimension, all zeros except the
        # row of the padded axis.
        pad_spec = np.zeros((len(input_shape), 2), dtype=int)
        pad_spec[self.axis, :] = (self.padding_size, self.padding_size)
        self.paddings = tf.convert_to_tensor(pad_spec, dtype="int32")

    def call(self, inputs):
        return tf.pad(inputs, self.paddings, constant_values=self.padding_value)

class MyConv2D(layers.Layer):
    """Convolves every embedding column with its own set of filters.

    One `Conv2D` with kernel `[conv_width, 1]` and tanh activation is created
    per embedding column. Each is applied to its own column (axis 2 of the
    input) and the results are concatenated back along that axis.

    Args:
        nb_filters: Number of filters of each per-column convolution.
        conv_width: Kernel size along axis 1.
        padding: Padding mode forwarded to every `Conv2D`. It used to be
            accepted but ignored in favour of a hard-coded "valid".
        emb_dim: Number of embedding columns, i.e. how many convolutions are
            created and how many slices of axis 2 are processed.
    """

    def __init__(self,
                 nb_filters,
                 conv_width,
                 padding,
                 emb_dim):
        super(MyConv2D, self).__init__()
        self.emb_dim = emb_dim
        self.conv_per_col = [layers.Conv2D(filters=nb_filters,
                                           kernel_size=[conv_width, 1],
                                           padding=padding,
                                           activation="tanh")
                             for _ in range(emb_dim)]

    def call(self, inputs):
        # Slice with i:i+1 so each column keeps a size-1 axis 2 and can be
        # fed to Conv2D as a 4-D tensor.
        convolutions = [
            conv(inputs[:, :, i:i + 1, :])
            for i, conv in enumerate(self.conv_per_col)
        ]
        return tf.concat(convolutions, axis=-2)


class KMaxPooling(layers.Layer):
    """k-max pooling over axis 1 that keeps the selected values in sequence order.

    For every (batch, column, channel) slice the k largest values along
    axis 1 are kept, in the order in which they appear in the input, as
    k-max pooling is defined in the DCNN paper (arXiv:1404.2188).

    Args:
        ktop: Minimum (and default) number of values kept.
        L: Total number of convolutional layers, used for the dynamic k.
        l: Index of the current convolutional layer, used for the dynamic k.

    When `L`, `l` and the static length of axis 1 are all known, k is
    `max(ktop, ceil((L - l) / L * s))`; otherwise k is `ktop`.
    """

    def __init__(self, ktop=4, L=None, l=None):
        super(KMaxPooling, self).__init__()
        self.ktop = ktop
        self.L = L
        self.l = l

    def build(self, input_shape):
        s = input_shape[1]
        if self.L is None or self.l is None or s is None:
            self.k = self.ktop
        else:
            self.k = max(self.ktop, math.ceil((self.L-self.l)/self.L*s))

    def call(self, inputs):
        # top_k works on the last dimension, so move axis 1 to the end.
        inputs_trans = tf.transpose(inputs, [0, 3, 2, 1])
        # top_k returns values ordered by magnitude. Take the positions
        # instead and sort them ascending so the gathered values keep their
        # original order along the sequence.
        top_positions = tf.math.top_k(inputs_trans, self.k).indices
        ordered_positions = tf.sort(top_positions, axis=-1)
        inputs_trans_kmax = tf.gather(inputs_trans,
                                      ordered_positions,
                                      axis=3,
                                      batch_dims=3)
        # Restore the original axis order.
        inputs_kmax = tf.transpose(inputs_trans_kmax, [0, 3, 2, 1])

        return inputs_kmax
        

class Folding(layers.Layer):
    """Folding layer: averages each pair of adjacent entries along axis 2.

    Entries at even positions of axis 2 are added to those at odd positions
    and the sum is divided by two, which halves the size of that axis.
    """

    def __init__(self):
        super(Folding, self).__init__()

    def call(self, inputs):
        even_cols = inputs[:, :, 0::2, :]
        odd_cols = inputs[:, :, 1::2, :]
        return tf.math.add_n([even_cols, odd_cols]) / 2
        

## Nuestro modelo de RNC

In [None]:
class DCNN(tf.keras.Model):
    """Dynamic convolutional network for sentence classification.

    Pipeline: embedding -> widening + per-column convolution + k-max pooling
    -> widening + per-column convolution + folding + k-max pooling ->
    dropout -> flatten -> dense output.

    Args:
        vocab_size: Size of the embedding vocabulary.
        emb_dim: Embedding dimension (number of columns convolved separately).
        nb_filters_1: Filters of the first convolution.
        conv_width_1: Kernel width of the first convolution; the input is
            padded by `conv_width_1 - 1` on each side of axis 1 beforehand.
        ktop_max: Number of values kept by the last pooling layer, and the
            lower bound for the first one.
        nb_filters_2: Filters of the second convolution.
        conv_width_2: Kernel width of the second convolution; padded like
            the first.
        fold_patch: Accepted for backward compatibility but unused; the
            folding layer always merges adjacent pairs.
        nb_of_layers: Total number of convolutional layers, forwarded to the
            first pooling layer to compute its dynamic k.
        padding_value: Constant used by both widening layers.
        dropout_rate: Dropout rate applied before the classifier.
        nb_classes: Number of classes. With 2 the output is a single sigmoid
            unit, otherwise a softmax over `nb_classes`.
        name: Model name.
        **kwargs: Forwarded to `tf.keras.Model`.
    """

    def __init__(self,
                 vocab_size,
                 emb_dim=48,
                 nb_filters_1=6,
                 conv_width_1=7,
                 ktop_max=4,
                 nb_filters_2=14,
                 conv_width_2=5,
                 fold_patch=2,
                 nb_of_layers=2,
                 padding_value=0,
                 dropout_rate=0.1,
                 nb_classes=2,
                 name="dcnn",
                 **kwargs):
        super(DCNN, self).__init__(name=name, **kwargs)

        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        # padding_value is now forwarded; before it was accepted and ignored.
        self.widening_1 = Widening(conv_width_1-1,
                                   padding_value=padding_value,
                                   axis=1)
        self.conv_1 = MyConv2D(nb_filters=nb_filters_1,
                               conv_width=conv_width_1,
                               padding="valid",
                               emb_dim=emb_dim)
        # First pooling: layer 1 of nb_of_layers (used for the dynamic k).
        self.pool_1 = KMaxPooling(ktop_max,
                                  nb_of_layers,
                                  1)

        self.widening_2 = Widening(conv_width_2-1,
                                   padding_value=padding_value,
                                   axis=1)
        self.conv_2 = MyConv2D(nb_filters=nb_filters_2,
                               conv_width=conv_width_2,
                               padding="valid",
                               emb_dim=emb_dim)
        self.fold = Folding()
        # Last pooling: fixed k = ktop_max.
        self.pool_2 = KMaxPooling(ktop_max)

        self.dropout = layers.Dropout(rate=dropout_rate)
        self.flatten = layers.Flatten()
        if nb_classes == 2:
            self.dense = layers.Dense(1, activation="sigmoid")
        else:
            self.dense = layers.Dense(nb_classes,
                                      activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-id sequences.
            training: Forwarded to the dropout layer; `None` lets Keras
                decide from the calling context.

        Returns:
            The output of the final dense layer.
        """
        x = self.embedding(inputs)
        # Add a trailing channel axis so the 2D convolutions can be applied.
        x = tf.expand_dims(x, axis=-1)

        x = self.widening_1(x)
        x = self.conv_1(x)
        x = self.pool_1(x)

        x = self.widening_2(x)
        x = self.conv_2(x)
        x = self.fold(x)
        x = self.pool_2(x)

        x = self.dropout(x, training=training)
        x = self.flatten(x)
        x = self.dense(x)
        return x

# Fase 4: Aplicación

## Configuración

In [None]:
# Vocabulary size reported by the fitted tokenizer (original note: 66125).
VOCAB_SIZE = tokenizer.vocab_size

# Model hyperparameters, passed to the DCNN constructor.
EMB_DIM = 60
NB_FILTERS_1 = 6
CONV_WIDTH_1 = 7
KTOP_MAX = 4
NB_FILTERS_2 = 14
CONV_WIDTH_2 = 5
FOLD_PATCH = 2
NB_OF_LAYERS = 2
DROPOUT_RATE = 0.1

# Number of target classes; selects the output layer and the loss.
NB_CLASSES = 2

# Number of training epochs.
EPOCHS = 5

## Entrenamiento

In [None]:
# Build the model from the hyperparameters defined in the configuration cell.
# `padding_value` is not passed, so the constructor default is used.
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters_1=NB_FILTERS_1,
            conv_width_1=CONV_WIDTH_1,
            ktop_max=KTOP_MAX,
            nb_filters_2=NB_FILTERS_2,
            conv_width_2=CONV_WIDTH_2,
            fold_patch=FOLD_PATCH,
            nb_of_layers=NB_OF_LAYERS,
            dropout_rate=DROPOUT_RATE,
            nb_classes=NB_CLASSES)

In [None]:
# Binary problems use binary cross-entropy with plain accuracy; anything else
# uses the sparse categorical loss and metric. The optimizer is Adagrad in
# both cases.
Dcnn.compile(
    loss=("binary_crossentropy" if NB_CLASSES == 2
          else "sparse_categorical_crossentropy"),
    optimizer="adagrad",
    metrics=["accuracy" if NB_CLASSES == 2
             else "sparse_categorical_accuracy"],
)

In [None]:
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that saves a checkpoint at the end of every epoch.

    It uses the module-level `ckpt_manager` and `checkpoint_path`, which are
    looked up when the callback runs.
    """

    def on_epoch_end(self, epoch, logs=None):
        ckpt_manager.save()
        print("Checkpoint guardado en {}.".format(checkpoint_path))


# Directory on the mounted Drive where checkpoints are written.
checkpoint_path = "/content/drive/My Drive/Curso de NLP/CNN/ckpt_advanced"

# Track the model under the key "Dcnn" and keep only the newest checkpoint.
ckpt = tf.train.Checkpoint(Dcnn=Dcnn)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=1)

# Resume from the latest checkpoint when one exists.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print("Último checkpoint restaurado!!")


In [None]:
# Train on the training split for EPOCHS epochs; the callback saves a
# checkpoint after each epoch.
Dcnn.fit(train_dataset,
         epochs=EPOCHS,
         callbacks=[MyCustomCallback()])

Epoch 1/5
  44067/Unknown - 2283s 52ms/step - loss: 0.6909 - accuracy: 0.5241Checkpoint guardado en /content/drive/My Drive/Curso de NLP/CNN/ckpt_advanced.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f7e7558a710>

## Evaluación

In [None]:
# Evaluate on the held-out split; `results` holds the loss followed by the
# compiled metric.
results = Dcnn.evaluate(test_dataset)
print(results)

[0.5345908999443054, 0.7479830384254456]


In [None]:
# Score one hand-written sentence: encode it to subword ids, cast to int32,
# add a leading batch axis and run the model in inference mode.
print(
    Dcnn(
        tf.expand_dims(
            tf.cast(tokenizer.encode("You should try it too!"), dtype=tf.int32),
            axis=0,
        ),
        training=False,
    )
)

tf.Tensor([[0.87635505]], shape=(1, 1), dtype=float32)
