In [183]:
# Comprobamos si está tensorflow-gpu==2.3.0
!pip freeze

absl-py==0.12.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.1.0
appdirs==1.4.4
argon2-cffi==20.1.0
arviz==0.11.2
astor==0.8.1
astropy==4.2.1
astunparse==1.6.3
async-generator==1.10
atari-py==0.2.6
atomicwrites==1.4.0
attrs==21.2.0
audioread==2.1.9
autograd==1.3
Babel==2.9.1
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==3.3.0
blis==0.4.1
bokeh==2.3.2
Bottleneck==1.3.2
branca==0.4.2
bs4==0.0.1
CacheControl==0.12.6
cachetools==4.2.2
catalogue==1.0.0
certifi==2020.12.5
cffi==1.14.5
cftime==1.4.1
chainer==7.4.0
chardet==3.0.4
click==7.1.2
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.9.5
colorcet==2.0.6
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.3.2
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cupy-cuda101==7.4.0
cvxopt==1.2.6
cvxpy==1.0.31
cycler==0.10.0
cymem==2.0.5
Cython==0.29.23
daft==0.0.4
dask==2.12.0
datascience==0.10.6
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
descartes==1.1.0
dill==0.3.3
distributed==1.25.3
dlib==19.18.0
d

In [184]:
# Esta es la tarjeta grafica
!nvidia-smi

Thu May 13 15:36:54 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    60W / 149W |   2212MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [185]:
# instalar librerías. Esta casilla es últil por ejemplo si se ejecuta el cuaderno en Google Colab
# Note que existen otras dependencias como tensorflow, etc. que en este caso se encontrarían ya instaladas

%%capture
# Libreríía transformers
try:
    import tranformers
    print("module 'tranformers' is installed")
except ModuleNotFoundError:
    print("module 'transformers' is being installed")
    !pip install transformers==4.2.1
# Por si se quiere usar la gpu verificamos si tenemos transformers-gpu
import sys
if "tensorflow-gpu" in sys.modules:
    print("tensorflow-gpu already in sys.modules")
else: 
  !pip install tensorflow-gpu==2.3.0

In [186]:
import pandas as pd
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn import preprocessing
from sklearn.metrics import f1_score
# Para trabajar teniendo encuenta el desbalanceo
from sklearn.utils import class_weight
import os
import tqdm
import numpy as np
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split

In [187]:
train_dataframe = pd.read_csv("https://raw.githubusercontent.com/jibt1/competition_group/main/Tareas/datasets/haha_2021_train.csv", sep=',')
train_dataframe.head()

Unnamed: 0,id,text,is_humor,votes_no,votes_1,votes_2,votes_3,votes_4,votes_5,humor_rating,humor_mechanism,humor_target
0,tweet1,Niveles de retraso mental: \n\n— Bajo.\n— Medi...,1,1,2,2,0,0,0,1.5,,
1,tweet2,"—Vamos Luke desenfunda tu sable, demuestra tu ...",1,1,3,0,1,0,0,1.5,,
2,tweet3,"- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...",1,0,2,1,0,1,1,2.6,,
3,tweet4,No se porqué me hago la cabeza deooos,0,3,0,0,0,0,0,,,
4,tweet5,Quisiera saber que hago durante la siesta de l...,0,4,0,1,0,0,0,,,


In [188]:
english_tweets = pd.read_csv("https://raw.githubusercontent.com/jibt1/competition_group/main/dataset_translated.csv", sep=',')
english_tweets = english_tweets[~english_tweets['texto'].isnull()]
english_tweets['humor'] = english_tweets['humor'].astype(int)
english_tweets.rename(columns={"humor": "is_humor"}, inplace=True)
english_tweets  = english_tweets[['texto', 'is_humor']]
english_tweets.rename(columns={"texto": "text"}, inplace=True)
english_tweets
# Clases ligeramente desbalanceadas
english_tweets.is_humor.value_counts()
english_tweets = english_tweets[1:25000]
english_tweets.is_humor.value_counts()

1    12545
0    12454
Name: is_humor, dtype: int64

In [189]:
cfg = {}
cfg["framework"] = "tf"
cfg["max_length"] = 256 # 380 caracteros maximo por tweet (Mirar maximo de longitud)
cfg["transformer_model_name"] = "dccuchile/bert-base-spanish-wwm-cased" # Este es el modelo Bert para Spanish, con mayusculas
cfg["num_labels"] = 1

In [225]:
# dim dimension del pooling layer de los outputs del encoder en la salida de la ultima capa
# https://huggingface.co/transformers/model_doc/distilbert.html
# https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased
# dropout es el dropout de las denses de las capas de encoders
# seq_classif_dropout es el dropout de la ultima densa ajena a Bert
# Reducimos la complejidad del problema, solo tenemos 25000 tweets

config_bert = DistilBertConfig(num_labels = cfg["num_labels"], attention_dropout=0.75,
                               seq_classif_dropout=0.75, dropout=0.4,
                               n_heads=4, dim=16, max_position_embeddings=cfg["max_length"],
                               n_layers=2, hidden_dim=32, vocab_size=30522)
model = TFDistilBertForSequenceClassification.from_pretrained(cfg["transformer_model_name"], config=config_bert)

Some layers from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing TFDistilBertForSequenceClassification: ['mlm___cls', 'bert']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['dropout_137', 'pre_classifier', 'classifier', 'distilbert']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [226]:
# Tripadvisor, aplicamos regresion sin reducir de 5 puntos a 0 1 2
# Cambiar las metricas, MSE

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1Score(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [227]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits = True)
model.compile(optimizer=optimizer, loss=loss, metrics = [f1Score, 'accuracy'])
model.summary()

Model: "tf_distil_bert_for_sequence_classification_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  496928    
_________________________________________________________________
pre_classifier (Dense)       multiple                  272       
_________________________________________________________________
classifier (Dense)           multiple                  17        
_________________________________________________________________
dropout_137 (Dropout)        multiple                  0         
Total params: 497,217
Trainable params: 497,217
Non-trainable params: 0
_________________________________________________________________


In [228]:
# Cargamos el tokenizador correspondiente ¿Lematiza?
cfg['tokenizer'] = DistilBertTokenizer.from_pretrained(cfg['transformer_model_name'] )
# Proceso de scikit learn para hacer OHE a 0 1 de la salida
cfg['label_binarizer'] = preprocessing.LabelBinarizer()

In [229]:
def get_model_inputs(cfg, data):
    encodings = cfg['tokenizer'](data, truncation=True, padding='max_length', max_length=cfg['max_length'], return_tensors=cfg['framework'])
    inputs = {'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
            }
    return inputs

In [230]:
# Constriumos la matriz de diseño con sus correspondientes Masks
x_train = train_dataframe['text']
y_train = train_dataframe['is_humor']

In [231]:
train_spanish = train_dataframe[['text','is_humor']]
train, val = train_test_split(train_spanish, test_size=.15)

In [232]:
train = pd.concat([train, english_tweets]).reset_index(drop=True)
train

Unnamed: 0,text,is_humor
0,Tapa más la niña sin brazos que el arquero del...,1
1,Prego,0
2,Estoy enamorada de mi campera,0
3,Las amistades son como las bubis: a veces se c...,1
4,Corazón que miente,0
...,...,...
45394,Las monjas súper tazón de observación están de...,0
45395,Mi gran boda gitana ': exhibiciones de novias ...,0
45396,¿Cuál es la diferencia entre 3 pollas y una br...,1
45397,Incluso las celebridades perdieron su fría sob...,0


In [233]:
train_inputs = get_model_inputs(cfg, train["text"].to_list())

In [234]:
val_inputs =  get_model_inputs(cfg, val["text"].to_list())
val_inputs

{'attention_mask': <tf.Tensor: shape=(3600, 256), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_ids': <tf.Tensor: shape=(3600, 256), dtype=int32, numpy=
 array([[    4,  1198,  3257, ...,     1,     1,     1],
        [    4,  1110,  1486, ...,     1,     1,     1],
        [    4,  1872, 26887, ...,     1,     1,     1],
        ...,
        [    4, 12816,  1957, ...,     1,     1,     1],
        [    4,  1412, 21144, ...,     1,     1,     1],
        [    4,   968,  1248, ...,     1,     1,     1]], dtype=int32)>}

In [235]:
# Empleamos el labelizador
# Trabajo ahora la target
cfg['label_binarizer'].fit(train["is_humor"])
train_blabels = cfg['label_binarizer'].transform(train["is_humor"])
val_blabels = cfg['label_binarizer'].transform(val["is_humor"])
# Pasamos a tensor
train_blabels_t = tf.convert_to_tensor(train_blabels, dtype='int32')
val_blabels_t = tf.convert_to_tensor(val_blabels, dtype='int32')
val_blabels_t

<tf.Tensor: shape=(3600, 1), dtype=int32, numpy=
array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]], dtype=int32)>

In [236]:
cfg['tokenizer'].decode([4, 1, 5])

'[CLS] [PAD] [SEP]'

In [237]:
cfg['checkpoints_dir'] = 'checkpoints'
cfg['model_name'] = 'distilbert-humor'
cfg['trained_model_name'] = os.path.join(cfg['checkpoints_dir'], cfg['model_name'])

In [238]:
tlabels = train_blabels.reshape(train_blabels.shape[0])
tlabels

array([1, 0, 0, ..., 1, 0, 1])

In [239]:
val_inputs

{'attention_mask': <tf.Tensor: shape=(3600, 256), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_ids': <tf.Tensor: shape=(3600, 256), dtype=int32, numpy=
 array([[    4,  1198,  3257, ...,     1,     1,     1],
        [    4,  1110,  1486, ...,     1,     1,     1],
        [    4,  1872, 26887, ...,     1,     1,     1],
        ...,
        [    4, 12816,  1957, ...,     1,     1,     1],
        [    4,  1412, 21144, ...,     1,     1,     1],
        [    4,   968,  1248, ...,     1,     1,     1]], dtype=int32)>}

In [240]:
epochs_max = 30
epochs_to_save = 10
batch_size = 80

class_weights = class_weight.compute_class_weight('balanced', np.unique(tlabels), tlabels)
class_weights = dict(enumerate(class_weights))

for epoch in tqdm.tqdm(range(0, epochs_max, epochs_to_save)):
    print(f'Training model, epochs {epoch+1} - {epoch+epochs_to_save}')
    
    # entrenar el modelo. Opcionalmente, se puede suministrar datos de validación => validation_data=(val_inputs,val_blabels_t )
    model.fit(train_inputs, y=train_blabels_t, epochs=epochs_to_save, batch_size=batch_size, validation_data=(val_inputs, val_blabels_t), class_weight=class_weights)

    model.save_pretrained(cfg['trained_model_name'] + f'-epochs-{epoch+1:03d}-{epoch+epochs_to_save:03d}')
    cfg['tokenizer'].save_pretrained(cfg['trained_model_name'] + f'-epochs-{epoch+1:03d}-{epoch+epochs_to_save:03d}')







  0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A[A[AThe parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Training model, epochs 1 - 10
Epoch 1/10


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10








 33%|███▎      | 1/3 [07:59<15:58, 479.08s/it][A[A[A[A[A[A

Training model, epochs 11 - 20
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10








 67%|██████▋   | 2/3 [15:52<07:57, 477.52s/it][A[A[A[A[A[A

Training model, epochs 21 - 30
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10








100%|██████████| 3/3 [23:49<00:00, 476.34s/it]
