In [165]:
# Comprobamos si está tensorflow-gpu==2.3.0
# !pip freeze

In [166]:
# Hay que instalar esto si se quiere utilizar la gpu
# !pip install tensorflow-gpu==2.3.0 

In [167]:
# Esta es la tarjeta grafica
# !nvidia-smi

In [168]:
# instalar librerías. Esta casilla es últil por ejemplo si se ejecuta el cuaderno en Google Colab
# Note que existen otras dependencias como tensorflow, etc. que en este caso se encontrarían ya instaladas
%%capture
!pip install transformers==4.2.1

print('Done!')

In [169]:
import pandas as pd
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn import preprocessing
from sklearn.metrics import f1_score
# Para trabajar teniendo encuenta el desbalanceo
from sklearn.utils import class_weight
import os
import tqdm
import numpy as np
from tensorflow.keras import backend as K

In [170]:
train_dataframe = pd.read_csv("https://raw.githubusercontent.com/jibt1/competition_group/main/datasets/haha_2021_train.csv", sep=',')
train_dataframe.head()

Unnamed: 0,id,text,is_humor,votes_no,votes_1,votes_2,votes_3,votes_4,votes_5,humor_rating,humor_mechanism,humor_target
0,tweet1,Niveles de retraso mental: \n\n— Bajo.\n— Medi...,1,1,2,2,0,0,0,1.5,,
1,tweet2,"—Vamos Luke desenfunda tu sable, demuestra tu ...",1,1,3,0,1,0,0,1.5,,
2,tweet3,"- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...",1,0,2,1,0,1,1,2.6,,
3,tweet4,No se porqué me hago la cabeza deooos,0,3,0,0,0,0,0,,,
4,tweet5,Quisiera saber que hago durante la siesta de l...,0,4,0,1,0,0,0,,,


In [171]:
cfg = {}
cfg["framework"] = "tf"
cfg["max_length"] = 256 # 380 caracteros maximo por tweet (Mirar maximo de longitud)
cfg["transformer_model_name"] = "dccuchile/bert-base-spanish-wwm-cased" # Este es el modelo Bert para Spanish, con mayusculas
cfg["num_labels"] = 1

In [172]:
# dim dimension del pooling layer de los outputs del encoder en la salida de la ultima capa
# https://huggingface.co/transformers/model_doc/distilbert.html
# https://huggingface.co/dccuchile/bert-base-spanish-wwm-uncased
# dropout es el dropout de las denses de las capas de encoders
# seq_classif_dropout es el dropout de la ultima densa ajena a Bert
# Reducimos la complejidad del problema, solo tenemos 25000 tweets

config_bert = DistilBertConfig(num_labels = cfg["num_labels"], 
                               attention_dropout=0.75, seq_classif_dropout=0, dropout=0.75,
                               n_heads=4, dim=128, max_position_embeddings=cfg["max_length"],
                               n_layers=2, hidden_dim=256)
model = TFDistilBertForSequenceClassification.from_pretrained(cfg["transformer_model_name"], config=config_bert)

Some layers from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing TFDistilBertForSequenceClassification: ['bert', 'mlm___cls']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['distilbert', 'pre_classifier', 'classifier', 'dropout_131']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [173]:
# Tripadvisor, aplicamos regresion sin reducir de 5 puntos a 0 1 2
# Cambiar las metricas, MSE

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1Score(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [174]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits = True)
model.compile(optimizer=optimizer, loss=loss, metrics = [f1Score, 'accuracy'])
model.summary()

Model: "tf_distil_bert_for_sequence_classification_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  4204800   
_________________________________________________________________
pre_classifier (Dense)       multiple                  16512     
_________________________________________________________________
classifier (Dense)           multiple                  129       
_________________________________________________________________
dropout_131 (Dropout)        multiple                  0         
Total params: 4,221,441
Trainable params: 4,221,441
Non-trainable params: 0
_________________________________________________________________


In [175]:
# Cargamos el tokenizador correspondiente ¿Lematiza?
cfg['tokenizer'] = DistilBertTokenizer.from_pretrained(cfg['transformer_model_name'] )
# Proceso de scikit learn para hacer OHE a 0 1 de la salida
cfg['label_binarizer'] = preprocessing.LabelBinarizer()

In [176]:
# Empleamos el labelizador
cfg['label_binarizer'].fit(train_dataframe["is_humor"])
train_blabels = cfg['label_binarizer'].transform(train_dataframe["is_humor"])
print(train_blabels)
train_blabels_t = tf.convert_to_tensor(train_blabels, dtype='int32')

# Clases ligeramente desbalanceadas
df_clases = pd.DataFrame(train_blabels)
round(df_clases.value_counts()/df_clases.shape[0] * 100, 2)

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [1]]


0    61.45
1    38.55
dtype: float64

In [177]:
def get_model_inputs(cfg, data):
    encodings = cfg['tokenizer'](data, truncation=True, padding='max_length', max_length=cfg['max_length'], return_tensors=cfg['framework'])
    inputs = {'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
            }
    return inputs

In [178]:
# Constriumos la matriz de diseño con sus correspondientes Masks
train_inputs = get_model_inputs(cfg, train_dataframe["text"].to_list())
train_inputs

{'attention_mask': <tf.Tensor: shape=(24000, 256), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_ids': <tf.Tensor: shape=(24000, 256), dtype=int32, numpy=
 array([[    4, 10682,  1019, ...,     1,     1,     1],
        [    4,     3,  1716, ...,     1,     1,     1],
        [    4,  1149,  1067, ...,     1,     1,     1],
        ...,
        [    4,  2448,  1030, ...,     1,     1,     1],
        [    4,  4596,  1512, ...,     1,     1,     1],
        [    4,  1153,  8386, ...,     1,     1,     1]], dtype=int32)>}

In [179]:
print(cfg['tokenizer'].pad_token)
cfg['tokenizer'].encode([cfg['tokenizer'].pad_token])

[PAD]


[4, 1, 5]

In [180]:
cfg['tokenizer'].decode([4, 1, 5])

'[CLS] [PAD] [SEP]'

In [181]:
cfg['checkpoints_dir'] = 'checkpoints'
cfg['model_name'] = 'distilbert-humor'
cfg['trained_model_name'] = os.path.join(cfg['checkpoints_dir'], cfg['model_name'])

In [182]:
tlabels = train_blabels.reshape(train_blabels.shape[0])

In [183]:
epochs_max = 10
epochs_to_save = 5
batch_size = 32

class_weights = class_weight.compute_class_weight('balanced', np.unique(tlabels), tlabels)
class_weights = dict(enumerate(class_weights))

for epoch in tqdm.tqdm(range(0, epochs_max, epochs_to_save)):
    print(f'Training model, epochs {epoch+1} - {epoch+epochs_to_save}')
    
    # entrenar el modelo. Opcionalmente, se puede suministrar datos de validación => validation_data=(val_inputs,val_blabels_t )
    model.fit(train_inputs, y=train_blabels_t, epochs=epochs_to_save, 
              batch_size=batch_size, validation_split=0.25, class_weight=class_weights)

    model.save_pretrained(cfg['trained_model_name'] + f'-epochs-{epoch+1:03d}-{epoch+epochs_to_save:03d}')
    cfg['tokenizer'].save_pretrained(cfg['trained_model_name'] + f'-epochs-{epoch+1:03d}-{epoch+epochs_to_save:03d}')







  0%|          | 0/2 [00:00<?, ?it/s][A[A[A[A[A[AThe parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Training model, epochs 1 - 5
Epoch 1/5


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.




The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5








 50%|█████     | 1/2 [01:55<01:55, 115.19s/it][A[A[A[A[A[A

Training model, epochs 6 - 10
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
125/563 [=====>........................] - ETA: 15s - loss: 0.2994 - f1Score: 0.8077 - accuracy: 0.8800

KeyboardInterrupt: ignored