In [2]:
# instalar librerías. Esta casilla es últil por ejemplo si se ejecuta el cuaderno en Google Colab
# Note que existen otras dependencias como tensorflow, etc. que en este caso se encontrarían ya instaladas
%%capture
!pip install transformers==4.2.1

print('Done!')

In [3]:
import pandas as pd
import tensorflow as tf
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn import preprocessing
from sklearn.metrics import f1_score
import os
import tqdm
from tensorflow.keras import backend as K

In [4]:
train_dataframe = pd.read_csv("https://raw.githubusercontent.com/jibt1/competition_group/main/datasets/haha_2021_train.csv", sep=',')
train_dataframe.head()

Unnamed: 0,id,text,is_humor,votes_no,votes_1,votes_2,votes_3,votes_4,votes_5,humor_rating,humor_mechanism,humor_target
0,tweet1,Niveles de retraso mental: \n\n— Bajo.\n— Medi...,1,1,2,2,0,0,0,1.5,,
1,tweet2,"—Vamos Luke desenfunda tu sable, demuestra tu ...",1,1,3,0,1,0,0,1.5,,
2,tweet3,"- ¿Te ofrezco algo?, ¿Agua, café, mi corazón, ...",1,0,2,1,0,1,1,2.6,,
3,tweet4,No se porqué me hago la cabeza deooos,0,3,0,0,0,0,0,,,
4,tweet5,Quisiera saber que hago durante la siesta de l...,0,4,0,1,0,0,0,,,


In [5]:
cfg = {}
cfg["framework"] = "tf"
cfg["max_length"] = 512
cfg["transformer_model_name"] = "distilbert-base-uncased"
cfg["num_labels"] = 1

In [6]:
config_bert = DistilBertConfig(num_labels = cfg["num_labels"], seq_classif_dropout=0.25)
model = TFDistilBertForSequenceClassification.from_pretrained(cfg["transformer_model_name"], config=config_bert)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1Score(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits = True)
model.compile(optimizer=optimizer, loss=loss, metrics = [f1Score])
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  769       
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,954,241
Trainable params: 66,954,241
Non-trainable params: 0
_________________________________________________________________


In [None]:
cfg['tokenizer'] = DistilBertTokenizer.from_pretrained(cfg['transformer_model_name'] )
cfg['label_binarizer'] = preprocessing.LabelBinarizer()

In [None]:
cfg['label_binarizer'].fit(train_dataframe["is_humor"])
train_blabels = cfg['label_binarizer'].transform(train_dataframe["is_humor"])
train_blabels_t = tf.convert_to_tensor(train_blabels, dtype='int32')

In [None]:
def get_model_inputs(cfg, data):
    encodings = cfg['tokenizer'](data, truncation=True, padding='max_length', max_length=cfg['max_length'], return_tensors=cfg['framework'])
    inputs = {'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
            }
    return inputs

In [None]:
train_inputs = get_model_inputs(cfg, train_dataframe["text"].to_list())
train_inputs

{'input_ids': <tf.Tensor: shape=(24000, 512), dtype=int32, numpy=
 array([[  101,  9152, 15985, ...,     0,     0,     0],
        [  101,  1517, 12436, ...,     0,     0,     0],
        [  101,  1011,  1094, ...,     0,     0,     0],
        ...,
        [  101, 25176,  2474, ...,     0,     0,     0],
        [  101,  2632, 25698, ...,     0,     0,     0],
        [  101,  2771,  8273, ...,     0,     0,     0]])>,
 'attention_mask': <tf.Tensor: shape=(24000, 512), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]])>}

In [None]:
cfg['checkpoints_dir'] = 'checkpoints'
cfg['model_name'] = 'distilbert-humor'
cfg['trained_model_name'] = os.path.join(cfg['checkpoints_dir'], cfg['model_name'])

In [None]:
for epoch in tqdm.tqdm(range(0, 1, 1)):
    print(f'Training model, epochs {epoch+1}')
    
    # entrenar el modelo. Opcionalmente, se puede suministrar datos de validación => validation_data=(val_inputs,val_blabels_t )
    model.fit(train_inputs, y=train_blabels_t, epochs=1, batch_size=16)

    #model.save_pretrained(cfg['trained_model_name'] + f'-epochs-{epoch+1:03d}-{epoch+epochs_to_save:03d}')
    #cfg['tokenizer'].save_pretrained(cfg['trained_model_name'] + f'-epochs-{epoch+1:03d}-{epoch+epochs_to_save:03d}')

  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Training model, epochs 1
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x0000029CEEDA6818> is not a module, class, method, function, traceback, frame, or code object
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <cyfunction Socket.send at 0x0000029CEEDA6818> is not a module, class, method, function, traceback, frame, or code object


The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.
The parameters `output_attentions`, `output_hidden_states` and `use_cache` cannot be updated when calling a model.They have to be set to True/False in the config object (i.e.: `config=XConfig.from_pretrained('name', output_attentions=True)`).
The parameter `return_dict` cannot be set in graph mode and will always be set to `True`.


 341/1500 [=====>........................] - ETA: 7:26 - loss: 0.4578 - f1Score: 0.5772

  0%|                                                                                            | 0/1 [02:19<?, ?it/s]


KeyboardInterrupt: 