# 1.- Instalar Dependencias

In [1]:
#se requiere tener transformers 2.8.0
!pip list #devuelve la lista de paquetes en el entorno actual junto con la versión de cada paquete

Package                       Version
----------------------------- ---------------------
absl-py                       1.0.0
alabaster                     0.7.12
albumentations                0.1.12
altair                        4.2.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.12.0
astor                         0.8.1
astropy                       4.3.1
astunparse                    1.6.3
atari-py                      0.2.9
atomicwrites                  1.4.0
attrs                         21.4.0
audioread                     2.1.9
autograd                      1.4
Babel                         2.10.1
backcall                      0.2.0
beautifulsoup4                4.6.3
bleach                        5.0.0
blis                          0.4.1
bokeh                         2.3.3
Bottleneck                    1.3.4
branca                        0.5.0
bs4                           0.0.1
Cache

In [2]:
pip list | grep -E 'tensorflow|transformers' #verificamos especificamente los paquetes de tensorflow y transformers

tensorflow                    2.8.0
tensorflow-datasets           4.0.1
tensorflow-estimator          2.8.0
tensorflow-gcs-config         2.8.0
tensorflow-hub                0.12.0
tensorflow-io-gcs-filesystem  0.25.0
tensorflow-metadata           1.7.0
tensorflow-probability        0.16.0


In [1]:
%%capture
!pip uninstall -y tensorflow
!pip install transformers==2.8.0
# tensorflow y transformers chocan

# 2 Obtener datos

In [2]:
import os

# Bajar y descomprimir el conjunto de  datos de subtitulos de las peliculas
if not os.path.exists('data/dataset.txt'):
  !wget "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/es.txt.gz" -O dataset.txt.gz
  !gzip -d dataset.txt.gz
  !mkdir data
  !mv dataset.txt data

--2022-04-30 14:39:05--  https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2016/mono/es.txt.gz
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1859673728 (1.7G) [application/gzip]
Saving to: ‘dataset.txt.gz’


2022-04-30 14:40:32 (20.6 MB/s) - ‘dataset.txt.gz’ saved [1859673728/1859673728]



In [3]:
# Check the total number of lines in the dataset and display a few randomly chosen lines
!wc -l data/dataset.txt
!shuf -n 5 data/dataset.txt

179287150 data/dataset.txt
Michael, ¿vas entendiendo?
¿Sabes qué?
Bueno, para empezar, no sé dónde has estado.
Aún te queda uno
Ken, ¿estás bien?


In [4]:
# Obtener un subconjunto del primer 1,000,000 de lineas para el entrenamiento
TRAIN_SIZE = 1000000 #@param {type:"integer"}
!(head -n $TRAIN_SIZE data/dataset.txt) > data/train.txt

In [5]:
# Obtener un subconjunto de las siguientes 10,000 lineas para la validación 
VAL_SIZE = 10000 #@param {type:"integer"}
!(sed -n {TRAIN_SIZE + 1},{TRAIN_SIZE + VAL_SIZE}p data/dataset.txt) > data/dev.txt

# Entrenar el tokenizador

In [6]:
%%time
from tokenizers import ByteLevelBPETokenizer

path = "data/train.txt"

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=path,
                vocab_size=50265,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Save files to disk
!mkdir -p "models/roberta"
tokenizer.save("models/roberta")

CPU times: user 26.7 s, sys: 398 ms, total: 27.1 s
Wall time: 26.9 s


# Entrenamiento del modelo

## 1 Arquitectura del modelo

In [7]:
import json


def _write_json(target, payload):
    """Serialize `payload` as JSON into the file `target`, overwriting it."""
    with open(target, 'w') as fp:
        json.dump(payload, fp)


# Model configuration, stored as config.json in the model directory
config = {
    "architectures": ["RobertaForMaskedLM"],
    "attention_probs_dropout_prob": 0.1,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-05,
    "max_position_embeddings": 514,
    "model_type": "roberta",
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "type_vocab_size": 1,
    "vocab_size": 50265,
}
_write_json("models/roberta/config.json", config)

# Tokenizer settings, stored next to the model configuration
tokenizer_config = {"max_len": 512}
_write_json("models/roberta/tokenizer_config.json", tokenizer_config)

## 2 Inicio del entrenamiento

In [8]:
# Update April 22, 2020: Hugging Face updated run_language_modeling.py script.
# Please use this version which was before the update.
# The script is fetched from the master branch of the chriskhanhtran/spanish-bert
# repository and ends up as run_language_modeling.py in the working directory.
!wget -c https://raw.githubusercontent.com/chriskhanhtran/spanish-bert/master/run_language_modeling.py

--2022-04-30 14:46:38--  https://raw.githubusercontent.com/chriskhanhtran/spanish-bert/master/run_language_modeling.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34328 (34K) [text/plain]
Saving to: ‘run_language_modeling.py’


2022-04-30 14:46:38 (13.7 MB/s) - ‘run_language_modeling.py’ saved [34328/34328]



### Argumentos importantes

In [10]:
# Model paths
# Each value below is a literal followed by a #@param annotation.
# MODEL_TYPE: value passed to the training script as --model_type.
# MODEL_DIR: directory that already holds config.json, tokenizer_config.json and the
#   saved tokenizer; passed as both --config_name and --tokenizer_name.
# OUTPUT_DIR: value passed to the training script as --output_dir.
# TRAIN_PATH / EVAL_PATH: the training and validation text files created earlier.
MODEL_TYPE = "roberta" #@param ["roberta", "bert"]
MODEL_DIR = "models/roberta" #@param {type: "string"}
OUTPUT_DIR = "models/roberta/output" #@param {type: "string"}
TRAIN_PATH = "data/train.txt" #@param {type: "string"}
EVAL_PATH = "data/dev.txt" #@param {type: "string"}

In [11]:
# Show the NVIDIA GPU(s) visible to this runtime before starting the training run
!nvidia-smi

Sat Apr 30 14:48:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    31W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Command-line template for run_language_modeling.py.
# Every {placeholder} is filled in later with cmd.format(...); a placeholder that
# receives an empty string simply drops that option from the command.
# The string is not raw, so each backslash at the end of a line is a line
# continuation and the template collapses into a single shell command line.
# --max_steps is written out in full: the shortened --max_step is presumably only
# accepted through the argument parser's prefix matching, which would stop working
# as soon as another option starting with --max_step is added (TODO confirm the
# parser used by the script).
cmd = """python run_language_modeling.py \
    --output_dir {output_dir} \
    --model_type {model_type} \
    --mlm \
    --config_name {config_name} \
    --tokenizer_name {tokenizer_name} \
    {line_by_line} \
    {should_continue} \
    {model_name_or_path} \
    --train_data_file {train_path} \
    --eval_data_file {eval_path} \
    --do_train \
    {do_eval} \
    {evaluate_during_training} \
    --overwrite_output_dir \
    --block_size 512 \
    --max_steps 25 \
    --warmup_steps 10 \
    --learning_rate 5e-5 \
    --per_gpu_train_batch_size 4 \
    --gradient_accumulation_steps 4 \
    --weight_decay 0.01 \
    --adam_epsilon 1e-6 \
    --max_grad_norm 100.0 \
    --save_total_limit 10 \
    --save_steps 10 \
    --logging_steps 2 \
    --seed 42
"""

In [13]:
# Values substituted into the command template for a from-scratch training run.
# evaluate_during_training, line_by_line, should_continue and model_name_or_path
# are switched off by passing empty strings.
train_params = dict(
    output_dir=OUTPUT_DIR,
    model_type=MODEL_TYPE,
    config_name=MODEL_DIR,
    tokenizer_name=MODEL_DIR,
    train_path=TRAIN_PATH,
    eval_path=EVAL_PATH,
    do_eval="--do_eval",
    evaluate_during_training="",
    line_by_line="",
    should_continue="",
    model_name_or_path="",
)

In [14]:
!{cmd.format(**train_params)}

04/30/2022 14:48:47 - INFO - transformers.configuration_utils -   loading configuration file models/roberta/config.json
04/30/2022 14:48:47 - INFO - transformers.configuration_utils -   Model config RobertaConfig {
  "_num_labels": 2,
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": 0,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": 2,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-05,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 514,
  "min_length": 0,
  "model_type": "roberta",
  "no_repeat_ngram_size": 0,
  "num_attent

# Predecir palabras enmascaradas

In [15]:
%%capture
%%time
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="chriskhanhtran/spanberta",
    tokenizer="chriskhanhtran/spanberta"
)

In [16]:
# Ask the pipeline for candidate replacements of <mask> at the end of a hand-washing sentence
fill_mask("Lavarse frecuentemente las manos con agua y <mask>.")

[{'score': 0.6469593644142151,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y jabón.</s>',
  'token': 18493},
 {'score': 0.06074436753988266,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y sal.</s>',
  'token': 619},
 {'score': 0.029788268730044365,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y vapor.</s>',
  'token': 11079},
 {'score': 0.02641025371849537,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y limón.</s>',
  'token': 12788},
 {'score': 0.017029233276844025,
  'sequence': '<s> Lavarse frecuentemente las manos con agua y vinagre.</s>',
  'token': 18424}]

In [17]:
# Ask the pipeline for candidate replacements of <mask> in a sentence about eating fruit
fill_mask("Come frutas y <mask>.")

[{'score': 0.5663056969642639,
  'sequence': '<s> Come frutas y verduras.</s>',
  'token': 10223},
 {'score': 0.24918697774410248,
  'sequence': '<s> Come frutas y vegetales.</s>',
  'token': 12582},
 {'score': 0.10890160501003265,
  'sequence': '<s> Come frutas y hortalizas.</s>',
  'token': 25283},
 {'score': 0.009446275420486927,
  'sequence': '<s> Come frutas y verdura.</s>',
  'token': 31641},
 {'score': 0.005149755626916885,
  'sequence': '<s> Come frutas y bebidas.</s>',
  'token': 8767}]

In [22]:
# Open-ended prompt: unlike the previous examples there is no period after <mask>,
# so the mask is the last token of the sentence.
fill_mask("Estoy viendo <mask>")

[{'score': 0.0782180055975914,
  'sequence': '<s> Estoy viendo.</s>',
  'token': 18},
 {'score': 0.04264475777745247,
  'sequence': '<s> Estoy viendo Netflix</s>',
  'token': 17125},
 {'score': 0.030524639412760735,
  'sequence': '<s> Estoy viendo películas</s>',
  'token': 4174},
 {'score': 0.0169787909835577,
  'sequence': '<s> Estoy viendo Estoy</s>',
  'token': 8679},
 {'score': 0.016631552949547768,
  'sequence': '<s> Estoy viendo:</s>',
  'token': 30}]