In [1]:
import torch

torch.cuda.is_available()

True

In [2]:
if torch.cuda.is_available() == False:
    print('Error, selecciona una instancia con GPU.')

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
os.getcwd()

'/home/ec2-user/SageMaker'

In [9]:
FOLDER_BASE = './dataset'
FOLDER_OUTPUT = './output'

S3_BUCKET_NAME = 'sagemaker-readability'

In [10]:
if not os.path.exists(FOLDER_OUTPUT):
    os.makedirs(FOLDER_OUTPUT)

In [7]:
!pip install -q simpletransformers==0.51.0

In [13]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AutoTokenizer, AutoModel

In [19]:
!pip show transformers # check transformers version

Name: transformers
Version: 4.15.0
Summary: State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
Home-page: https://github.com/huggingface/transformers
Author: Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Patrick von Platen, Sylvain Gugger, Suraj Patil, Stas Bekman, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors
Author-email: thomas@huggingface.co
License: Apache
Location: /home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages
Requires: filelock, huggingface-hub, importlib-metadata, numpy, packaging, pyyaml, regex, requests, sacremoses, tokenizers, tqdm
Required-by: simpletransformers


In [21]:
!pip show torch # check version

Name: torch
Version: 1.8.1+cu111
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /home/ec2-user/anaconda3/envs/pytorch_latest_p37/lib/python3.7/site-packages
Requires: numpy, typing-extensions
Required-by: fastai, torchvision


In [75]:
transformers_version='4.12.3' # la última no es compatible con el SDK de Sagemaker. Usar 4.12.3.
pytorch_version='1.9.1'
py_version='py38'

In [23]:
GLOBAL_SEED = 33

def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    print('SEED: ' + str(seed))

In [24]:
class experiment: 
  def __init__(self, SUBMISSION_NUM, FOLDER, FORCE_LOWERCASE, CONFIG_CLEAN_CHARS, REMOVE_0_0, REMOVE_STOP_WORDS, MODEL_TYPE, MODEL_NAME, AUTO_TOKENIZER_NAME, P_TEST_SIZE, P_TRAIN_SIZE, LOAD_TRAIN_AND_TEST_SETS, FILENAME_INPUT_SUFIX, NUM_TRAIN_EPOCHS, IS_FP16_ENABLED, MAX_SEQ_LENGTH, EVAL_BATCH_SIZE, TRAIN_BATCH_SIZE, CONFIG_GRADIENT_ACCUMULATION_STEPS, LEARNING_RATE, CONFIG_NO_SAVE_MODEL, WARMUP_STEPS=0): 
    self.FORCE_LOWERCASE = FORCE_LOWERCASE
    self.CONFIG_CLEAN_CHARS = CONFIG_CLEAN_CHARS

    self.REMOVE_0_0 = REMOVE_0_0

    self.REMOVE_STOP_WORDS = REMOVE_STOP_WORDS

    self.MODEL_TYPE = MODEL_TYPE # Ejemplos: 'bert'
    self.MODEL_NAME = MODEL_NAME # Ejemplos: 'bert-large-uncased', 'bert-base-uncased','roberta-large'
    self.AUTO_TOKENIZER_NAME = AUTO_TOKENIZER_NAME

    self.P_TEST_SIZE = P_TEST_SIZE
    self.P_TRAIN_SIZE = P_TRAIN_SIZE

    self.LOAD_TRAIN_AND_TEST_SETS = LOAD_TRAIN_AND_TEST_SETS
    self.FILENAME_INPUT_SUFIX = FILENAME_INPUT_SUFIX

    self.NUM_TRAIN_EPOCHS = NUM_TRAIN_EPOCHS
    self.IS_FP16_ENABLED = IS_FP16_ENABLED
    self.MAX_SEQ_LENGTH = MAX_SEQ_LENGTH # Se rellena auto al tokenizar.
    self.EVAL_BATCH_SIZE = EVAL_BATCH_SIZE
    self.TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE
    self.CONFIG_GRADIENT_ACCUMULATION_STEPS = CONFIG_GRADIENT_ACCUMULATION_STEPS
    self.LEARNING_RATE = LEARNING_RATE # Ejemplos: 1e-05, 4e-05

    self.CONFIG_NO_SAVE_MODEL = CONFIG_NO_SAVE_MODEL

    self.WARMUP_STEPS = WARMUP_STEPS

    now = datetime.now()
    dt_string = now.strftime("%Y-%m-%d_%H_%M_%S")
    self.OUTPUT_FOLDER = os.path.join(FOLDER, 'out-' + MODEL_NAME + '_' + str(SUBMISSION_NUM) + '_' + dt_string)

  def info(self):
    return self.MODEL_NAME + '-N' + str(self.NUM_TRAIN_EPOCHS) + '_LR' + str(self.LEARNING_RATE) + '_lower' + str(self.FORCE_LOWERCASE) + '_cleanChars' + str(self.CONFIG_CLEAN_CHARS) + '_removeStops' + str(self.REMOVE_STOP_WORDS) + '_removeZero' + str(self.REMOVE_0_0) + '_warm' + str(self.WARMUP_STEPS) + '_' + str(self.P_TRAIN_SIZE) + '_' + str(self.TRAIN_BATCH_SIZE) + 'x' + str(self.EVAL_BATCH_SIZE) + '_' + str(self.MAX_SEQ_LENGTH) + '_fp16' + str(self.IS_FP16_ENABLED)

In [25]:
from datetime import datetime
import random

from sklearn.model_selection import train_test_split

In [27]:
df_orig = pd.read_csv(os.path.join(FOLDER_BASE, 'train.csv'), sep=",")

In [28]:
df_orig

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [29]:
SUBMISSION_ID = 1

aux_experimento = experiment(
  SUBMISSION_NUM = SUBMISSION_ID,
  FOLDER = FOLDER_OUTPUT,
  
  FORCE_LOWERCASE = False,
  CONFIG_CLEAN_CHARS = False,

  REMOVE_0_0 = False,
  REMOVE_STOP_WORDS = False,

  MODEL_TYPE = 'roberta', # 'bert', 'roberta'
  MODEL_NAME = 'roberta-base', # '../input/roberta-base', # 'roberta-base', # 'bert-large-uncased' # 'bert-base-uncased' #roberta-large'
  AUTO_TOKENIZER_NAME = 'roberta-base', # '../input/roberta-base', # igual que model_name

  P_TEST_SIZE = 0.1,
  P_TRAIN_SIZE = 0.9,
  LOAD_TRAIN_AND_TEST_SETS = False,
  FILENAME_INPUT_SUFIX = 'writing-in-the-blog',

  NUM_TRAIN_EPOCHS = 5,
  IS_FP16_ENABLED = False,
  MAX_SEQ_LENGTH = 10000, # Se rellena auto al tokenizar.
  EVAL_BATCH_SIZE = 8,
  TRAIN_BATCH_SIZE = 8,
  CONFIG_GRADIENT_ACCUMULATION_STEPS = 1,
  LEARNING_RATE = 4e-5,

  WARMUP_STEPS = 0,

  CONFIG_NO_SAVE_MODEL = False
)

In [30]:
import statistics
import logging

def prediction_error(labels, preds):
  errors = [
             abs(label - pred)
             for label, pred in zip(labels, preds)
  ]
  return statistics.mean(errors)


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [31]:
seed_everything(seed=GLOBAL_SEED)

SEED: 33


In [32]:
one = aux_experimento

if one.LOAD_TRAIN_AND_TEST_SETS == False:
  x_train, x_test = train_test_split(df_orig, test_size=one.P_TEST_SIZE, train_size=one.P_TRAIN_SIZE, shuffle=True)

  now = datetime.now()
  dt_string = now.strftime("%Y-%m-%d_%H_%M_%S")
  suffix = dt_string + '_' + str(one.P_TRAIN_SIZE) + 'x' + str(one.P_TEST_SIZE)
    
  x_train.to_csv(os.path.join(FOLDER_OUTPUT, "x_train_" + suffix +  ".csv"), sep=",", index=False)
  x_test.to_csv(os.path.join(FOLDER_OUTPUT, "x_test_" + suffix  + ".csv"), sep=",", index=False)

  print('Separado el dataset en train y test. Guardado en disco.')
else:
  x_train = pd.read_csv(os.path.join(FOLDER_OUTPUT, 'x_train_' + one.FILENAME_INPUT_SUFIX + ".csv"), sep=",", error_bad_lines=True)
  x_test = pd.read_csv(os.path.join(FOLDER_OUTPUT, 'x_test_' + one.FILENAME_INPUT_SUFIX + ".csv"), sep=",", error_bad_lines=True)
  print('Se ha cargado de disco x_train y x_test.')
  
# eliminar el dato anómalo?:
if one.REMOVE_0_0:
  x_train = x_train[(x_train['target'] != 0) | (x_train['standard_error'] != 0)]
  x_test = x_test[(x_test['target'] != 0) | (x_test['standard_error'] != 0)]
  print('Eliminado dato anómalo (0, 0). ' + str(len(x_train)))

x_train.rename(columns = {'excerpt':'text', 'target':'labels'}, inplace = True)
x_test.rename(columns = {'excerpt':'text', 'target':'labels'}, inplace = True)

if one.FORCE_LOWERCASE == True:
  x_train['text'] = x_train['text'].str.lower()
  x_test['text'] = x_test['text'].str.lower()
  print('Conversión a minúsculas.')

if one.CONFIG_CLEAN_CHARS:
  x_train['text'] = x_train['text'].apply(lambda x: clean_chars(x, stops_spaced_only))
  x_test['text'] = x_test['text'].apply(lambda x: clean_chars(x, stops_spaced_only))
  print('Limpiados los caracteres especiales.')

if one.REMOVE_STOP_WORDS:
  print('Limpiadas las stopwords.')
  for one in stops:
    x_train['text'] = x_train['text'].apply(lambda x: x.replace(' {} '.format(one), ' '))
    x_test['text'] = x_test['text'].apply(lambda x: x.replace(' {} '.format(one), ' '))

# calcular max_seq
tokenizer = AutoTokenizer.from_pretrained(one.AUTO_TOKENIZER_NAME)

x_train['excerpt_tokeniker'] = x_train['text'].apply(lambda x: tokenizer.tokenize(x))
x_train['excerpt_tokeniker_len'] = x_train['excerpt_tokeniker'].apply(lambda x: len(x))

x_test['excerpt_tokeniker'] = x_test['text'].apply(lambda x: tokenizer.tokenize(x))
x_test['excerpt_tokeniker_len'] = x_test['excerpt_tokeniker'].apply(lambda x: len(x))

one.MAX_SEQ_LENGTH = max(x_train['excerpt_tokeniker_len'].max(), x_test['excerpt_tokeniker_len'].max()) + 2 # max len de Train y Eval. (y +2)
print('MAX_SEQ_LENGTH (from Train & Eval): ' + str(one.MAX_SEQ_LENGTH))


print('==> ' + str(SUBMISSION_ID) + ': ' + one.info())

Separado el dataset en train y test. Guardado en disco.


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

MAX_SEQ_LENGTH (from Train & Eval): 322
==> 1: roberta-base-N5_LR4e-05_lowerFalse_cleanCharsFalse_removeStopsFalse_removeZeroFalse_warm0_0.9_8x8_322_fp16False


In [33]:
model_args = ClassificationArgs()
model_args.num_train_epochs = one.NUM_TRAIN_EPOCHS
model_args.output_dir = one.OUTPUT_FOLDER
model_args.regression = True
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True

model_args.fp16 = one.IS_FP16_ENABLED
model_args.max_seq_length = one.MAX_SEQ_LENGTH
model_args.eval_batch_size = one.EVAL_BATCH_SIZE
model_args.train_batch_size = one.TRAIN_BATCH_SIZE
model_args.gradient_accumulation_steps = one.CONFIG_GRADIENT_ACCUMULATION_STEPS

model_args.learning_rate = one.LEARNING_RATE

model_args.no_save = one.CONFIG_NO_SAVE_MODEL
model_args.best_model_dir = os.path.join(one.OUTPUT_FOLDER, 'best_model')

model_args.manual_seed = GLOBAL_SEED # para reproducible

model_args.warmup_steps = one.WARMUP_STEPS

# model_args.wandb_project = WANDB_PROJECT_NAME
# model_args.wandb_kwargs['name'] = one.info()

print(model_args)

ClassificationArgs(adam_epsilon=1e-08, best_model_dir='./output/out-roberta-base_1_2022-01-09_19_42_25/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=2, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, adafactor_eps=(1e-30, 0.001), adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_beta1=None, adafactor_scale_parameter=True, adafactor_relative_step=True, adafactor_warmup_init=True, eval_batch_size=8, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=True, evaluate_each_epoch=True, fp16=False, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, manual_seed=33, max_grad_nor

In [34]:
model = ClassificationModel(
    one.MODEL_TYPE,
    one.MODEL_NAME,
    num_labels=1,
    args=model_args
)

model.train_model(x_train, eval_df=x_test, show_running_loss=True, verbose=True, prediction_error=prediction_error)

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.de

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/2550 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Epoch 0 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.4972742662001008, 'eval_loss': 0.37080445513129234}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Epoch 1 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.5295451188551813, 'eval_loss': 0.42812980131970513}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Epoch 2 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.6533960971832079, 'eval_loss': 0.6184557212723626}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Epoch 3 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.4459620239672212, 'eval_loss': 0.3230808104077975}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Epoch 4 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.47813418899616883, 'eval_loss': 0.36507454328238964}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to ./output/out-roberta-base_1_2022-01-09_19_42_25.


(1595,
 {'global_step': [319, 638, 957, 1276, 1595],
  'train_loss': [0.3251998722553253,
   0.07599388808012009,
   0.3134761154651642,
   0.0371391735970974,
   0.12416594475507736],
  'eval_loss': [0.37080445513129234,
   0.42812980131970513,
   0.6184557212723626,
   0.3230808104077975,
   0.36507454328238964],
  'prediction_error': [0.4972742662001008,
   0.5295451188551813,
   0.6533960971832079,
   0.4459620239672212,
   0.47813418899616883]})

In [35]:
result, model_outputs, wrong_predictions = model.eval_model(x_test, verbose=True)
result

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/284 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/36 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.47813418899616883, 'eval_loss': 0.36507454328238964}


{'eval_loss': 0.36507454328238964}

In [42]:
STR_SUBMISSION_ID = str(SUBMISSION_ID)

solutions = pd.DataFrame()
solutions['text_' + STR_SUBMISSION_ID] = x_test['text']
solutions['labels_' + STR_SUBMISSION_ID] = x_test['labels']
solutions['standard_error_' + STR_SUBMISSION_ID] = x_test['standard_error']

solutions['prediction_' + STR_SUBMISSION_ID] = model_outputs
solutions['prediction_error_abs_' + STR_SUBMISSION_ID] = solutions.apply(lambda x: abs(x['labels_' + STR_SUBMISSION_ID] - x['prediction_' + STR_SUBMISSION_ID]), axis=1)

print('(ABS) Mean prediction error for this experiment: ' + str(solutions['prediction_error_abs_' + STR_SUBMISSION_ID].mean(axis=0)))

(ABS) Mean prediction error for this experiment: 0.4459620252154145


## Cargamos el modelo entrenado previamente guardado en la instancia.

In [43]:
# cargando desde fichero el modelo entrenado
model_loaded = ClassificationModel(
    one.MODEL_TYPE,
    os.path.join(one.OUTPUT_FOLDER, 'best_model'),
    num_labels=1,
    args=model_args
)

In [44]:
result, model_outputs, wrong_predictions = model_loaded.eval_model(x_test, verbose=True, prediction_error=prediction_error)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/284 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/36 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.4459620239672212, 'eval_loss': 0.3230808104077975}


In [45]:
result

{'prediction_error': 0.4459620239672212, 'eval_loss': 0.3230808104077975}

In [46]:
# Ejemplos de predicción:

samples = ['A cat is an animal.',
           'Quantum computing is a type of computation that harnesses the collective properties of quantum states, such as superposition, interference, and entanglement, to perform calculations.',
           'The United States (U.S. or US), officially the United States of America (U.S.A. or USA) or America, is a country primarily located in North America.']
           
predictions, raw_outputs = model_loaded.predict(samples)
predictions

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

array([ 0.47816357, -2.13515425, -0.16586283])

## Enviamos el modelo entrenado a S3

In [55]:
import boto3, os

BEST_DIR = os.path.join(one.OUTPUT_FOLDER, 'best_model')
TARGZ_NAME = 'best_model.tar.gz'
BEST_DIR

'./output/out-roberta-base_1_2022-01-09_19_42_25/best_model'

In [48]:
# Opción 1: enviar todos los ficheros, descomprimidos.
# for (dirpath, dirnames, filenames) in os.walk(BEST_DIR):
#    for one in filenames:
#        path = os.path.join(dirpath, one)
#        print('enviando a S3: ' + path)
#        boto3.Session().resource('s3').Bucket(S3_BUCKET_NAME).Object(path).upload_file(path)

In [None]:
!ls

In [None]:
# Opción 2: comprimir y enviar el comprimido
%cd ~/SageMaker/

In [None]:
%cd {BEST_DIR}

In [None]:
!ls -l

In [None]:
!tar -zcvf {TARGZ_NAME} *

print('Comprimido :)')

boto3.Session().resource('s3').Bucket(S3_BUCKET_NAME).Object(TARGZ_NAME).upload_file(TARGZ_NAME)

print('Subido a S3 el tar.gz :)')

config.json
eval_results.txt
merges.txt
model_args.json
optimizer.pt
pytorch_model.bin
scheduler.pt
special_tokens_map.json
tokenizer_config.json
training_args.bin
vocab.json
Comprimido :)
Subido a S3 el tar.gz :)


In [62]:
model_data_url = 's3://' + S3_BUCKET_NAME + '/' + TARGZ_NAME
model_data_url

's3://sagemaker-readability/best_model.tar.gz'

## A partir de aquí, creamos el endpoint para inferencia serverless

Basado en el ejemplo de Julien Simon (Chief Evangelist @ Hugging Face): https://dev.to/juliensimon/aws-reinvent-2021-serverless-inference-on-sagemaker-for-real-g96

In [63]:
sm = boto3.client(service_name='sagemaker')

from time import gmtime, strftime

def name_with_timestamp(name):
    return '{}-{}'.format(name, strftime('%Y-%m-%d-%H-%M-%S', gmtime()))

In [64]:
huggingface_model_name    = name_with_timestamp('huggingface-serverless')
huggingface_epc_name      = name_with_timestamp('huggingface-serverless-epc')
huggingface_endpoint_name = name_with_timestamp('huggingface-serverless-ep')

In [67]:
import sagemaker

### Create model

In [76]:
region = boto3.session.Session().region_name

image_uri = sagemaker.image_uris.retrieve(
    framework='huggingface',
    base_framework_version=f'pytorch{pytorch_version}',
    region=region,
    version=transformers_version,
    py_version=py_version,
    instance_type='ml.m5.large',   # No GPU support on serverless inference
    image_scope='inference'
)

image_uri

'763104351884.dkr.ecr.eu-west-1.amazonaws.com/huggingface-pytorch-inference:1.9.1-transformers4.12.3-cpu-py38-ubuntu20.04'

In [78]:
role = sagemaker.get_execution_role()

create_model_response = sm.create_model(
    ModelName=huggingface_model_name,
    Containers=[
        {
            'Image': image_uri,
            'Mode': 'SingleModel',
            'ModelDataUrl': model_data_url,
            #'Environment': {
            #    'HF_TASK': 'text-classification',
            #    'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'true',
            #    'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
            #}
        }
    ],
    ExecutionRoleArn=role,
)

create_model_response["ModelArn"]

'arn:aws:sagemaker:eu-west-1:587943841427:model/huggingface-serverless-2022-01-09-20-15-26'

### Create endpoint configuration

In [79]:
endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=huggingface_epc_name,
    ProductionVariants=[
        {
            'VariantName': 'single-variant',
            'ModelName': huggingface_model_name,
            
            # opción serverless:
            'ServerlessConfig': {
                'MemorySizeInMB': 6144,
                'MaxConcurrency': 8,
            },
            
            # opción no serverless:
            #'InstanceType': 'ml.t2.xlarge',
            #'InitialInstanceCount': 1,
        },
    ],
)

endpoint_config_response['EndpointConfigArn']

'arn:aws:sagemaker:eu-west-1:587943841427:endpoint-config/huggingface-serverless-epc-2022-01-09-20-15-26'

### Create endpoint

In [80]:
 create_endpoint_response = sm.create_endpoint(
    EndpointName=huggingface_endpoint_name,
    EndpointConfigName=huggingface_epc_name,
)

create_endpoint_response['EndpointArn']

'arn:aws:sagemaker:eu-west-1:587943841427:endpoint/huggingface-serverless-ep-2022-01-09-20-15-26'

In [81]:
waiter = sm.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=huggingface_endpoint_name)

### Invoke endpoint

In [82]:
import threading, time, json

sm_rt = boto3.client(service_name='sagemaker-runtime')

In [84]:
test_data = { 'inputs': 'A cat is an animal.' }

In [None]:
tick = time.time()
response = sm_rt.invoke_endpoint(
            EndpointName=huggingface_endpoint_name,
            Body=json.dumps(test_data),
            ContentType='application/json'
)
tock = time.time()
print(tock-tick)
# print(response["Body"].read())
print(response)

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "No such file or directory (os error 2)"
}
". See https://eu-west-1.console.aws.amazon.com/cloudwatch/home?region=eu-west-1#logEventViewer:group=/aws/sagemaker/Endpoints/huggingface-serverless-ep-2022-01-10-17-21-20 in account 587943841427 for more information.

### La invocación anterior falla.

2022-01-09: En este hilo comentan que puede dar este error cuando el modelo > 512MB:
https://discuss.huggingface.co/t/sagemaker-serverless-inference/13246

2022-01-10: He corregido un error en la creación del fichero `best_model.tar.gz` (se estaba comprimiendo la carpeta y hay que comprimir los ficheros del modelo pero sin la carpeta). Aún así, ahora la invocación da otro error. He probado a cambiar el Endpoint Serverless por un Endpoint no Serverless y el error era el mismo. Finalmente he llegado a la conclusión de que el modelo que guarda la librería SimpleTransformers no es compatible con el modelo que lee la librería Transformers (HuggingFace). Existen 2 posibles soluciones:

1. En lugar de utilizar una imagen de contenedor de HuggingFace, crear una imagen personalizada para el contenedor de inferencia, que cargue el modelo y realice la predicción con la librería SimpleTransformers. Documentación: https://docs.aws.amazon.com/sagemaker/latest/dg/adapt-inference-container.html

2. Reimplementar el entrenamiento del modelo para usar la librería Transformers (HuggingFace), en lugar de SimpleTransformers.

### Por último, eliminamos el endpoint de inferencia que habíamos creado:

In [92]:
sm.delete_endpoint(EndpointName=huggingface_endpoint_name)
sm.delete_endpoint_config(EndpointConfigName=huggingface_epc_name)
sm.delete_model(ModelName=huggingface_model_name)

{'ResponseMetadata': {'RequestId': 'cce67e18-69be-45f5-a3d0-cde696f71523',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'cce67e18-69be-45f5-a3d0-cde696f71523',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Sun, 09 Jan 2022 20:41:33 GMT'},
  'RetryAttempts': 0}}