In [1]:
import torch

# Show whether PyTorch can see a CUDA device (the value is the cell output).
torch.cuda.is_available()

True

In [2]:
# Print a warning (execution continues) when PyTorch reports no CUDA device.
if not torch.cuda.is_available():
    print('Error, selecciona una instancia con GPU.')

In [3]:
import numpy as np
import pandas as pd
import os

In [4]:
# Show the kernel's working directory; FOLDER_BASE and FOLDER_OUTPUT are relative paths.
os.getcwd()

'/home/ec2-user/SageMaker'

In [26]:
# Directory the input data (train.csv) is read from.
FOLDER_BASE = './dataset'
# Directory where the train/test splits and the model output folders are written.
FOLDER_OUTPUT = './output'

# S3 bucket that receives the best_model files in the final upload cell.
S3_BUCKET_NAME = 'sagemaker-readability'

In [6]:
# Create the output directory when it is missing. exist_ok=True replaces the
# separate existence check, so there is no gap between checking and creating.
os.makedirs(FOLDER_OUTPUT, exist_ok=True)

In [7]:
!pip install -q simpletransformers==0.51.0

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p37/bin/python -m pip install --upgrade pip' command.[0m


In [8]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AutoTokenizer, AutoModel

In [9]:
# Seed used both for seed_everything() and for model_args.manual_seed.
GLOBAL_SEED = 33

def seed_everything(seed=1234):
    """Seed the Python, NumPy and PyTorch random number generators.

    Also exports ``PYTHONHASHSEED``, switches cuDNN to deterministic mode and
    prints the applied seed as ``SEED: <seed>``.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Imported locally so the function does not depend on a later cell
    # having imported `random` before it is called.
    import random

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible CUDA device, not just the current one; it is a
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Keep the cuDNN autotuner off so kernel selection does not vary per run.
    torch.backends.cudnn.benchmark = False
    print('SEED: ' + str(seed))

In [10]:
class experiment:
  """Settings for a single fine-tuning run.

  Every constructor argument except ``SUBMISSION_NUM`` and ``FOLDER`` is
  stored unchanged on an attribute of the same name. ``SUBMISSION_NUM`` and
  ``FOLDER`` are only used to build the timestamped ``OUTPUT_FOLDER``.
  """

  def __init__(
    self,
    SUBMISSION_NUM,
    FOLDER,
    FORCE_LOWERCASE,
    CONFIG_CLEAN_CHARS,
    REMOVE_0_0,
    REMOVE_STOP_WORDS,
    MODEL_TYPE,
    MODEL_NAME,
    AUTO_TOKENIZER_NAME,
    P_TEST_SIZE,
    P_TRAIN_SIZE,
    LOAD_TRAIN_AND_TEST_SETS,
    FILENAME_INPUT_SUFIX,
    NUM_TRAIN_EPOCHS,
    IS_FP16_ENABLED,
    MAX_SEQ_LENGTH,
    EVAL_BATCH_SIZE,
    TRAIN_BATCH_SIZE,
    CONFIG_GRADIENT_ACCUMULATION_STEPS,
    LEARNING_RATE,
    CONFIG_NO_SAVE_MODEL,
    WARMUP_STEPS=0,
  ):
    """Store the settings and derive the output folder for this run.

    Args:
      SUBMISSION_NUM: Run identifier, embedded in ``OUTPUT_FOLDER``.
      FOLDER: Parent directory of ``OUTPUT_FOLDER``.
      FORCE_LOWERCASE, CONFIG_CLEAN_CHARS, REMOVE_0_0, REMOVE_STOP_WORDS:
        Preprocessing switches.
      MODEL_TYPE: Model family, e.g. 'bert'.
      MODEL_NAME: Model identifier, e.g. 'bert-large-uncased',
        'bert-base-uncased', 'roberta-large'. Also embedded in
        ``OUTPUT_FOLDER``.
      AUTO_TOKENIZER_NAME: Tokenizer identifier.
      P_TEST_SIZE, P_TRAIN_SIZE: Split sizes for test and train.
      LOAD_TRAIN_AND_TEST_SETS: Whether saved splits are loaded from disk.
      FILENAME_INPUT_SUFIX: File-name suffix of the saved splits.
      NUM_TRAIN_EPOCHS, IS_FP16_ENABLED, EVAL_BATCH_SIZE, TRAIN_BATCH_SIZE,
      CONFIG_GRADIENT_ACCUMULATION_STEPS, WARMUP_STEPS: Training settings.
      MAX_SEQ_LENGTH: Maximum sequence length; filled in automatically
        when tokenizing.
      LEARNING_RATE: Learning rate, e.g. 1e-05, 4e-05.
      CONFIG_NO_SAVE_MODEL: Stored for the model's ``no_save`` option.
    """
    # Imported locally so building an experiment does not depend on a
    # later cell having imported `datetime` first.
    from datetime import datetime

    # Text preprocessing switches.
    self.FORCE_LOWERCASE = FORCE_LOWERCASE
    self.CONFIG_CLEAN_CHARS = CONFIG_CLEAN_CHARS
    self.REMOVE_0_0 = REMOVE_0_0
    self.REMOVE_STOP_WORDS = REMOVE_STOP_WORDS

    # Model and tokenizer identifiers.
    self.MODEL_TYPE = MODEL_TYPE
    self.MODEL_NAME = MODEL_NAME
    self.AUTO_TOKENIZER_NAME = AUTO_TOKENIZER_NAME

    # Train/test split configuration.
    self.P_TEST_SIZE = P_TEST_SIZE
    self.P_TRAIN_SIZE = P_TRAIN_SIZE
    self.LOAD_TRAIN_AND_TEST_SETS = LOAD_TRAIN_AND_TEST_SETS
    self.FILENAME_INPUT_SUFIX = FILENAME_INPUT_SUFIX

    # Training hyper-parameters.
    self.NUM_TRAIN_EPOCHS = NUM_TRAIN_EPOCHS
    self.IS_FP16_ENABLED = IS_FP16_ENABLED
    self.MAX_SEQ_LENGTH = MAX_SEQ_LENGTH
    self.EVAL_BATCH_SIZE = EVAL_BATCH_SIZE
    self.TRAIN_BATCH_SIZE = TRAIN_BATCH_SIZE
    self.CONFIG_GRADIENT_ACCUMULATION_STEPS = CONFIG_GRADIENT_ACCUMULATION_STEPS
    self.LEARNING_RATE = LEARNING_RATE
    self.CONFIG_NO_SAVE_MODEL = CONFIG_NO_SAVE_MODEL
    self.WARMUP_STEPS = WARMUP_STEPS

    # The creation timestamp makes the folder name unique per instance.
    dt_string = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
    folder_name = 'out-' + MODEL_NAME + '_' + str(SUBMISSION_NUM) + '_' + dt_string
    self.OUTPUT_FOLDER = os.path.join(FOLDER, folder_name)

  def info(self):
    """Return a one-line tag built from the main settings of this run."""
    parts = [
      self.MODEL_NAME,
      '-N', str(self.NUM_TRAIN_EPOCHS),
      '_LR', str(self.LEARNING_RATE),
      '_lower', str(self.FORCE_LOWERCASE),
      '_cleanChars', str(self.CONFIG_CLEAN_CHARS),
      '_removeStops', str(self.REMOVE_STOP_WORDS),
      '_removeZero', str(self.REMOVE_0_0),
      '_warm', str(self.WARMUP_STEPS),
      '_', str(self.P_TRAIN_SIZE),
      '_', str(self.TRAIN_BATCH_SIZE),
      'x', str(self.EVAL_BATCH_SIZE),
      '_', str(self.MAX_SEQ_LENGTH),
      '_fp16', str(self.IS_FP16_ENABLED),
    ]
    return ''.join(parts)

In [11]:
from datetime import datetime
import random

from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import AutoTokenizer, AutoModel

In [12]:
# Load the training data. Malformed rows raise by default, so the deprecated
# `error_bad_lines=True` argument is not needed.
df_orig = pd.read_csv(os.path.join(FOLDER_BASE, 'train.csv'), sep=",")

In [13]:
# Display the loaded frame (pandas shows a truncated view of the rows).
df_orig

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [14]:
# Identifier of this run; used in the output folder name and in result column names.
SUBMISSION_ID = 1

# Settings for the run executed in the rest of the notebook.
aux_experimento = experiment(
  SUBMISSION_NUM = SUBMISSION_ID,
  FOLDER = FOLDER_OUTPUT,
  
  FORCE_LOWERCASE = False,
  CONFIG_CLEAN_CHARS = False,

  REMOVE_0_0 = False,
  REMOVE_STOP_WORDS = False,

  MODEL_TYPE = 'roberta', # options: 'bert', 'roberta'
  MODEL_NAME = 'roberta-base', # alternatives: '../input/roberta-base', 'bert-large-uncased', 'bert-base-uncased', 'roberta-large'
  AUTO_TOKENIZER_NAME = 'roberta-base', # same as MODEL_NAME (alternative: '../input/roberta-base')

  P_TEST_SIZE = 0.1,
  P_TRAIN_SIZE = 0.9,
  LOAD_TRAIN_AND_TEST_SETS = False,
  FILENAME_INPUT_SUFIX = 'writing-in-the-blog', # only read when LOAD_TRAIN_AND_TEST_SETS is True

  NUM_TRAIN_EPOCHS = 5,
  IS_FP16_ENABLED = False,
  MAX_SEQ_LENGTH = 10000, # placeholder; filled in automatically when tokenizing
  EVAL_BATCH_SIZE = 8,
  TRAIN_BATCH_SIZE = 8,
  CONFIG_GRADIENT_ACCUMULATION_STEPS = 1,
  LEARNING_RATE = 4e-5,

  WARMUP_STEPS = 0,

  CONFIG_NO_SAVE_MODEL = False
)

In [15]:
import statistics
import logging

def prediction_error(labels, preds):
  """Return the mean absolute difference between labels and predictions.

  Values are paired with ``zip``, so surplus items of the longer input are
  ignored. The average is computed with ``statistics.mean``.
  """
  absolute_gaps = (abs(truth - guess) for truth, guess in zip(labels, preds))
  return statistics.mean(absolute_gaps)


# Keep the "transformers" logger at WARNING while the root logger is
# configured at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [16]:
# Seed all random number generators with the notebook-wide seed before splitting and training.
seed_everything(seed=GLOBAL_SEED)

SEED: 33


In [17]:
# Prepare the train/test frames for the experiment: split (or reload),
# optional cleaning steps, column renaming and max sequence length.
one = aux_experimento

if not one.LOAD_TRAIN_AND_TEST_SETS:
  # An explicit random_state makes the split independent of the global NumPy
  # RNG state, so re-running this cell yields the same partition.
  x_train, x_test = train_test_split(
    df_orig,
    test_size=one.P_TEST_SIZE,
    train_size=one.P_TRAIN_SIZE,
    shuffle=True,
    random_state=GLOBAL_SEED,
  )

  now = datetime.now()
  dt_string = now.strftime("%Y-%m-%d_%H_%M_%S")
  suffix = dt_string + '_' + str(one.P_TRAIN_SIZE) + 'x' + str(one.P_TEST_SIZE)

  # Persist the split (original column names) so it can be reloaded later.
  x_train.to_csv(os.path.join(FOLDER_OUTPUT, "x_train_" + suffix + ".csv"), sep=",", index=False)
  x_test.to_csv(os.path.join(FOLDER_OUTPUT, "x_test_" + suffix + ".csv"), sep=",", index=False)

  print('Separado el dataset en train y test. Guardado en disco.')
else:
  # Malformed rows raise by default; the deprecated `error_bad_lines=True`
  # argument is therefore omitted.
  x_train = pd.read_csv(os.path.join(FOLDER_OUTPUT, 'x_train_' + one.FILENAME_INPUT_SUFIX + ".csv"), sep=",")
  x_test = pd.read_csv(os.path.join(FOLDER_OUTPUT, 'x_test_' + one.FILENAME_INPUT_SUFIX + ".csv"), sep=",")
  print('Se ha cargado de disco x_train y x_test.')

# Optionally drop the anomalous rows whose target and standard_error are both 0.
if one.REMOVE_0_0:
  x_train = x_train[(x_train['target'] != 0) | (x_train['standard_error'] != 0)]
  x_test = x_test[(x_test['target'] != 0) | (x_test['standard_error'] != 0)]
  print('Eliminado dato anómalo (0, 0). ' + str(len(x_train)))

# rename() without inplace returns new DataFrames, so the column assignments
# below do not write into slices of another frame (SettingWithCopyWarning).
column_names = {'excerpt': 'text', 'target': 'labels'}
x_train = x_train.rename(columns=column_names)
x_test = x_test.rename(columns=column_names)

if one.FORCE_LOWERCASE:
  x_train['text'] = x_train['text'].str.lower()
  x_test['text'] = x_test['text'].str.lower()
  print('Conversión a minúsculas.')

if one.CONFIG_CLEAN_CHARS:
  # NOTE(review): `clean_chars` and `stops_spaced_only` are not defined in the
  # visible cells; this branch needs them to exist - TODO confirm.
  x_train['text'] = x_train['text'].apply(lambda x: clean_chars(x, stops_spaced_only))
  x_test['text'] = x_test['text'].apply(lambda x: clean_chars(x, stops_spaced_only))
  print('Limpiados los caracteres especiales.')

if one.REMOVE_STOP_WORDS:
  print('Limpiadas las stopwords.')
  # NOTE(review): `stops` is not defined in the visible cells - TODO confirm.
  # The loop variable is deliberately not named `one`: that name holds the
  # experiment object, which is still needed after this loop.
  for stop_word in stops:
    padded_word = ' {} '.format(stop_word)
    x_train['text'] = x_train['text'].str.replace(padded_word, ' ', regex=False)
    x_test['text'] = x_test['text'].str.replace(padded_word, ' ', regex=False)

# Compute the maximum sequence length from the tokenized texts.
tokenizer = AutoTokenizer.from_pretrained(one.AUTO_TOKENIZER_NAME)

x_train['excerpt_tokeniker'] = x_train['text'].apply(tokenizer.tokenize)
x_train['excerpt_tokeniker_len'] = x_train['excerpt_tokeniker'].apply(len)

x_test['excerpt_tokeniker'] = x_test['text'].apply(tokenizer.tokenize)
x_test['excerpt_tokeniker_len'] = x_test['excerpt_tokeniker'].apply(len)

# Longest tokenized text across train and eval, plus 2 (presumably for the
# tokenizer's special tokens - TODO confirm). Stored as a built-in int.
longest_tokenized = max(x_train['excerpt_tokeniker_len'].max(), x_test['excerpt_tokeniker_len'].max())
one.MAX_SEQ_LENGTH = int(longest_tokenized) + 2
print('MAX_SEQ_LENGTH (from Train & Eval): ' + str(one.MAX_SEQ_LENGTH))


print('==> ' + str(SUBMISSION_ID) + ': ' + one.info())

Separado el dataset en train y test. Guardado en disco.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


MAX_SEQ_LENGTH (from Train & Eval): 322
==> 1: roberta-base-N5_LR4e-05_lowerFalse_cleanCharsFalse_removeStopsFalse_removeZeroFalse_warm0_0.9_8x8_322_fp16False


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [18]:
# Build the ClassificationArgs object from the experiment settings.
model_args = ClassificationArgs()

run_settings = {
  # Task setup and evaluation during training.
  'num_train_epochs': one.NUM_TRAIN_EPOCHS,
  'output_dir': one.OUTPUT_FOLDER,
  'regression': True,
  'evaluate_during_training': True,
  'evaluate_during_training_verbose': True,
  # Precision, sequence length and batch sizes.
  'fp16': one.IS_FP16_ENABLED,
  'max_seq_length': one.MAX_SEQ_LENGTH,
  'eval_batch_size': one.EVAL_BATCH_SIZE,
  'train_batch_size': one.TRAIN_BATCH_SIZE,
  'gradient_accumulation_steps': one.CONFIG_GRADIENT_ACCUMULATION_STEPS,
  # Optimisation.
  'learning_rate': one.LEARNING_RATE,
  'warmup_steps': one.WARMUP_STEPS,
  # Saving: the best model goes to a sub-folder of the run's output folder.
  'no_save': one.CONFIG_NO_SAVE_MODEL,
  'best_model_dir': os.path.join(one.OUTPUT_FOLDER, 'best_model'),
  # Fixed seed for reproducibility.
  'manual_seed': GLOBAL_SEED,
}
for option_name, option_value in run_settings.items():
  setattr(model_args, option_name, option_value)

# Weights & Biases logging is left disabled:
# model_args.wandb_project = WANDB_PROJECT_NAME
# model_args.wandb_kwargs['name'] = one.info()

print(model_args)

ClassificationArgs(adam_epsilon=1e-08, best_model_dir='./output/out-roberta-base_1_2021-10-28_18_29_39/best_model', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=2, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, adafactor_eps=(1e-30, 0.001), adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_beta1=None, adafactor_scale_parameter=True, adafactor_relative_step=True, adafactor_warmup_init=True, eval_batch_size=8, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=2000, evaluate_during_training_verbose=True, evaluate_each_epoch=True, fp16=False, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, manual_seed=33, max_grad_nor

In [19]:
# Build the model from the configured type/name with a single output label
# (model_args.regression is set to True).
model = ClassificationModel(
    one.MODEL_TYPE,
    one.MODEL_NAME,
    num_labels=1,
    args=model_args
)

# Train on x_train and evaluate on x_test during training; prediction_error is
# passed as an additional metric. The return value is shown as the cell output.
model.train_model(x_train, eval_df=x_test, show_running_loss=True, verbose=True, prediction_error=prediction_error)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_p

  0%|          | 0/2550 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.4972742662001008, 'eval_loss': 0.37080445513129234}


Running Epoch 1 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.5295451188551813, 'eval_loss': 0.42812980131970513}


Running Epoch 2 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.6533960971832079, 'eval_loss': 0.6184557212723626}


Running Epoch 3 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.4459620239672212, 'eval_loss': 0.3230808104077975}


Running Epoch 4 of 5:   0%|          | 0/319 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.
INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.47813418899616883, 'eval_loss': 0.36507454328238964}
INFO:simpletransformers.classification.classification_model: Training of roberta model complete. Saved to ./output/out-roberta-base_1_2021-10-28_18_29_39.


(1595,
 {'global_step': [319, 638, 957, 1276, 1595],
  'train_loss': [0.3251998722553253,
   0.07599388808012009,
   0.3134761154651642,
   0.0371391735970974,
   0.12416594475507736],
  'eval_loss': [0.37080445513129234,
   0.42812980131970513,
   0.6184557212723626,
   0.3230808104077975,
   0.36507454328238964],
  'prediction_error': [0.4972742662001008,
   0.5295451188551813,
   0.6533960971832079,
   0.4459620239672212,
   0.47813418899616883]})

In [20]:
# Evaluate the in-memory model on x_test. No extra metric is passed here, so
# the displayed `result` only contains eval_loss.
result, model_outputs, wrong_predictions = model.eval_model(x_test, verbose=True)
result

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/284 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/36 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.47813418899616883, 'eval_loss': 0.36507454328238964}


{'eval_loss': 0.36507454328238964}

In [21]:
STR_SUBMISSION_ID = str(SUBMISSION_ID)

# Every column name carries the submission id as a suffix.
labels_col = 'labels_' + STR_SUBMISSION_ID
prediction_col = 'prediction_' + STR_SUBMISSION_ID
abs_error_col = 'prediction_error_abs_' + STR_SUBMISSION_ID

# Collect text, true label and standard error of the evaluation rows.
solutions = pd.DataFrame()
solutions['text_' + STR_SUBMISSION_ID] = x_test['text']
solutions[labels_col] = x_test['labels']
solutions['standard_error_' + STR_SUBMISSION_ID] = x_test['standard_error']

# model_outputs comes from eval_model(x_test); it is assigned as-is.
solutions[prediction_col] = model_outputs
# Absolute error per row, computed column-wise instead of with a row-wise apply.
solutions[abs_error_col] = (solutions[labels_col] - solutions[prediction_col]).abs()

print('(ABS) Mean prediction error for this experiment: ' + str(solutions[abs_error_col].mean()))

(ABS) Mean prediction error for this experiment: 0.4781341878344585


In [23]:
# Load the trained model back from disk: the 'best_model' sub-folder of the
# run's output folder (the path set as model_args.best_model_dir).
model_loaded = ClassificationModel(
    one.MODEL_TYPE,
    os.path.join(one.OUTPUT_FOLDER, 'best_model'),
    num_labels=1,
    args=model_args
)

In [24]:
# Evaluate the reloaded model on x_test, this time also passing prediction_error as a metric.
result, model_outputs, wrong_predictions = model_loaded.eval_model(x_test, verbose=True, prediction_error=prediction_error)

INFO:simpletransformers.classification.classification_model: Converting to features started. Cache is not used.


  0%|          | 0/284 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/36 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_model:{'prediction_error': 0.4459620239672212, 'eval_loss': 0.3230808104077975}


In [25]:
# Display the metrics returned by the evaluation of the reloaded model.
result

{'prediction_error': 0.4459620239672212, 'eval_loss': 0.3230808104077975}

In [27]:
import boto3, os

# Upload every file under the run's best_model folder to the S3 bucket.
BEST_DIR = os.path.join(one.OUTPUT_FOLDER, 'best_model')

# One bucket handle is reused for all uploads.
bucket = boto3.Session().resource('s3').Bucket(S3_BUCKET_NAME)

for (dirpath, dirnames, filenames) in os.walk(BEST_DIR):
    # The loop variable is named `filename`, not `one`: `one` holds the
    # experiment object read above, and keeping it intact lets this cell be
    # re-run.
    for filename in filenames:
        path = os.path.join(dirpath, filename)
        print('enviando a S3: ' + path)
        # The local path (including its leading './') is used as the object key.
        bucket.Object(path).upload_file(path)
print('Fin :)')

enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/training_args.bin
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/eval_results.txt
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/model_args.json
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/config.json
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/optimizer.pt
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/tokenizer_config.json
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/vocab.json
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/pytorch_model.bin
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/scheduler.pt
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/special_tokens_map.json
enviando a S3: ./output/out-roberta-base_1_2021-10-28_18_29_39/best_model/merges.txt
Fin :)
