# Development

In [9]:
import copy

from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM, Trainer
from transformers import pipeline

from model.config import DISTILBERT_BASE_UNCASED, training_args
from model.data_collator import data_collator
from model.data_split import train_sentences
from model.tokenizer import tokenize_sentences, tokenizer

In [10]:
# Build a tiny train/eval split for a fast smoke test of the training loop.
# Only the first MINI_SAMPLE_SIZE sentences are used, and the split is seeded
# so that re-running the notebook on a fresh kernel yields the same partition
# (and therefore comparable losses).
MINI_SAMPLE_SIZE = 100
MINI_SPLIT_SEED = 42

mini_train_sentences, mini_eval_sentences = train_test_split(
    train_sentences[:MINI_SAMPLE_SIZE],
    random_state=MINI_SPLIT_SEED,
)

# Tokenize each side with the project helper; its output is what the Trainer
# receives as train/eval datasets further down.
mini_train_tokenized = tokenize_sentences(mini_train_sentences)
mini_eval_tokenized = tokenize_sentences(mini_eval_sentences)

In [11]:
# Load the pretrained masked-LM checkpoint that the mini run fine-tunes.
mini_model = AutoModelForMaskedLM.from_pretrained(DISTILBERT_BASE_UNCASED)

# Work on a shallow copy of the shared training arguments so that pointing the
# output directory at this experiment does not mutate the object exported by
# model.config (which other code in the same kernel may also import).
mini_training_args = copy.copy(training_args)
mini_training_args.output_dir = "mini_model_training"

# Trainer wired to the mini datasets and the project's data collator.
mini_trainer = Trainer(
    model=mini_model,
    args=mini_training_args,
    train_dataset=mini_train_tokenized,
    eval_dataset=mini_eval_tokenized,
    data_collator=data_collator,
)

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jacksonargo/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.5",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jacksonargo/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0

In [12]:
# Fine-tune on the mini split, then persist the resulting model so it can be
# reloaded without repeating the training run.
mini_checkpoint_dir = "mini_trainer_checkpoint"

mini_trainer.train()
mini_trainer.save_model(mini_checkpoint_dir)

***** Running training *****
  Num examples = 75
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 25


Epoch,Training Loss,Validation Loss
1,No log,3.353678
2,No log,3.390263
3,No log,3.340618
4,No log,3.321254
5,No log,4.182182


***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to mini_trainer_checkpoint
Configuration saved in mini_trainer_checkpoint/config.json
Model weights saved in mini_trainer_checkpoint/pytorch_model.bin


In [13]:
# Score the trained model on the mini eval split and display the metrics dict.
mini_eval_metrics = mini_trainer.evaluate()
mini_eval_metrics

***** Running Evaluation *****
  Num examples = 25
  Batch size = 16


{'eval_loss': 3.3476290702819824,
 'eval_runtime': 0.5297,
 'eval_samples_per_second': 47.199,
 'eval_steps_per_second': 3.776,
 'epoch': 5.0}

In [14]:
# Qualitative sanity check: wrap the fine-tuned model in a fill-mask pipeline
# and inspect its top predictions for a generic prompt.
# (`pipeline` is imported in the notebook's top import cell.)
unmasker = pipeline('fill-mask', model=mini_model, tokenizer=tokenizer)
unmasker("Hello I'm a [MASK] model.")

[{'sequence': "hello i'm a role model.",
  'score': 0.0874275416135788,
  'token': 2535,
  'token_str': 'r o l e'},
 {'sequence': "hello i'm a business model.",
  'score': 0.04861392825841904,
  'token': 2449,
  'token_str': 'b u s i n e s s'},
 {'sequence': "hello i'm a new model.",
  'score': 0.03383750095963478,
  'token': 2047,
  'token_str': 'n e w'},
 {'sequence': "hello i'm a fashion model.",
  'score': 0.029913591220974922,
  'token': 4827,
  'token_str': 'f a s h i o n'},
 {'sequence': "hello i'm a model model.",
  'score': 0.022853940725326538,
  'token': 2944,
  'token_str': 'm o d e l'}]

In [15]:
# Second qualitative probe: an audio-themed prompt for the fill-mask pipeline.
compressor_prompt = "A [MASK] tries to compress sounds that are too loud."
unmasker(compressor_prompt)

[{'sequence': 'a speaker tries to compress sounds that are too loud.',
  'score': 0.221269890666008,
  'token': 5882,
  'token_str': 's p e a k e r'},
 {'sequence': 'a player tries to compress sounds that are too loud.',
  'score': 0.05172230675816536,
  'token': 2447,
  'token_str': 'p l a y e r'},
 {'sequence': 'a person tries to compress sounds that are too loud.',
  'score': 0.04319479689002037,
  'token': 2711,
  'token_str': 'p e r s o n'},
 {'sequence': 'a user tries to compress sounds that are too loud.',
  'score': 0.04149453341960907,
  'token': 5310,
  'token_str': 'u s e r'},
 {'sequence': 'a microphone tries to compress sounds that are too loud.',
  'score': 0.03399374708533287,
  'token': 15545,
  'token_str': 'm i c r o p h o n e'}]