# Development

In [1]:
import sys

# Put the parent directory on the import path; the `model.*` imports in the
# next cell presumably live there (TODO confirm the notebook's location).
# The membership check keeps the cell idempotent: re-running it no longer
# piles duplicate entries onto sys.path.
PROJECT_ROOT = "../"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import copy

from sklearn.model_selection import train_test_split
from transformers import AutoModelForMaskedLM, Trainer, pipeline

from model.config import DISTILBERT_BASE_UNCASED, training_args
from model.data_collator import data_collator
from model.data_split import train_sentences
from model.tokenizer import tokenize_sentences, tokenizer

In [3]:
# Smoke-test subset: only the first 100 training sentences are used.
# No test_size is given, so scikit-learn's default hold-out applies (the
# recorded run logged 75 training and 25 evaluation examples).
# random_state pins the shuffle so a fresh kernel reproduces the same split.
SPLIT_SEED = 42
mini_train_sentences, mini_eval_sentences = train_test_split(
    train_sentences[:100],
    random_state=SPLIT_SEED,
)

# Tokenize each side of the split with the project's helper.
mini_train_tokenized = tokenize_sentences(mini_train_sentences)
mini_eval_tokenized = tokenize_sentences(mini_eval_sentences)

In [4]:
# Start from the pretrained masked-language-model checkpoint named in the
# project config.
mini_model = AutoModelForMaskedLM.from_pretrained(DISTILBERT_BASE_UNCASED)

# Work on a private shallow copy of the shared training arguments so that the
# module-level `training_args` object from model.config is not mutated; only
# this experiment's output directory is redirected.
mini_training_args = copy.copy(training_args)
mini_training_args.output_dir = "mini_model_training"

mini_trainer = Trainer(
    model=mini_model,
    args=mini_training_args,
    train_dataset=mini_train_tokenized,
    eval_dataset=mini_eval_tokenized,
    data_collator=data_collator,
)

In [5]:
# Directory that receives the trained model once the run finishes.
MINI_CHECKPOINT_DIR = "mini_trainer_checkpoint"

# Run the training loop, then persist the resulting model.
mini_trainer.train()
mini_trainer.save_model(MINI_CHECKPOINT_DIR)

***** Running training *****
  Num examples = 75
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 25


Epoch,Training Loss,Validation Loss
1,No log,3.227912
2,No log,2.971653
3,No log,3.08518
4,No log,2.910129
5,No log,3.589861


***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16
***** Running Evaluation *****
  Num examples = 25
  Batch size = 16


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to mini_trainer_checkpoint
Configuration saved in mini_trainer_checkpoint/config.json
Model weights saved in mini_trainer_checkpoint/pytorch_model.bin


In [6]:
# Score the trained model on the held-out split and show the metrics dict.
# NOTE(review): in the recorded run this loss (2.92) differs from the epoch-5
# validation loss logged during training (3.59); presumably the collator draws
# fresh random masks on every pass -- TODO confirm against model.data_collator.
final_eval_metrics = mini_trainer.evaluate()
final_eval_metrics

***** Running Evaluation *****
  Num examples = 25
  Batch size = 16


{'eval_loss': 2.918678045272827,
 'eval_runtime': 0.6,
 'eval_samples_per_second': 41.664,
 'eval_steps_per_second': 3.333,
 'epoch': 5.0}

In [7]:
# Wrap the in-memory fine-tuned model (not the saved checkpoint) and the
# project tokenizer in a fill-mask pipeline. `pipeline` is imported in the
# notebook's import cell alongside the other transformers names.
unmasker = pipeline("fill-mask", model=mini_model, tokenizer=tokenizer)

# Quick sanity probe: the pipeline returns ranked candidates for [MASK].
unmasker("Hello I'm a [MASK] model.")

[{'sequence': "hello i'm a role model.",
  'score': 0.07971620559692383,
  'token': 2535,
  'token_str': 'r o l e'},
 {'sequence': "hello i'm a business model.",
  'score': 0.06690440326929092,
  'token': 2449,
  'token_str': 'b u s i n e s s'},
 {'sequence': "hello i'm a good model.",
  'score': 0.035355184227228165,
  'token': 2204,
  'token_str': 'g o o d'},
 {'sequence': "hello i'm a new model.",
  'score': 0.032671455293893814,
  'token': 2047,
  'token_str': 'n e w'},
 {'sequence': "hello i'm a fashion model.",
  'score': 0.03203974664211273,
  'token': 4827,
  'token_str': 'f a s h i o n'}]

In [8]:
# Second probe, using an audio-themed sentence.
compressor_prompt = "A [MASK] tries to compress sounds that are too loud."
unmasker(compressor_prompt)

[{'sequence': 'a person tries to compress sounds that are too loud.',
  'score': 0.12533654272556305,
  'token': 2711,
  'token_str': 'p e r s o n'},
 {'sequence': 'a speaker tries to compress sounds that are too loud.',
  'score': 0.05781891942024231,
  'token': 5882,
  'token_str': 's p e a k e r'},
 {'sequence': 'a musician tries to compress sounds that are too loud.',
  'score': 0.05304500088095665,
  'token': 5455,
  'token_str': 'm u s i c i a n'},
 {'sequence': 'a user tries to compress sounds that are too loud.',
  'score': 0.052762314677238464,
  'token': 5310,
  'token_str': 'u s e r'},
 {'sequence': 'a player tries to compress sounds that are too loud.',
  'score': 0.034627556800842285,
  'token': 2447,
  'token_str': 'p l a y e r'}]