References:
- https://huggingface.co/learn/nlp-course/chapter4/1?fw=pt

Install the Transformers, Datasets, Evaluate, and Accelerate libraries to run this notebook.

In [1]:
!pip install -q datasets evaluate transformers[sentencepiece] accelerate

In [2]:
import numpy as np

import transformers
import datasets
import torch
import evaluate
import accelerate

# Report the version of each library imported in this cell, one per line,
# so the environment that produced the outputs below is recorded.
print(
    "\n".join(
        f"{label} version: {module.__version__}"
        for label, module in (
            ("Transformers", transformers),
            ("Datasets", datasets),
            ("PyTorch", torch),
            ("Evaluate", evaluate),
            ("Accelerate", accelerate),
            ("NumPy", np),
        )
    )
)

Transformers version: 4.30.2
Datasets version: 2.12.0
PyTorch version: 2.0.1+cu118
Evaluate version: 0.4.0
Accelerate version: 0.20.3
NumPy version: 1.22.4


# Using Pre-trained Models

## Pipeline: fill-mask task

In [3]:
from transformers import pipeline

# Build a fill-mask pipeline on top of the camembert-base checkpoint.
camembert_fill_mask = pipeline(task="fill-mask", model="camembert-base")

# Run it on a sentence containing the <mask> placeholder; the bare last
# expression lets the notebook display the returned candidates.
results = camembert_fill_mask("Le camembert est <mask> :)")
results

[{'score': 0.4909118711948395,
  'token': 7200,
  'token_str': 'délicieux',
  'sequence': 'Le camembert est délicieux :)'},
 {'score': 0.10556942969560623,
  'token': 2183,
  'token_str': 'excellent',
  'sequence': 'Le camembert est excellent :)'},
 {'score': 0.034533195197582245,
  'token': 26202,
  'token_str': 'succulent',
  'sequence': 'Le camembert est succulent :)'},
 {'score': 0.033031314611434937,
  'token': 528,
  'token_str': 'meilleur',
  'sequence': 'Le camembert est meilleur :)'},
 {'score': 0.030076298862695694,
  'token': 1654,
  'token_str': 'parfait',
  'sequence': 'Le camembert est parfait :)'}]

## CamembertTokenizer, CamembertForMaskedLM
- Instantiate the checkpoint using the model architecture directly

In [4]:
from transformers import CamembertForMaskedLM, CamembertTokenizer

# Architecture-specific route: load the camembert-base checkpoint through the
# Camembert classes directly. The two loads are independent of each other.
model = CamembertForMaskedLM.from_pretrained("camembert-base")
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Bare last expression so the notebook shows the model's module structure.
model

CamembertForMaskedLM(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

## AutoTokenizer, AutoModelForMaskedLM
- Prefer the `Auto*` classes instead: they are architecture-agnostic by design, so the same code keeps working when the checkpoint is swapped for another one

In [5]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Same checkpoint as before, but loaded through the Auto* classes, so no
# Camembert-specific class is named here.
model = AutoModelForMaskedLM.from_pretrained("camembert-base")
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

# Sharing Pre-trained Models

## Using the `push_to_hub` API

In [6]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook; this renders an
# interactive widget. NOTE(review): presumably required so the uploads later
# in this section can authenticate — confirm for your environment.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Single source of truth for the checkpoint loaded in this cell.
checkpoint = "camembert-base"

# Load the tokenizer and the masked-LM model from the same checkpoint; these
# are the objects uploaded by the following cells.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

In [8]:
# Upload the model to the "dummy-model" repository on the Hub; the returned
# commit info is displayed as the cell output.
model.push_to_hub(repo_id="dummy-model")

CommitInfo(commit_url='https://huggingface.co/iliyaML/dummy-model/commit/8f4948beab51d26208a715fbf9e546c459b23a6a', commit_message='Upload CamembertForMaskedLM', commit_description='', oid='8f4948beab51d26208a715fbf9e546c459b23a6a', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
# Upload the tokenizer to the same "dummy-model" repository; the returned
# commit info is displayed as the cell output.
tokenizer.push_to_hub(repo_id="dummy-model")

CommitInfo(commit_url='https://huggingface.co/iliyaML/dummy-model/commit/65e33408f2597de62a3235a7f7492b3f60a0f7c2', commit_message='Upload tokenizer', commit_description='', oid='65e33408f2597de62a3235a7f7492b3f60a0f7c2', pr_url=None, pr_revision=None, pr_num=None)

# Dependencies

In [10]:
!pip install -q session-info

In [11]:
import session_info

# Display the session report for this kernel.
# NOTE(review): expected to list the versions of the loaded packages; the
# output is not captured in this notebook — confirm after a fresh run.
session_info.show()