In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/image-classification-hugging-face/huggingface_image_classification.png


In [2]:
import huggingface_hub
huggingface_hub.__version__

'0.26.2'

In [3]:
# pip install huggingface_hub==0.23.5 # ModelFilter is deprecated in newer versions

In [4]:
from huggingface_hub import HfApi
from transformers import AutoModel


## HuggingFace Models

In [5]:
api = HfApi()
models = api.list_models(
    # filter = ModelFilter(task = 'text-classification'), ## deprecated from v>=0.24
    task = 'text-classification',
    sort = "downloads",
    direction= -1,
    limit = 5
)
list(models)[:2]

[ModelInfo(id='distilbert/distilbert-base-uncased-finetuned-sst-2-english', author=None, sha=None, created_at=datetime.datetime(2022, 3, 2, 23, 29, 4, tzinfo=datetime.timezone.utc), last_modified=None, private=False, disabled=None, downloads=9698401, downloads_all_time=None, gated=None, gguf=None, inference=None, likes=641, library_name='transformers', tags=['transformers', 'pytorch', 'tf', 'rust', 'onnx', 'safetensors', 'distilbert', 'text-classification', 'en', 'dataset:sst2', 'dataset:glue', 'arxiv:1910.01108', 'doi:10.57967/hf/0181', 'license:apache-2.0', 'model-index', 'autotrain_compatible', 'endpoints_compatible', 'region:us'], pipeline_tag='text-classification', mask_token=None, card_data=None, widget_data=None, model_index=None, config=None, transformers_info=None, trending_score=None, siblings=None, spaces=None, safetensors=None, security_repo_status=None),
 ModelInfo(id='papluca/xlm-roberta-base-language-detection', author=None, sha=None, created_at=datetime.datetime(2022, 3

In [6]:
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(model_id)
model.save_pretrained(save_directory=f"model/{model_id}")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

### DistilBERT example

In [7]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer("His cruel, killer dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

'POSITIVE'

## HuggingFace datasets

- Datasets on HuggingFace follow Apache Arrow data format i.e. data is stored in columnar format instead of traditional row format.

In [8]:
import datasets
from datasets import load_dataset_builder, load_dataset

In [9]:
data_builder= load_dataset_builder("imdb")
print(data_builder.info)
print("$$$")
print(data_builder.info.description)
print("$$$")
print(data_builder.info.features)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='imdb', config_name='plain_text', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=33432823, num_examples=25000, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=32650685, num_examples=25000, shard_lengths=None, dataset_name=None), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106794, num_examples=50000, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=83446840, post_processing_size=None, dataset_size=133190302, size_in_bytes=None)
$$$

$$$
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


### Load and save data

In [10]:
ds = load_dataset("imdb", split="train")
ds.save_to_disk('/kaggle/output')

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [11]:
imdb = datasets.load_from_disk('/kaggle/output')
filtered_imdb = imdb.filter(lambda row : row['label']==0)

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [12]:
## Attributes of hugging face dataset
dir(filtered_imdb)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contigu

### sliced data

In [13]:
filtered_imdb.select(range(3))

Dataset({
    features: ['text', 'label'],
    num_rows: 3
})

## HuggingFace Tokenizers

Tokenization involves:
- Normalization
    - Cleaning text
    - Removing whitespace
    - Lowercases
- Pre-tokenization
  - Split text into smaller tokens
  - Pre-tokekenization methods:
    - White Space Tokenization 
    - Punctuation Splitting
    - Byte Pair Encoding (BPE) => Compresses text by merging most frequent pair of characters or subwords iteratively.
    - Morphological Tokenization => splits words based on morphemes (roots, prefixes, suffixes)
    - SentencePiece => Generates subword units or characters based on input text. It treats the entire text corpus as a single sequence.
    - Rule Based Tokenization
    - Character Level Tokenization
- Tokenization model

Available HuggingFace Tokenizers:
- Byte-Pair Encoding
- WordPiece
- Unigram
- ...

In [14]:
from transformers import AutoTokenizer
input = "Höw's the Jösh, todäy?"

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer.backend_tokenizer.normalizer.normalize_str(input))  # Change ö -> o and lowercase 
print(tokenizer.tokenize(input))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

how's the josh, today?
['how', "'", 's', 'the', 'josh', ',', 'today', '?']


In [15]:
from transformers import DistilBertTokenizer

input = "Höw's the Jösh, todäy?"

distil_tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
distil_tokens = distil_tokenizer.tokenize(text=input)
print(distil_tokens)

['how', "'", 's', 'the', 'josh', ',', 'today', '?']


In [16]:
from transformers import GPT2Tokenizer
input = "Höw's the Jösh, todäy?"

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_tokens = gpt_tokenizer.tokenize(text = input)
print(gpt_tokens)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

['H', 'Ã¶', 'w', "'s", 'Ġthe', 'ĠJ', 'Ã¶', 'sh', ',', 'Ġto', 'd', 'Ã¤', 'y', '?']


## HuggingFace Pipelines

Available tasks with pipelines:
1. 'audio-classification'
2. 'automatic-speech-recognition'
3. 'conversational'
4. 'depth-estimation'
5. 'document-question-answering'
6. 'feature-extraction'
7. 'fill-mask'
8. 'image-classification'
9. 'image-segmentation'
10. 'image-to-text'
11. 'mask-generation'
12. 'ner'
13. 'object-detection'
14. 'question-answering'
15. 'sentiment-analysis'
16. 'summarization'
17. 'table-question-answering'
18. 'text-classification'
19. 'text-generation'
20. 'text2text-generation'
21. 'token-classification'
22. 'translation'
23. 'video-classification'
24. 'visual-question-answering'
25. 'vqa'
26. 'zero-shot-audio-classification'
27. 'zero-shot-classification'
28. 'zero-shot-image-classification'
29. 'zero-shot-object-detection'
30. 'translation_XX_to_YY'


In [17]:
from transformers import pipeline

### Sentiment Analysis

In [18]:
distil_pipeline = pipeline(
                    task = "sentiment-analysis",
                    model = "distilbert-base-uncased-finetuned-sst-2-english" 
                )
distil_pipeline("This is awesome practice session!")

[{'label': 'POSITIVE', 'score': 0.9998577833175659}]

### Text Classification

In [19]:
distil_pipeline = pipeline(
                    task = "text-classification"
                )

text = "Hugging Face, Inc. is an American company incorporated under the Delaware General Corporation Law[1] and based in New York City."
distil_pipeline(text)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'NEGATIVE', 'score': 0.9006103277206421}]

In [20]:
zero_short_classifier = pipeline(
                    task = "zero-shot-classification",
                    model = "facebook/bart-large-mnli"
                )

text = "Hugging Face, Inc. is an American company incorporated under the Delaware General Corporation Law[1] and based in New York City that develops computation tools for building applications using machine learning."
labels = ['Technology', 'Bussiness', 'Sports']
zero_short_classifier(text, labels)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'Hugging Face, Inc. is an American company incorporated under the Delaware General Corporation Law[1] and based in New York City that develops computation tools for building applications using machine learning.',
 'labels': ['Technology', 'Bussiness', 'Sports'],
 'scores': [0.8955796360969543, 0.06598116457462311, 0.03843914717435837]}

### Text Summarization

In [21]:
summarizer = pipeline(
    task = "summarization",
    model = "sshleifer/distilbart-cnn-12-6",
    min_length = 20,
    max_length = 50  ## input text length should be greater than output summary text length 
)

text = """Founded in 2022, Perplexity generates answers using sources from the web and cites links within the text response.[3] 
          Perplexity works on a freemium model; the free product uses the company's standalone LLM based on GPT-3.5 with browsing[4], 
          while the paid version Perplexity Pro has access to GPT-4, Claude 3.5, Grok-2, Llama 3 and in-house Perplexity LLMs."""

summary_text = summarizer(text)
summary_text

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

[{'summary_text': " Founded in 2022, Perplexity generates answers using sources from the web and cites links within the text response . The free product uses the company's standalone LLM based on GPT-3.5 with browsing[4] while"}]

### Image classification

In [22]:
from transformers import image_transforms
from PIL import Image

In [23]:
image_classifier = pipeline(
    task = "image-classification",
    model = "abhishek/autotrain_fashion_mnist_vit_base"
)

image = Image.open("/kaggle/input/image-classification-hugging-face/huggingface_image_classification.png")
results = image_classifier(image)
results

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

[{'label': 'Bag', 'score': 0.8303866982460022},
 {'label': 'T - shirt / top', 'score': 0.7647987604141235},
 {'label': 'Ankle boot', 'score': 0.7544252872467041},
 {'label': 'Trouser', 'score': 0.6787008047103882},
 {'label': 'Sandal', 'score': 0.5785821080207825}]