In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/image-classification-hugging-face/huggingface_image_classification.png
/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3


In [2]:
import huggingface_hub
huggingface_hub.__version__

'0.26.2'

In [3]:
# pip install huggingface_hub==0.23.5 # ModelFilter is deprecated in newer versions

In [4]:
from huggingface_hub import HfApi
from transformers import AutoModel


## HuggingFace Models

In [5]:
api = HfApi()
models = api.list_models(
    # filter = ModelFilter(task = 'text-classification'), ## deprecated from v>=0.24
    task = 'text-classification',
    sort = "downloads",
    direction= -1,
    limit = 5
)
list(models)[:2]

[ModelInfo(id='distilbert/distilbert-base-uncased-finetuned-sst-2-english', author=None, sha=None, created_at=datetime.datetime(2022, 3, 2, 23, 29, 4, tzinfo=datetime.timezone.utc), last_modified=None, private=False, disabled=None, downloads=9698401, downloads_all_time=None, gated=None, gguf=None, inference=None, likes=642, library_name='transformers', tags=['transformers', 'pytorch', 'tf', 'rust', 'onnx', 'safetensors', 'distilbert', 'text-classification', 'en', 'dataset:sst2', 'dataset:glue', 'arxiv:1910.01108', 'doi:10.57967/hf/0181', 'license:apache-2.0', 'model-index', 'autotrain_compatible', 'endpoints_compatible', 'region:us'], pipeline_tag='text-classification', mask_token=None, card_data=None, widget_data=None, model_index=None, config=None, transformers_info=None, trending_score=None, siblings=None, spaces=None, safetensors=None, security_repo_status=None),
 ModelInfo(id='papluca/xlm-roberta-base-language-detection', author=None, sha=None, created_at=datetime.datetime(2022, 3

In [6]:
model_id = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(model_id)
model.save_pretrained(save_directory=f"model/{model_id}")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

### DistilBERT example

In [7]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

inputs = tokenizer("His cruel, killer dog is cute", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

'POSITIVE'

## HuggingFace datasets

- Datasets on HuggingFace follow Apache Arrow data format i.e. data is stored in columnar format instead of traditional row format.

In [8]:
import datasets
from datasets import load_dataset_builder, load_dataset

In [9]:
data_builder= load_dataset_builder("imdb")
print(data_builder.info)
print("$$$")
print(data_builder.info.description)
print("$$$")
print(data_builder.info.features)

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='imdb', config_name='plain_text', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=33432823, num_examples=25000, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=32650685, num_examples=25000, shard_lengths=None, dataset_name=None), 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106794, num_examples=50000, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=83446840, post_processing_size=None, dataset_size=133190302, size_in_bytes=None)
$$$

$$$
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}


### Load and save data

In [10]:
ds = load_dataset("imdb", split="train")
ds.save_to_disk('/kaggle/output')

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [11]:
imdb = datasets.load_from_disk('/kaggle/output')
filtered_imdb = imdb.filter(lambda row : row['label']==0)

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [12]:
## Attributes of hugging face dataset
dir(filtered_imdb)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getitems__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_generate_tables_from_cache_file',
 '_generate_tables_from_shards',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contigu

### sliced data

In [13]:
filtered_imdb.select(range(3))

Dataset({
    features: ['text', 'label'],
    num_rows: 3
})

## HuggingFace Tokenizers

Tokenization involves:
- Normalization
    - Cleaning text
    - Removing whitespace
    - Lowercases
- Pre-tokenization
  - Split text into smaller tokens
  - Pre-tokekenization methods:
    - White Space Tokenization 
    - Punctuation Splitting
    - Byte Pair Encoding (BPE) => Compresses text by merging most frequent pair of characters or subwords iteratively.
    - Morphological Tokenization => splits words based on morphemes (roots, prefixes, suffixes)
    - SentencePiece => Generates subword units or characters based on input text. It treats the entire text corpus as a single sequence.
    - Rule Based Tokenization
    - Character Level Tokenization
- Tokenization model

Available HuggingFace Tokenizers:
- Byte-Pair Encoding
- WordPiece
- Unigram
- ...

In [14]:
from transformers import AutoTokenizer
input = "Höw's the Jösh, todäy?"

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print(tokenizer.backend_tokenizer.normalizer.normalize_str(input))  # Change ö -> o and lowercase 
print(tokenizer.tokenize(input))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

how's the josh, today?
['how', "'", 's', 'the', 'josh', ',', 'today', '?']


In [15]:
from transformers import DistilBertTokenizer

input = "Höw's the Jösh, todäy?"

distil_tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
distil_tokens = distil_tokenizer.tokenize(text=input)
print(distil_tokens)

['how', "'", 's', 'the', 'josh', ',', 'today', '?']


In [16]:
from transformers import GPT2Tokenizer
input = "Höw's the Jösh, todäy?"

gpt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_tokens = gpt_tokenizer.tokenize(text = input)
print(gpt_tokens)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

['H', 'Ã¶', 'w', "'s", 'Ġthe', 'ĠJ', 'Ã¶', 'sh', ',', 'Ġto', 'd', 'Ã¤', 'y', '?']


## HuggingFace Pipelines

Available tasks with pipelines:
1. 'audio-classification'
2. 'automatic-speech-recognition'
3. 'conversational'
4. 'depth-estimation'
5. 'document-question-answering'
6. 'feature-extraction'
7. 'fill-mask'
8. 'image-classification'
9. 'image-segmentation'
10. 'image-to-text'
11. 'mask-generation'
12. 'ner'
13. 'object-detection'
14. 'question-answering'
15. 'sentiment-analysis'
16. 'summarization'
17. 'table-question-answering'
18. 'text-classification'
19. 'text-generation'
20. 'text2text-generation'
21. 'token-classification'
22. 'translation'
23. 'video-classification'
24. 'visual-question-answering'
25. 'vqa'
26. 'zero-shot-audio-classification'
27. 'zero-shot-classification'
28. 'zero-shot-image-classification'
29. 'zero-shot-object-detection'
30. 'translation_XX_to_YY'


In [17]:
from transformers import pipeline

### Sentiment Analysis

In [18]:
distil_pipeline = pipeline(
                    task = "sentiment-analysis",
                    model = "distilbert-base-uncased-finetuned-sst-2-english" 
                )
distil_pipeline("This is awesome practice session!")

[{'label': 'POSITIVE', 'score': 0.9998577833175659}]

### Text Classification

In [19]:
distil_pipeline = pipeline(
                    task = "text-classification"
                )

text = "Hugging Face, Inc. is an American company incorporated under the Delaware General Corporation Law[1] and based in New York City."
distil_pipeline(text)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'NEGATIVE', 'score': 0.9006103277206421}]

In [20]:
zero_short_classifier = pipeline(
                    task = "zero-shot-classification",
                    model = "facebook/bart-large-mnli"
                )

text = "Hugging Face, Inc. is an American company incorporated under the Delaware General Corporation Law[1] and based in New York City that develops computation tools for building applications using machine learning."
labels = ['Technology', 'Bussiness', 'Sports']
zero_short_classifier(text, labels)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'Hugging Face, Inc. is an American company incorporated under the Delaware General Corporation Law[1] and based in New York City that develops computation tools for building applications using machine learning.',
 'labels': ['Technology', 'Bussiness', 'Sports'],
 'scores': [0.8955796360969543, 0.06598116457462311, 0.03843914717435837]}

### Text Summarization

In [21]:
summarizer = pipeline(
    task = "summarization",
    model = "sshleifer/distilbart-cnn-12-6",
    min_length = 20,
    max_length = 50  ## input text length should be greater than output summary text length 
)

text = """Founded in 2022, Perplexity generates answers using sources from the web and cites links within the text response.[3] 
          Perplexity works on a freemium model; the free product uses the company's standalone LLM based on GPT-3.5 with browsing[4], 
          while the paid version Perplexity Pro has access to GPT-4, Claude 3.5, Grok-2, Llama 3 and in-house Perplexity LLMs."""

summary_text = summarizer(text)
summary_text

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

[{'summary_text': " Founded in 2022, Perplexity generates answers using sources from the web and cites links within the text response . The free product uses the company's standalone LLM based on GPT-3.5 with browsing[4] while"}]

### Image classification

In [22]:
from transformers import image_transforms
from PIL import Image

In [23]:
image_classifier = pipeline(
    task = "image-classification",
    model = "abhishek/autotrain_fashion_mnist_vit_base"
)

image = Image.open("/kaggle/input/image-classification-hugging-face/huggingface_image_classification.png")
results = image_classifier(image)
results

config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

[{'label': 'Bag', 'score': 0.8303866982460022},
 {'label': 'T - shirt / top', 'score': 0.7647987604141235},
 {'label': 'Ankle boot', 'score': 0.7544252872467041},
 {'label': 'Trouser', 'score': 0.6787008047103882},
 {'label': 'Sandal', 'score': 0.5785821080207825}]

### Multi Modal QA

#### Image QA

In [24]:
vqa = pipeline(
    task = "visual-question-answering",
    model = "dandelin/vilt-b32-finetuned-vqa"
)

result = vqa(
    image = "/kaggle/input/image-classification-hugging-face/huggingface_image_classification.png",
    question = "Who's the person?"
)
result

config.json:   0%|          | 0.00/136k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/470M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/343M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

[{'score': 0.6498638987541199, 'answer': 'man'},
 {'score': 0.023679355159401894, 'answer': 'unknown'},
 {'score': 0.023101873695850372, 'answer': 'nobody'},
 {'score': 0.01929292269051075, 'answer': "don't know"},
 {'score': 0.019094819203019142, 'answer': 'photographer'}]

#### Audio QA

- Typically speech models are trained at 16kHz

In [25]:
from datasets import Dataset, Audio 

In [26]:
audio_file = [{"audio":"/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3"}]

audio_file = Dataset.from_list(audio_file)
audio_file = audio_file.cast_column("audio", Audio())

In [27]:
sampling_rate = audio_file[0]["audio"]["array"]
print(f"Resampled audio shape: {sampling_rate.shape}")

model.safetensors:   0%|          | 0.00/470M [00:00<?, ?B/s]

Resampled audio shape: (10554368,)


In [28]:
target_sampling_rate = 16_000  
dataset = audio_file.cast_column("audio", Audio(sampling_rate=target_sampling_rate))


resampled_audio = dataset[0]["audio"]["array"]
print(f"Resampled audio shape: {resampled_audio.shape}")


Resampled audio shape: (3829250,)


In [29]:
audio_file = [{"audio":"/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3"}]
dataset = Dataset.from_list(audio_file)
dataset = dataset.cast_column("audio", Audio())
dataset

Dataset({
    features: ['audio'],
    num_rows: 1
})

In [30]:
sampling_rate = dataset[0]['audio']['sampling_rate']
print("Audio File info: ", dataset[0]['audio'])
print("Sampling Rate: ", sampling_rate)

Audio File info:  {'path': '/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3', 'array': array([-1.68163939e-02, -2.31381878e-02, -1.09714820e-02, ...,
       -3.69735716e-07, -5.03296747e-08,  6.95829350e-07]), 'sampling_rate': 44100}
Sampling Rate:  44100


In [31]:
print(type(audio_file))

<class 'list'>


In [32]:
audio_file = dataset.cast_column('audio', Audio(sampling_rate = 16_000))
new_sampling_rate = audio_file[0]['audio']['sampling_rate']
print("New Sampling Rate: ", new_sampling_rate)

New Sampling Rate:  16000


In [33]:
audio_file[0]['audio']

{'path': '/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3',
 'array': array([-2.40959432e-02,  1.47958668e-02, -1.46308104e-02, ...,
        -6.57786643e-08,  1.84805415e-07,  0.00000000e+00]),
 'sampling_rate': 16000}

In [34]:
# classifier = pipeline(task="audio-classification", model="facebook/mms-lid-126")
# audio = dataset[0]["audio"]["array"]

# prediction = classifier(audio)
# prediction

### Automatic Speech Recognition

In [35]:
transcriber =  pipeline(
    task ="automatic-speech-recognition",
    model = "facebook/wav2vec2-base-960h"
)

transcriber("/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3")


config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

{'text': 'EO A EE OMON AODOAOSOBEE IE A O OOE O OODO OHEL A O  O O EA TEA  AEEE O ON A O OA AA TA E LO O OOO OOOOOOOOOE EEEOAELEEEOOOO AOOOOOOOO EEALEOELE OOOOOA OI AEE  A AOEEOOA E OMAMAAAET T  AN EAAWOEEOE O  EONE OOO EOO OOOI O AOSIA OOA EEEOOO AA MAAEOOEMAA OOOO HANOEOOOO N AOOO OOA EOESUEA OO O O EOEEI OEI AI O IHIAO O E OTEA OAO O OIOE OOO OOEEEEOOOOOOOOOOOOO OEEEAAOOO EI   OS AEEAIS OOI AOWEOIM OOINI EA E OOOEOOO EOOOA EAE OOE OAEIIIAAAAEEAEAIAIS A EE COME O DOAOSBE WII A OO D OO DOOODOIELA O AO D EA TEAEEELE EN A O OA A A AE AOOOOOOOOO OOOOAEEOOAOOOEEEEEEE'}

In [36]:
audio_list = [{"audio": "/kaggle/input/audio-qa-hugging-face/song_calm_down.mp3"}]
sampling_rate = 16_000
audio_file = Dataset.from_list(audio_list)
audio_file = audio_file.cast_column("audio", Audio(sampling_rate = 16_000))
audio_file = audio_file[0]['audio']['array']
prediction = transcriber({'sampling_rate': sampling_rate, 'raw':audio_file})
prediction

{'text': 'EO A EE MON AODOAOSOBEE II A O DOOE DO OODO OE L O O  O D EA TEA ALEEE O EN A O OA OO TA E LO O OOO OOOOOOOOOOOE EELEOAELEEEOOOO A AOOOOOOO EEAEELELEE OOOOOOA OI AEE  A AOEEOOOA E OMAMAAETE T  AN EAEWOEEOE O ENE OOO EOOO OOI O AOSIE OOAEEEOO AA MAAEOOEMAAE OOOO HAOOOEOOOO NANOOOO OO A EOESUEA OO O O AOEI OEOI AI O IHIAO  E OOTEA OO  O  O IOE O OOO  OOOEEEE OOOOOO OOEOOOOOOOOEEAEAAOOOOI EI   OS AEEAIS OOIN AASOEOIMA OOINI EAN E OOOEOOO EOOOA EAE OOEOAAEIIEIA AAAEEAEAEIAIS A EE COMED AO DOAOSOBE WII A OO D OE DOOO DOEE LA O AO D EA TEIAEEELE EN A O O O A AE OOOOOOOOOO OOOAEEAEEOOAOOOOO OEEEEEEEEE'}

## Fine Tune Model

In [37]:
from transformers import AutoModelForSequenceClassification

model_name = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(model_name)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
from transformers import AutoTokenizer
from datasets import Dataset

data = {
    "train": [
        {"text": "A large language model (LLM) is a type of computational model designed for natural language processing tasks such as language generation.", "label": "Neutral"},
        {"text": "As language models, LLMs acquire these abilities by learning statistical relationships from vast amounts of text during a self-supervised and semi-supervised training process.", "label": "Positive"},
        {"text": "The largest and most capable LLMs are artificial neural networks built with a decoder-only transformer-based architecture, enabling efficient processing and generation of large-scale text data.", "label": "Negative"}   
    ],
    "test": [
        {"text": "Before 2017, there were a few language models that were large as compared to capacities then available.", "label": "Neutral"},
        {"text": "In 2009, in most language processing tasks, statistical language models dominated over symbolic language models, as they can usefully ingest large datasets.", "label": "Positive"},
        {"text": "Some commenters expressed concern over accidental or deliberate creation of misinformation, or other forms of misuse.", "label": "Negative"}   
    ]
}

dataset = Dataset.from_dict(data)
dataset

Dataset({
    features: ['train', 'test'],
    num_rows: 3
})

In [39]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# dataset = dataset.map(
#     lambda row: tokenizer(row['train'])
# )

In [40]:
# from transformers import (
#     Trainer,
#     TrainingArguments
# )

# training_args = TrainingArguments(
#     output_dir = "./results"
# )
# trainer = Trainer(
#     model = model,
#     args = training_args,
#     train_dataset = dataset['train'],
#     eval_dataset = dataset['test']
# )

# trainer.train()

# local_path = './fine_tuned_model'

# trainer.save_model(local_path)

In [41]:
# from transformers import pipeline

# classifier = pipeline(task = "text-classification", model = local_path)
# classifier[dataset['text']]

## Text Generation

In [42]:
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoProcessor,
    AutoModelForCausalLM
)
from PIL import Image


In [43]:
generate_text = pipeline(task="text-generation")
generate_text("This session is")

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


[{'generated_text': "This session is open to students from all backgrounds, with an emphasis on the history of Israel's relationship with the Palestinians. Topics include: the struggle to secure Israeli sovereignty across the entire occupied Palestinian territory – which has resulted in the recent Israeli-Palestinian armed"}]

In [44]:
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [45]:
prompt = "It's dawn of Age of"
input_ids = tokenizer.encode(prompt, return_tensors = "pt")

output = model.generate(input_ids, num_return_sequences = 1)
generated_text = tokenizer.decode(output[0])
generated_text

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


"It's dawn of Age of Empires II, and the world is a mess. The world is full"

### Image Caption

In [46]:
proc = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

img = Image.open("/kaggle/input/image-classification-hugging-face/huggingface_image_classification.png")
pixel_values = proc(images=img, return_tensors = "pt").pixel_values
generated_ids = model.generate(pixel_values= pixel_values, max_length = 50)

generated_caption = proc.batch_decode(
    generated_ids,
    skip_special_tokens = True
)
generated_caption

preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/707M [00:00<?, ?B/s]

['actor in a yellow t - shirt']

## HuggingFace Embeddings

In [47]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m81.2 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1
Note: you may need to restart the kernel to use updated packages.


In [48]:
from sentence_transformers import SentenceTransformer

In [49]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")
embedder.encode(["Hi, there! How are you"])

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([[-4.73372266e-02,  1.28479889e-02,  6.89421073e-02,
         2.79045627e-02, -3.68847474e-02, -5.06990775e-02,
         2.46483926e-02,  1.40651753e-02, -6.92005977e-02,
         2.40949411e-02, -5.84290363e-02, -8.98118969e-03,
        -2.84604263e-02, -3.73937823e-02,  2.82644704e-02,
        -9.25137568e-03,  2.21133791e-02, -3.47095579e-02,
        -1.31012321e-01, -5.18218987e-03,  1.93765275e-02,
         2.19400786e-02, -3.93150598e-02,  4.15240675e-02,
        -6.35348186e-02, -1.75214279e-02, -3.19502689e-02,
        -1.71411980e-03, -1.49460500e-02, -6.39731735e-02,
        -3.52690904e-03,  7.22459853e-02, -7.08622634e-02,
         3.78122441e-02,  5.93326613e-02,  1.54881971e-02,
        -5.83831333e-02, -1.06431358e-01,  1.92941390e-02,
        -7.27019878e-03, -5.91114862e-04,  9.11501143e-03,
        -1.79864112e-02, -2.94607207e-02,  6.36720881e-02,
        -7.88972080e-02,  3.19960117e-02, -3.56776710e-03,
         7.07356855e-02,  8.11422169e-02, -8.75114799e-0