## Example model of sentiment 🥰


In [57]:
from transformers import pipeline

In [58]:
# clasification task
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [59]:
res = classifier("I love you")
print(res)

[{'label': 'POSITIVE', 'score': 0.9998656511306763}]


In [60]:
res = classifier("i do not like your clothes but you are nice guy")
print(res)

[{'label': 'POSITIVE', 'score': 0.9998207688331604}]


## Info datasets 👀

In [61]:
from datasets import list_datasets, load_dataset, DatasetInfo

all_datasets = list_datasets()
print(f"Exist {len(all_datasets)} datasets")

Exist 169993 datasets


In [62]:
from huggingface_hub import list_datasets, dataset_info

all_datasets = list_datasets(direction=1, limit=5)

In [63]:
next(all_datasets)

DatasetInfo(id='amirveyseh/acronym_identification', author='amirveyseh', sha='15ef643450d589d5883e289ffadeb03563e80a9e', created_at=datetime.datetime(2022, 3, 2, 23, 29, 22, tzinfo=datetime.timezone.utc), last_modified=datetime.datetime(2024, 1, 9, 11, 39, 57, tzinfo=datetime.timezone.utc), private=False, gated=False, disabled=False, downloads=190, likes=18, paperswithcode_id='acronym-identification', tags=['task_categories:token-classification', 'annotations_creators:expert-generated', 'language_creators:found', 'multilinguality:monolingual', 'source_datasets:original', 'language:en', 'license:mit', 'size_categories:10K<n<100K', 'parquet', 'text', 'datasets', 'pandas', 'mlcroissant', 'arxiv:2010.14678', 'region:us', 'acronym-identification', 'croissant'], card_data=None, siblings=None)

## Load dataset 👨‍🦽‍➡️

In [64]:
dataset = load_dataset("yelp_review_full")
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [65]:
dataset["train"][42]

{'label': 4,
 'text': 'What a find! I stopped in here for breakfast while in town for business. The service is so friendly I thought I was down south. The service was quick, frankly and felt like I was with family. \\nFantastic poached eggs, Cajun homefries and crispy bacon. Gab and Eat is definitely a place I world recommend to locals. I was stuffed and the bill was only $8.00.'}

In [66]:
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(80))
small_eval_dataset = dataset["test"].shuffle(seed=42).select(range(40))

## Tokenizer 👾

In [67]:
from transformers import AutoTokenizer

In [None]:
modelo = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(modelo)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True)

tokenized_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)

print(tokenized_train_dataset[0])


## Model ✅

In [81]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(modelo, num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## training 🏋️‍♂️

In [82]:
from huggingface_hub import notebook_login

#notebook_login()

In [83]:
import numpy as np
import evaluate

In [84]:
metric = evaluate.load("accuracy")

In [85]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [86]:
from transformers import TrainingArguments, Trainer

In [94]:
training_args = TrainingArguments(
    'my-super-model',
    evaluation_strategy="steps",
    logging_steps=5,
    num_train_epochs=1,
    push_to_hub=True,
)

In [97]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
)

In [None]:
trainer.train()


In [99]:
# Upload the model to the hub

trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/juan071/my-super-model/commit/5b35eb70ebca850124f2fd83fa71a7762cfd8d3b', commit_message='End of training', commit_description='', oid='5b35eb70ebca850124f2fd83fa71a7762cfd8d3b', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
tokenizer.push_to_hub("juan071/my-super-model")