# Предобученная модель

## Импорт библиотек

In [39]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from tqdm import tqdm

## Загрузка датасета

In [40]:
dataset = load_dataset("sms_spam")["train"]

## Загрузка предобученной модели

In [41]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Создание объекта pipeline

In [42]:
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    truncation=True,
    max_length=128,
)

Device set to use cuda:0


## Оценка точности

In [43]:
correct = 0
total = len(dataset)

for example in tqdm(dataset, desc="Evaluating"):
    text = example["sms"]
    true_label = example["label"]  # 0 = not spam, 1 = spam

    prediction = classifier(text, truncation=True, max_length=128)[0]
    predicted_label = 1 if prediction["label"] == "LABEL_1" else 0

    if predicted_label == true_label:
        correct += 1

accuracy = correct / total

print(f"\n\033[92mPipeline Evaluation Accuracy: {correct} / {total} = {accuracy:.4f}\033[0m")

Evaluating: 100%|██████████| 5574/5574 [00:12<00:00, 449.00it/s]


[92mPipeline Evaluation Accuracy: 4827 / 5574 = 0.8660[0m



