In [4]:
from datasets import load_dataset

In [5]:
raw_datasets_1=load_dataset("amazon_polarity")
raw_datasets_2=load_dataset("glue", "sst2")

In [6]:
raw_datasets_2

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [20]:
raw_datasets_2["train"].features

{'sentence': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'positive'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [None]:
raw_datasets_2["train"]["label"]

AttributeError: 'list' object has no attribute 'isunique'

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



In [10]:
def tokenizer_fun(data):
    return tokenizer(data["sentence"], truncation=True)

In [11]:
tokenized_dataset=raw_datasets_2.map(tokenizer_fun,batched=True)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [24]:
tokenized_dataset["train"]

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
    num_rows: 67349
})

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="results",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1,
)





In [27]:
from transformers import AutoModelForSequenceClassification

model=AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    pred=predictions.argmax(axis=1)
    acc=accuracy_score(labels, pred)
    return {"accuracy": acc}

In [29]:
from transformers import Trainer, AutoModelForSequenceClassification

trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [31]:
from transformers import pipeline

classifier=pipeline("text-classification", model="mymodel", tokenizer=tokenizer)

In [33]:
classifier("This is a bad movie")  # Example usage of the classifier

[{'label': 'LABEL_0', 'score': 0.9982823133468628}]