<a href="https://colab.research.google.com/github/iSriBalaji/PracticeCodes/blob/main/hackmit_sentiment_analysis_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adapted from the Hugging Face text classification guide: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [None]:
!pip install transformers datasets evaluate
!pip install transformers[torch]

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
import evaluate
import numpy as np
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; needed because the training
# arguments later in this notebook set push_to_hub=True.
notebook_login()

In [None]:
# Settings — edit these to try a different dataset or checkpoint.
# Finding datasets: https://huggingface.co/datasets?task_ids=task_ids:sentiment-classification&sort=trending
# Finding models: https://huggingface.co/models
# Finding tokenizers: normally reuse the model's checkpoint name, as done below.

dataset_name = 'imdb'
tokenizer_name = 'distilbert-base-uncased'
model_name = 'distilbert-base-uncased'
# Metric object that compute_metrics uses to score predictions.
metric = evaluate.load("accuracy")

# You will need to change these based on your dataset.
# Edit only id2label (class index -> display name); label2id is derived from
# it so the two mappings can never drift out of sync.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: idx for idx, label in id2label.items()}

In [None]:
# functions
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of ``examples`` with truncation enabled.

    Uses the notebook-level ``tokenizer``, which must exist by the time this
    function is called.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

def compute_metrics(eval_pred):
    """Score a ``(predictions, labels)`` pair with the notebook-level ``metric``.

    Each row of predictions is reduced to the index of its largest value
    before being compared against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Load the dataset and the tokenizer named in the settings cell.
data = load_dataset(dataset_name)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Collator built around the same tokenizer, used for padding at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply preprocess_function to the dataset in batches.
tokenized_data = data.map(preprocess_function, batched=True)

# Pretrained checkpoint with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# ADJUST HYPERPARAMETERS

training_args = TrainingArguments(
    # Directory for checkpoints; a required argument on older transformers
    # releases. With push_to_hub=True its name is also the default Hub
    # repository name, so rename it to whatever the model should be called.
    output_dir="sentiment-analysis-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    # Called `evaluation_strategy` before transformers 4.41; current releases
    # reject the old keyword, so use the old name only on an older install.
    eval_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,  # relies on the Hub login done earlier in the notebook
)

In [None]:
# Bundle the model, training arguments, tokenized splits, collator, and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    # The "test" split doubles as the evaluation set here.
    eval_dataset=tokenized_data["test"],
    # NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
    # of `processing_class=` — confirm against the installed version.
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Sample sentence to classify in the inference cell below.
text = "This product was very good."

In [None]:
# Given a model object rather than a checkpoint name, pipeline() cannot work
# out which tokenizer to load, so pass it explicitly. device=model.device
# keeps the pipeline's inputs on the device the model currently lives on.
classifier = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)
classifier(text)