<a href="https://colab.research.google.com/github/hasbiazif/NLP_hugging_face/blob/main/hugging_face.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setup awal: install dan import

In [None]:
!pip install -q transformers datasets acceleratem

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# 2. Cek GPU

In [None]:
import torch

# Ask PyTorch whether a CUDA device is usable in this runtime and report it.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

# 3. Contoh inference cepat (tanpa training)

In [None]:
# Build a sentiment-analysis pipeline from a named pretrained checkpoint;
# no training happens in this cell.
classifier = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Run the classifier on two sample sentences and print each raw result.
for sentence in ("I love Hugging Face!", "This is really bad..."):
    print(classifier(sentence))

In [None]:
# 4. Load a small dataset for fine-tuning
# Use a small subset so that training stays fast
# =======================================
dataset = load_dataset("imdb")

# The IMDB splits are stored grouped by label, so taking the first N rows
# without shuffling yields a subset containing a single class. Shuffle with a
# fixed seed first so the subset has both classes and is reproducible.
SUBSET_SEED = 42

# Take a small subset (2000 training examples, 1000 test examples)
small_train = dataset["train"].shuffle(seed=SUBSET_SEED).select(range(2000))
small_test = dataset["test"].shuffle(seed=SUBSET_SEED).select(range(1000))

In [None]:
# 5. Tokenizer and preprocessing
# =======================================
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the ``text`` field of a batch, padding to max length and truncating."""
    return tokenizer(batch["text"], truncation=True, padding="max_length")


def _encode_split(split):
    """Tokenize one split, drop its raw ``text`` column and set its format to torch."""
    encoded = split.map(tokenize, batched=True).remove_columns(["text"])
    encoded.set_format("torch")
    return encoded


train_enc = _encode_split(small_train)
test_enc = _encode_split(small_test)

In [None]:
# 6. Load the model and define the TrainingArguments
# =======================================
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False,
    # Evaluate once per epoch; save no checkpoints so the run stays fast
    eval_strategy="epoch",
    save_strategy="no",
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,  # a single epoch only, to keep the run fast
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

In [None]:
# 7. Evaluation metrics
# =======================================
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (scores, arg-maxed over axis 1
            to obtain the predicted class) and ``label_ids`` (reference labels).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f_score,
        "precision": prec,
        "recall": rec,
    }

In [None]:
# 8. Trainer and training
# =======================================
import inspect

# Newer transformers releases renamed `Trainer(tokenizer=...)` to
# `Trainer(processing_class=...)` and deprecated the old keyword. Pick
# whichever keyword the installed version accepts so this cell runs on both.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_enc,
    eval_dataset=test_enc,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()
# Last expression of the cell, so the returned metrics are displayed.
trainer.evaluate()

In [None]:
# 9. Try out the fine-tuned model
# =======================================
text = "This movie was awesome! I loved it."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
# Move inputs to the same device as the model
inputs = {key: val.to(model.device) for key, val in inputs.items()}

# Inference only: switch to eval mode so this cell does not depend on an
# earlier cell having done so, and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Take the argmax over the class dimension rather than over the flattened tensor.
pred = torch.argmax(outputs.logits, dim=-1)
print("Label prediksi:", "Positive" if pred.item() == 1 else "Negative")