In [1]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AdamW

# Load the dataset
df = pd.read_csv("hausa.csv")

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize the text data
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the text data with right padding
train_encodings = tokenizer(
    list(train_df["text"]),
    truncation=True,
    padding=True,
    max_length=32,
    return_tensors="pt"
)

val_encodings = tokenizer(
    list(val_df["text"]),
    truncation=True,
    padding=True,
    max_length=32,
    return_tensors="pt"
)

# Convert the labels to categorical values
train_labels = torch.tensor(train_df["label"].map({"negative": 0, "positive": 1}).values)
val_labels = torch.tensor(val_df["label"].map({"negative": 0, "positive": 1}).values)

# Define the model and optimizer
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=5e-5,
    save_total_limit=2,
    save_steps=500,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_strategy="steps",
    save_on_each_node=True,
)

# Define the Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,  # Use train_encodings here
    eval_dataset=val_encodings,      # Use val_encodings here
    compute_metrics=lambda pred: {"accuracy": accuracy_score(val_labels, pred.predictions.argmax(1))},
)
trainer.train()




  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 0/3 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,label_ids,labels,label.