In [1]:
!pip install transformers
!pip install torch
!pip install datasets

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting regex!=2019.12.17
  Downloading regex-2023.6.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (772 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.3/772.3 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
Collecting safeten

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Step 2: Prepare the dataset
# Assuming your dataset has 'text' and 'label' columns in a CSV file
df = pd.read_csv('/kaggle/input/arabichatespeech/hatespeech.csv')

# Step 3: Tokenize the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.texts = dataframe['sentence']
        self.labels = dataframe['type']
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = str(self.texts[index])
        label = int(self.labels[index])

        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Step 4: Create DataLoader
dataset = CustomDataset(df, tokenizer, max_length=128)
train_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Step 5: Define the model and training loop
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}/{epochs}, Batch Loss: {loss.item():.4f}")
        
model.save_pretrained('modelWeights')

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 5.14MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 5.55kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 531kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:02<00:00, 212MB/s]  
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Batch Loss: 0.6979
Epoch 1/3, Batch Loss: 0.5960
Epoch 1/3, Batch Loss: 0.6277
Epoch 1/3, Batch Loss: 0.5196
Epoch 1/3, Batch Loss: 0.6539
Epoch 1/3, Batch Loss: 0.4139
Epoch 1/3, Batch Loss: 0.6203
Epoch 1/3, Batch Loss: 0.7025
Epoch 1/3, Batch Loss: 0.4105
Epoch 1/3, Batch Loss: 0.5961
Epoch 1/3, Batch Loss: 0.3548
Epoch 1/3, Batch Loss: 0.4725
Epoch 1/3, Batch Loss: 0.5161
Epoch 1/3, Batch Loss: 0.4804
Epoch 1/3, Batch Loss: 0.2737
Epoch 1/3, Batch Loss: 0.3013
Epoch 1/3, Batch Loss: 0.5024
Epoch 1/3, Batch Loss: 0.7031
Epoch 1/3, Batch Loss: 0.2694
Epoch 1/3, Batch Loss: 0.4839
Epoch 1/3, Batch Loss: 0.3585
Epoch 1/3, Batch Loss: 0.6107
Epoch 1/3, Batch Loss: 0.3429
Epoch 1/3, Batch Loss: 0.2123
Epoch 1/3, Batch Loss: 0.7340
Epoch 1/3, Batch Loss: 0.5540
Epoch 1/3, Batch Loss: 0.7447
Epoch 1/3, Batch Loss: 0.8450
Epoch 1/3, Batch Loss: 0.3993
Epoch 1/3, Batch Loss: 0.5741
Epoch 1/3, Batch Loss: 0.5286
Epoch 1/3, Batch Loss: 0.4700
Epoch 1/3, Batch Loss: 0.6511
Epoch 1/3,

In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Load the trained model and tokenizer
model_path = 'path_to_save_model'  # Replace with the path to your saved model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(model_path)

# Function to classify new text
def classify_text(text):
    model.eval()
    with torch.no_grad():
        inputs = tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        predicted_label = torch.argmax(probabilities, dim=1).item()

        # Assuming your model is binary classification (0 or 1)
        if predicted_label == 1:
            return "Hate Speech"
        else:
            return "Not Hate Speech"

OSError: path_to_save_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:

new_text = "لا"
predicted_label = classify_text(new_text)
print("Predicted Label:", predicted_label)