<a href="https://colab.research.google.com/github/lalanglarano/TAKLUBAN-FILIPINO-NATIVE-LANGUAGE-PROFANE-DETECTION/blob/old-main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install torch



Mounted at /content/drive


In [2]:
!pip install transformers



In [1]:
!pip install datasets



In [4]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

In [9]:
# Load dataset from CSV files
dataset = load_dataset('csv', data_files={
    'train': '/content/datasets/BERT_TRAIN.csv',
    'validation': '/content/datasets/BERT_VAL.csv'
})

# Load pre-trained BERT tokenizer and model for sequence classification (2 output classes)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenization function for dataset
def tokenize_function(examples):
    # Tokenize the input text
    tokens = tokenizer(
        examples['sentence'],  # Replace 'sentence' with the name of your text column if different
        padding='max_length',
        max_length=128,
        truncation=True
    )
    # Add labels to the tokenized output
    tokens['labels'] = examples['profane']  # Map the 'profane' column to labels
    return tokens

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Set format for PyTorch tensors
tokenized_datasets.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=0.00001,  # 1e-5
    per_device_train_batch_size=64,  # Batch size
    per_device_eval_batch_size=64,
    num_train_epochs=3,  # Number of epochs
    weight_decay=0.01,
    logging_dir='./logs',
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
)

# Train the model
trainer.train()

# Save the model
trainer.save_model('./trained_bert')

# Evaluate the model
results = trainer.evaluate()

# Print results
print(results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/11111 [00:00<?, ? examples/s]

Map:   0%|          | 0/1378 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,No log,0.12878
2,No log,0.102879
3,0.113400,0.105308


{'eval_loss': 0.10530832409858704, 'eval_runtime': 8.6631, 'eval_samples_per_second': 159.065, 'eval_steps_per_second': 2.539, 'epoch': 3.0}


In [10]:
!pip install safetensors



In [18]:
from safetensors.torch import load_file
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [20]:
# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load model configuration and weights
model = BertForSequenceClassification.from_pretrained(
    './trained_bert/',
    config='./trained_bert/config.json',
    from_tf=False
)

# Load the weights from safetensors format
state_dict = load_file("./trained_bert/model.safetensors")
model.load_state_dict(state_dict)

# Set the model to evaluation mode
model.eval()

# Initialize lists to store true labels and predictions for metrics calculation
true_labels = []
predictions = []

# Loop for user input
while True:
    # Ask user for a sentence
    sentence = input("Enter a sentence to classify or type 'exit' to stop: ")

    # Exit condition
    if sentence.lower() == 'exit':
        break

    # User input: tokenize and make prediction
    inputs = tokenizer(sentence, return_tensors="pt")

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Get the predicted class (0: Not Profane, 1: Profane)
    predicted_class = torch.argmax(logits, dim=-1).item()

    # Simulate true label for demonstration purposes (You can replace this with actual labels if available)
    true_label = int(input("Enter the true label (0 for Not Profane, 1 for Profane): "))

    # Append to the list
    true_labels.append(true_label)
    predictions.append(predicted_class)

    # Print the result
    if predicted_class == 1:
        print("The sentence is classified as: Profane")
    else:
        print("The sentence is classified as: Not Profane")

# Calculate and display evaluation metrics
if true_labels and predictions:
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

    print("\nEvaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
else:
    print("No sentences classified, no metrics to display.")

Enter a sentence to classify or type 'exit' to stop: putangina nadapa yung bata
Enter the true label (0 for Not Profane, 1 for Profane): 1
The sentence is classified as: Profane
Enter a sentence to classify or type 'exit' to stop: asan na naman ang hayop na yan
Enter the true label (0 for Not Profane, 1 for Profane): 1
The sentence is classified as: Not Profane
Enter a sentence to classify or type 'exit' to stop: tangina mali ang nakuha
Enter the true label (0 for Not Profane, 1 for Profane): 1
The sentence is classified as: Profane
Enter a sentence to classify or type 'exit' to stop: dahil yon sa dataset wala ata non
Enter the true label (0 for Not Profane, 1 for Profane): 0
The sentence is classified as: Not Profane
Enter a sentence to classify or type 'exit' to stop: damihan ko lang wala na magawa eh
Enter the true label (0 for Not Profane, 1 for Profane): 0
The sentence is classified as: Not Profane
Enter a sentence to classify or type 'exit' to stop: pucha natalo tuloy yung pambat