<a href="https://colab.research.google.com/github/haoli000/ColabPlayground/blob/main/H24_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset


Load data

In [34]:
# Load the dataset
data = pd.read_csv('/content/messages_labelled.csv', delimiter=";")

# Split the data into features (X) and target (y)
X = data['message'].tolist()
y = data['label'].tolist()

Clean up messages

In [35]:
def clean_message(message):
    # Convert to lowercase
    message = message.lower()

    # Remove URLs
    message = re.sub(r'http\S+|www\S+|https\S+', '<URL>', message, flags=re.MULTILINE)

    # Remove email addresses
    message = re.sub(r'\S*@\S*\s?', '<EMAIL>', message)

    # Replace numbers with a placeholder (e.g., "<NUM>")
    message = re.sub(r'\d+', '<NUM>', message)

    # Remove non-alphanumeric characters (excluding spaces)
    message = re.sub(r'[^a-zA-Z0-9\s]', '', message)

    # Remove extra whitespaces
    message = ' '.join(message.split())

    return message

# Apply the function to your list of messages
# X = [clean_message(message) for message in X]


Train

In [36]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train = [clean_message(message) for message in X_train]

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Tokenize and encode sequences
MAX_LEN = 160

def encode_texts(texts):
    return tokenizer.batch_encode_plus(
        texts,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

train_encodings = encode_texts(X_train)
test_encodings = encode_texts(X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

# Create DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 8

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{num_epochs} completed')

# Evaluation
model.eval()
with torch.no_grad():
    input_ids = test_encodings['input_ids'].to(device)
    attention_mask = test_encodings['attention_mask'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()

print(classification_report(y_test, predictions))
print(f"Accuracy: {accuracy_score(y_test, predictions)}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10 completed
Epoch 2/10 completed
Epoch 3/10 completed
Epoch 4/10 completed
Epoch 5/10 completed
Epoch 6/10 completed
Epoch 7/10 completed
Epoch 8/10 completed
Epoch 9/10 completed
Epoch 10/10 completed


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.57 GiB. GPU 0 has a total capacity of 14.75 GiB of which 2.38 GiB is free. Process 12630 has 12.36 GiB memory in use. Of the allocated memory 7.95 GiB is allocated by PyTorch, and 4.28 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Test

In [28]:

# Function to classify new messages
def classify_message(message):
    encoding = tokenizer.encode_plus(
        message,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        prediction = torch.argmax(outputs.logits, dim=-1).cpu().numpy()[0]

    return prediction

# Example usage
new_message = "Win lottery! Call 123123!"
print(new_message)
print(f"Classification: {classify_message(new_message)}")

Win lottery! Call 123123!
Classification: 1
