In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
import torch
import re
from transformers import BertForSequenceClassification, BertTokenizer

datasets = DatasetDict({
    'train': Dataset.from_pandas(pd.read_csv('../data/train.csv')),
    'eval': Dataset.from_pandas(pd.read_csv('../data/eval.csv')),
    'test': Dataset.from_pandas(pd.read_csv('../data/test.csv'))
})



model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
def remove_urls(text):
    """Remove URLs from a text string."""
    url_pattern = r'http\S+|www\.\S+'
    return re.sub(url_pattern, '<link>', text)

def remove_mentions(text):
    """Remove user mentions (e.g., @username) and channels (e.g., #channel)."""
    mention_pattern = r'[@#]\w+'
    return re.sub(mention_pattern, '<user>', text)

def remove_special_chars(text):
    """Remove special characters except basic punctuation."""
    return re.sub(r'[^\w\s.,!?]', '', text)

def clean_text(row):
    """
    Clean the input text by:
      - Removing URLs.
      - Removing mentions.
      - Lowercasing text.
      - Removing unwanted special characters.
      - Trimming extra spaces.
    """
    text = row['message']
    text = remove_urls(text)
    text = remove_mentions(text)
    text = text.lower()
    text = remove_special_chars(text)
    row['cleaned_message'] = text 
    return row


def generate_tokenized_text(row):
    output = tokenizer(row['message'], truncation=True, padding="max_length", max_length=128)
    row['attention_mask'] = output['attention_mask']
    row['input_ids'] = output['input_ids']
    return row

def label_mapping(row):
    row['labels'] = 0 if row['calendar_event'] == False else 1
    return row


datasets = datasets.map(clean_text)
datasets = datasets.map(generate_tokenized_text)
datasets = datasets.map(label_mapping)

Map:   0%|          | 0/3924 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

Map:   0%|          | 0/3924 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

Map:   0%|          | 0/3924 [00:00<?, ? examples/s]

Map:   0%|          | 0/560 [00:00<?, ? examples/s]

Map:   0%|          | 0/1120 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=1)
    
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='binary')
    recall = recall_score(labels, preds, average='binary')
    f1 = f1_score(labels, preds, average='binary')
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="steps",
    logging_steps=5,
    save_steps=10,
    max_steps=50,
    load_best_model_at_end=True,
    report_to="none",
    remove_unused_columns=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=datasets['train'],
    eval_dataset=datasets['eval'],
    compute_metrics=compute_metrics
)

In [4]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
5,0.6521,0.634056,0.617857,1.0,0.235714,0.381503
10,0.6228,0.527139,0.926786,0.97992,0.871429,0.922495
15,0.4937,0.464321,0.869643,0.808955,0.967857,0.881301
20,0.4986,0.433345,0.875,0.816265,0.967857,0.885621
25,0.3873,0.350406,0.935714,0.893548,0.989286,0.938983
30,0.346,0.306786,0.964286,0.942177,0.989286,0.965157
35,0.2974,0.275589,0.966071,0.945392,0.989286,0.966841
40,0.2671,0.251781,0.971429,0.955172,0.989286,0.97193
45,0.2664,0.240891,0.967857,0.94863,0.989286,0.968531
50,0.2325,0.237283,0.967857,0.94863,0.989286,0.968531


TrainOutput(global_step=50, training_loss=0.40639047384262084, metrics={'train_runtime': 30.3471, 'train_samples_per_second': 26.362, 'train_steps_per_second': 1.648, 'total_flos': 52622211072000.0, 'train_loss': 0.40639047384262084, 'epoch': 0.2032520325203252})

In [8]:
output = trainer.predict(datasets['test'])
output

PredictionOutput(predictions=array([[-0.97039884,  1.3453104 ],
       [-1.1152496 ,  1.3940566 ],
       [-0.5414445 ,  0.8503644 ],
       ...,
       [ 0.6406925 , -0.7862517 ],
       [ 0.54445815, -0.2564761 ],
       [ 0.5858267 , -0.75551957]], shape=(1120, 2), dtype=float32), label_ids=array([1, 1, 1, ..., 0, 0, 0], shape=(1120,)), metrics={'test_loss': 0.23213626444339752, 'test_accuracy': 0.9714285714285714, 'test_precision': 0.9474576271186441, 'test_recall': 0.9982142857142857, 'test_f1': 0.9721739130434782, 'test_runtime': 2.0204, 'test_samples_per_second': 554.338, 'test_steps_per_second': 69.292})

In [9]:
output.metrics

{'test_loss': 0.23213626444339752,
 'test_accuracy': 0.9714285714285714,
 'test_precision': 0.9474576271186441,
 'test_recall': 0.9982142857142857,
 'test_f1': 0.9721739130434782,
 'test_runtime': 2.0204,
 'test_samples_per_second': 554.338,
 'test_steps_per_second': 69.292}

In [10]:
trainer.save_model('./bert_classifier_v1')