# Transformers Classification

### 1. Setup and Installation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Global plotting look-and-feel. Order matters: both calls write to
# matplotlib's rcParams, so seaborn's 'darkgrid' settings are applied on top of
# the 'fivethirtyeight' style wherever the two overlap.
plt.style.use('fivethirtyeight')
sns.set_style('darkgrid')

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EvalPrediction
)
from tqdm.auto import tqdm
from datasets import Dataset as HFDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    precision_recall_fscore_support, 
    confusion_matrix,
    classification_report
)
from sklearn.preprocessing import LabelEncoder

In [4]:
import os
import time
import warnings

from dotenv import load_dotenv

# The .env file is looked up one directory above the current working
# directory (the notebook is expected to be launched from a sub-folder).
_parent_dir = os.path.dirname(os.getcwd())
ENV_PATH = os.path.join(_parent_dir, '.env')
load_dotenv(ENV_PATH)

# Silences every warning for the rest of the session, including deprecation
# notices from the libraries used below.
warnings.filterwarnings('ignore')

In [5]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


### 2. Load and Prepare the Data

In [6]:
# The CSV is read from the `data` folder located one level above the current
# working directory.
_data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')
DATA_PATH = os.path.join(_data_dir, 'RHMD_Engineered.csv')
df = pd.read_csv(DATA_PATH)

In [7]:
# Build the single text field fed to the tokenizer: "<title> <text>".
# Missing titles/bodies are treated as empty strings. Without the fillna,
# concatenating with NaN makes the whole row NaN, and a NaN entry is not a
# valid string input for the tokenizer later on.
df['combined_text'] = df['title'].fillna('') + " " + df['text'].fillna('')

In [8]:
# Learn the subreddit -> integer mapping, then attach the encoded class id to
# every row as the training target.
label_encoder = LabelEncoder().fit(df['subreddit'])
df['label'] = label_encoder.transform(df['subreddit'])

In [9]:
# Show which integer id the encoder assigned to each subreddit name.
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name} -> {class_id}")

Anxiety -> 0
SuicideWatch -> 1
depression -> 2
lonely -> 3
mentalhealth -> 4


In [10]:
# Two-stage stratified split with a fixed seed:
#   1) hold out 20% of all rows as the test set,
#   2) hold out 20% of the remainder as the validation set.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.2,
    random_state=42,
    stratify=train_val_df['label'],
)

In [11]:
# Report (rows, columns) of each split, one line per split.
print(
    f"Training set shape: {train_df.shape}",
    f"Validation set shape: {val_df.shape}",
    f"Testing set shape: {test_df.shape}",
    sep="\n",
)

Training set shape: (13027, 42)
Validation set shape: (3257, 42)
Testing set shape: (4071, 42)


### 3. Building Hugging Face Datasets and the Tokenization Function

In [12]:
# Convert each split to a Hugging Face Dataset holding only the model input
# text and the integer label.
# preserve_index=False: after train_test_split the frames carry a shuffled,
# non-range index, which from_pandas would otherwise store as an extra
# `__index_level_0__` column that the model never uses.
train_dataset = HFDataset.from_pandas(train_df[['combined_text', 'label']], preserve_index=False)
val_dataset = HFDataset.from_pandas(val_df[['combined_text', 'label']], preserve_index=False)
test_dataset = HFDataset.from_pandas(test_df[['combined_text', 'label']], preserve_index=False)

In [13]:
def preprocess_function(examples, tokenizer, max_length=512):
    """Tokenize the ``combined_text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'combined_text'`` entry holding the text(s)
            to tokenize.
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=..., padding=..., max_length=...)``.
        max_length: Length handed to the tokenizer; used for truncation and,
            because padding is ``'max_length'``, for padding as well.

    Returns:
        Whatever ``tokenizer`` returns for the given texts.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
    }
    texts = examples['combined_text']
    return tokenizer(texts, **tokenizer_options)

### 4. Classifier using BERT

In [14]:
# Hugging Face access token, read from the process environment (None if unset).
access_token = os.getenv("ACCESS_TOKEN")

# Checkpoint identifier; the tokenizer matching this checkpoint is loaded here.
bert_model_name = "mental/mental-bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name,
    token=access_token,
)

In [15]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three are averaged with ``average='weighted'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned (support) is not needed here.
    weighted_scores = precision_recall_fscore_support(
        y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted_scores[:3]

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [16]:
def _tokenize_with_bert(examples):
    """Tokenize one batch of examples with the BERT tokenizer."""
    return preprocess_function(examples, bert_tokenizer)


# Apply the same batched tokenization to every split.
tokenized_train_dataset = train_dataset.map(_tokenize_with_bert, batched=True)
tokenized_val_dataset = val_dataset.map(_tokenize_with_bert, batched=True)
tokenized_test_dataset = test_dataset.map(_tokenize_with_bert, batched=True)


Map: 100%|██████████| 13027/13027 [00:05<00:00, 2275.55 examples/s]
Map: 100%|██████████| 3257/3257 [00:01<00:00, 2763.85 examples/s]
Map: 100%|██████████| 4071/4071 [00:01<00:00, 2614.96 examples/s]


In [17]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='../results/bert',
    logging_dir='../logs/bert',
    logging_steps=10,
    # Training length and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Optimisation schedule / regularisation.
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with
    # the best validation accuracy when training finishes.
    # NOTE(review): newer transformers releases name the first argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [19]:
# One output class per subreddit known to the label encoder.
num_labels = len(label_encoder.classes_)

# Load the pretrained encoder with a sequence-classification head sized for
# `num_labels` classes.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name,
    token=access_token,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Wire the model, hyper-parameters, tokenized train/validation splits and the
# metric callback into a single Trainer.
bert_trainer = Trainer(
    args=training_args,
    model=bert_model,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
)

In [21]:
# Wall-clock timing of the fine-tuning run: timestamps are taken immediately
# before and after train().
# NOTE: if train() raises (e.g. the run is interrupted), `end_time` is not
# assigned by this cell.
start_time = time.time()
bert_trainer.train()
end_time = time.time()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 