### **Train a deep neural network model in PyTorch for multi-label text classification** 

In [1]:
import pandas as pd

# Read only CSV columns 1 and 2, skip the first row of the file (presumably the
# original header row) and name the two columns 'tweet' and 'labels'.
tweet = pd.read_csv(
    'mLabel_tweets.csv',
    skiprows=1,
    usecols=[1, 2],
    names=['tweet', 'labels'],
)

### **Preprocess text before loading it into the model**


In [None]:
# import re
# import nltk
# from nltk.corpus import stopwords
# from nltk.stem import WordNetLemmatizer, PorterStemmer

# nltk.download('stopwords')
# nltk.download('wordnet')

# stopwords = set(stopwords.words('english'))
# lematizer = WordNetLemmatizer()

# def preprocess_text(text):
#     text = text.lower()
#     text = re.sub(r"http\S+|www\S+|https\S+", "", text)
#     text = re.sub(r"[^a-zA-Z\s]", "", text)
#     words = text.split()

#     clean_words = [ lematizer.lemmatize(w) for w in words if w not in stopwords ]

#     return " ".join(clean_words)

# tweet['tweet'] = tweet['tweet'].astype(str).apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NCS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\NCS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
from sklearn.model_selection import train_test_split

# Inputs are the tweet texts coerced to str; targets are the raw label strings.
X = tweet['tweet'].astype(str)
y = tweet['labels']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### **Multi Label Binarizer**


In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

def _split_labels(label_string):
    """Split a comma-separated label string into a list of label names."""
    return label_string.split(',')


# Convert "label1,label2" → ['label1', 'label2'] for every sample in both splits.
y_train = y_train.apply(_split_labels)
y_test = y_test.apply(_split_labels)

# Fit the label vocabulary on the training split only, then reuse that same
# vocabulary to encode the test split.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(y_train)
y_test = mlb.transform(y_test)

# Number of distinct labels seen in the training split; the bare expression
# on the last line displays it as the cell output.
num_classes = len(mlb.classes_)
num_classes



245

### **BERT Tokenize**

In [4]:
from transformers import BertTokenizer

# Maximum number of tokens kept per text; encode_text truncates longer inputs to this length.
MAX_LEN = 128
# Tokenizer loaded from the 'bert-base-uncased' checkpoint (same name the model cell uses).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_text(text_list):
    """Tokenize a collection of texts with the module-level ``tokenizer``.

    Args:
        text_list: Any iterable of strings; it is materialized into a list
            before being handed to the tokenizer.

    Returns:
        Whatever the tokenizer returns when called with padding enabled,
        truncation to ``MAX_LEN`` tokens and ``return_tensors='pt'``.
    """
    texts = list(text_list)
    tokenizer_options = {
        'padding': True,
        'truncation': True,
        'max_length': MAX_LEN,
        'return_tensors': 'pt',
    }
    encoded = tokenizer(texts, **tokenizer_options)
    return encoded

# Encode the training split first, then the test split, with the same settings.
train_encodings, test_encodings = (
    encode_text(split) for split in (X_train, X_test)
)


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### **Creating DataLoaders to feed into the model**

In [5]:
import torch
from torch.utils.data import TensorDataset, DataLoader

def _build_dataset(encodings, label_matrix):
    """Bundle token ids, attention masks and float32 labels into one TensorDataset."""
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(label_matrix, dtype=torch.float32),
    )


train_dataset = _build_dataset(train_encodings, y_train)
test_dataset = _build_dataset(test_encodings, y_test)

# Only the training loader shuffles; the test loader keeps the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)


### **Text Classifier: BERT encoder with a linear classification head**

In [6]:
from transformers import BertModel
import torch.nn as nn

class BertForMultiLabel(nn.Module):
    """BERT encoder followed by dropout and one linear layer for multi-label output.

    ``forward`` returns raw logits (no sigmoid is applied inside the model), so
    the model is meant to be paired with a logit-based loss such as
    ``nn.BCEWithLogitsLoss``.

    Args:
        num_classes: Size of the output layer, i.e. number of labels to score.
        pretrained_name: Checkpoint identifier passed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled embedding before
            the linear layer. Defaults to ``0.3``.
    """

    def __init__(self, num_classes, pretrained_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # The head's input width follows the encoder's configured hidden size,
        # so swapping the checkpoint does not require touching this layer.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return one logit per class for each sequence in the batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Attention mask tensor forwarded to the encoder.

        Returns:
            The output of the linear classifier applied to the (dropped-out)
            hidden state at sequence position 0.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Position 0 of the last hidden state (the CLS token) is used as the
        # representation of the whole sequence.
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token
        x = self.dropout(cls_embedding)
        return self.classifier(x)
    

# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using:", device)

# Build the classifier with one output per label and move its weights to the device.
model = BertForMultiLabel(num_classes)
model = model.to(device)

# AdamW over all model parameters (encoder and head) with a 2e-5 learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# Binary cross-entropy computed on raw logits, one independent term per label.
loss_fn = nn.BCEWithLogitsLoss()

Using: cpu


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


### **Training Loop**

In [7]:
# NOTE(review): 50 epochs of full BERT fine-tuning is a long run (the recorded
# output shows CPU execution ending in KeyboardInterrupt) — confirm this is intended.
epochs = 50

for epoch in range(epochs):
    model.train()
    total_loss = 0.0  # sum of per-batch losses over the epoch (not an average)
    correct = 0       # number of individual label decisions that match the target
    total = 0         # number of individual label decisions evaluated

    for input_ids, att_mask, labels in train_loader:
        input_ids = input_ids.to(device)
        att_mask = att_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model(input_ids, att_mask)

        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Bookkeeping only: disable autograd so the metric computation does not
        # build (and briefly hold) an unnecessary graph on top of the logits.
        with torch.no_grad():
            # A label counts as predicted when its sigmoid probability exceeds 0.5.
            preds = torch.sigmoid(logits) > 0.5
            correct += (preds == labels).sum().item()
            total += labels.numel()

    # Element-wise (per label, per sample) accuracy measured on the training
    # batches; guard against an empty loader instead of raising ZeroDivisionError.
    acc = correct / total if total else 0.0
    print(f"Epoch {epoch+1}: Loss={total_loss:.4f}, Accuracy={acc:.4f}")

print("Training Finished 🎉")

KeyboardInterrupt: 