# Assignment 3 - Project: Multi-class Text Classification using Transformers

## Libraries

In [1]:
import re
import string
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from tqdm.notebook import tqdm

## Model 1: Multi-class Text Classification using BERT

### Load dataset

In [2]:
newsgroups = fetch_20newsgroups(subset='all')
data = newsgroups.data
labels = newsgroups.target

### Text preprocessing

In [3]:
def text_preprocessing(text):
    text = re.sub(r'\W', ' ', text) # remove non-word characters
    text = re.sub(r'\s+', ' ', text) # replace multiple spaces with a single space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuations
    text = text.lower() # lower case
    return text

data = [text_preprocessing(doc) for doc in data]

### Tokenization

In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_data = tokenizer(data, padding=True, truncation=True, return_tensors='pt')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Label encoding

In [5]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

### Split into train and test sets

In [6]:
input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']

input_ids_train, input_ids_test, attention_mask_train, attention_mask_test, y_train, y_test = train_test_split(
    input_ids, attention_mask, encoded_labels, test_size=0.2, random_state=0
)

### Create dataloader

In [7]:
train_data = TensorDataset(input_ids_train, attention_mask_train, torch.tensor(y_train))
test_data = TensorDataset(input_ids_test, attention_mask_test, torch.tensor(y_test))

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

### Load pre-trained BERT model

In [8]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=20)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Model training

In [9]:
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}", leave=False) as progress_bar:
        for batch in train_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            model.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix(loss=loss.item())
            progress_bar.update(1)

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.3f}")

    # save the model weights
    torch.save(model.state_dict(), f"bert_model_epoch_{epoch + 1}.pt")

Epoch 1/3:   0%|          | 0/943 [00:00<?, ?it/s]

Epoch 1, Loss: 1.037


Epoch 2/3:   0%|          | 0/943 [00:00<?, ?it/s]

Epoch 2, Loss: 0.316


Epoch 3/3:   0%|          | 0/943 [00:00<?, ?it/s]

Epoch 3, Loss: 0.156


### Model evaluation

In [10]:
# evaluation
model.eval()
preds = []
true_labels = []

with tqdm(total=len(test_loader), desc="Evaluating", leave=False) as progress_bar:
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            progress_bar.update(1)

# calculate metrics
accuracy = accuracy_score(true_labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted')

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")

# detailed classification report
report = classification_report(true_labels, preds, target_names=newsgroups.target_names, digits=3)
print(report)

Evaluating:   0%|          | 0/236 [00:00<?, ?it/s]

Accuracy: 0.917
Precision: 0.918
Recall: 0.917
F1 score: 0.917
                          precision    recall  f1-score   support

             alt.atheism      0.867     0.877     0.872       163
           comp.graphics      0.844     0.884     0.864       190
 comp.os.ms-windows.misc      0.871     0.880     0.876       200
comp.sys.ibm.pc.hardware      0.803     0.791     0.797       196
   comp.sys.mac.hardware      0.893     0.871     0.882       201
          comp.windows.x      0.936     0.960     0.948       198
            misc.forsale      0.870     0.908     0.888       206
               rec.autos      0.933     0.938     0.935       177
         rec.motorcycles      0.951     0.915     0.933       189
      rec.sport.baseball      0.977     0.982     0.980       171
        rec.sport.hockey      0.991     0.987     0.989       233
               sci.crypt      0.984     0.947     0.965       190
         sci.electronics      0.893     0.884     0.888       207
            

## Model 2: Multi-class Text Classification using DistilBERT

In [11]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler

### Load dataset

In [12]:
newsgroups = fetch_20newsgroups(subset='all')
data = newsgroups.data
labels = newsgroups.target

### Text preprocessing

In [13]:
def text_preprocessing(text):
    text = re.sub(r'\W', ' ', text) # remove non-word characters
    text = re.sub(r'\s+', ' ', text) # replace multiple spaces with a single space
    text = text.translate(str.maketrans('', '', string.punctuation)) # remove punctuations
    text = text.lower() # lower case
    return text

data = [text_preprocessing(doc) for doc in data]

### Tokenization

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tokenized_data = tokenizer(data, padding=True, truncation=True, return_tensors='pt')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

### Label encoding

In [15]:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

### Split into train and test sets

In [16]:
input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']

input_ids_train, input_ids_test, attention_mask_train, attention_mask_test, y_train, y_test = train_test_split(
    input_ids, attention_mask, encoded_labels, test_size=0.2, random_state=0
)

### Create dataloader

In [17]:
train_data = TensorDataset(input_ids_train, attention_mask_train, torch.tensor(y_train))
test_data = TensorDataset(input_ids_test, attention_mask_test, torch.tensor(y_test))

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

### Load pre-trained DistilBERT model

In [18]:
model = DistilBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=20)

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.b

### Model training

In [19]:
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3
total_steps = len(train_loader) * epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

for epoch in range(epochs):
    model.train()
    total_loss = 0

    with tqdm(total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}", leave=False) as progress_bar:
        for batch in train_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            model.zero_grad()

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            progress_bar.set_postfix(loss=loss.item())
            progress_bar.update(1)

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_train_loss:.3f}")

    # save the model weights
    torch.save(model.state_dict(), f"bert_model_epoch_{epoch + 1}.pt")

Epoch 1/3:   0%|          | 0/943 [00:00<?, ?it/s]

Epoch 1, Loss: 2.542


Epoch 2/3:   0%|          | 0/943 [00:00<?, ?it/s]

Epoch 2, Loss: 1.827


Epoch 3/3:   0%|          | 0/943 [00:00<?, ?it/s]

Epoch 3, Loss: 1.306


### Model evaluation

In [20]:
# evaluation
model.eval()
preds = []
true_labels = []

with tqdm(total=len(test_loader), desc="Evaluating", leave=False) as progress_bar:
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

            progress_bar.update(1)

# calculate metrics
accuracy = accuracy_score(true_labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='weighted')

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")

# detailed classification report
report = classification_report(true_labels, preds, target_names=newsgroups.target_names, digits=3)
print(report)

Evaluating:   0%|          | 0/236 [00:00<?, ?it/s]

Accuracy: 0.562
Precision: 0.533
Recall: 0.562
F1 score: 0.543
                          precision    recall  f1-score   support

             alt.atheism      0.375     0.331     0.352       163
           comp.graphics      0.422     0.358     0.387       190
 comp.os.ms-windows.misc      0.540     0.580     0.559       200
comp.sys.ibm.pc.hardware      0.387     0.378     0.382       196
   comp.sys.mac.hardware      0.486     0.358     0.413       201
          comp.windows.x      0.712     0.874     0.785       198
            misc.forsale      0.752     0.811     0.780       206
               rec.autos      0.494     0.458     0.475       177
         rec.motorcycles      0.564     0.466     0.510       189
      rec.sport.baseball      0.600     0.684     0.639       171
        rec.sport.hockey      0.785     0.798     0.791       233
               sci.crypt      0.728     0.789     0.758       190
         sci.electronics      0.376     0.396     0.386       207
            

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
