
<font size = "6"> "Text Classification with Pretrained Transformer models - Small Demo" </font>

## IMPORT THE DATA

In [1]:
import pandas as pd

# Read the IMDB reviews CSV into a DataFrame.
df = pd.read_csv("datasets/IMDB-Dataset.csv")  # place your dataset

# Distinct sentiment values found in the data; the model-loading cell uses
# len(labels) to size the classification head.
labels = df["sentiment"].unique()

# One print call, one item per line: preview rows, missing-value counts per
# column, the column index and the label set, with the divider lines in between.
print(
    df.head(),
    '------------------------------------------------',
    df.isna().sum(),
    '----------------------------------------------------',
    df.columns,
    '----------------------------------------------------',
    f'The labels of the dataset are: {labels}',
    sep='\n',
)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
------------------------------------------------
review       0
sentiment    0
dtype: int64
----------------------------------------------------
Index(['review', 'sentiment'], dtype='object')
----------------------------------------------------
The labels of the dataset are: ['positive' 'negative']


In [2]:
from sklearn.model_selection import train_test_split

# Materialise reviews and sentiments as plain Python lists of strings.
X = list(map(str, df["review"].values))
y = list(map(str, df["sentiment"].values))

# First cut: hold out 20% of the data as the test set. The split is stratified
# on the label and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

# Second cut: carve 10% of the remaining training data out as a validation set,
# again stratified and seeded. X_train / y_train are rebound to the smaller split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

print("Updated Train size:", len(X_train))
print("Validation size:", len(X_val))

Train size: 40000
Test size: 10000
Updated Train size: 36000
Validation size: 4000


### COUNT THE TOTAL NUMBER OF THE TRAINING, DEVELOPMENT AND TEST SET EXAMPLES

In [3]:
# Report how many examples ended up in each split, one line per split.
print(
    f'The total number of the training examples is: {len(X_train)}',
    f'The total number of the Validation examples is: {len(X_val)}',
    f'The total number of the test examples is: {len(X_test)}',
    sep='\n',
)

The total number of the training examples is: 36000
The total number of the development examples is: 4000
The total number of the test examples is: 10000


### EDA FOR THE DATASET...YOU KNOW WHAT TO DO!

## PREPARE OUR DATASET

### AFTER PREPROCESSING, ENCODE THE LABELS (Y_TRAIN, Y_VAL, Y_TEST)

We are going to use BertTokenizer in order to prepare our datasets for the training and evaluation of our model

In [4]:
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on the training labels and encode them as a 0/1 indicator matrix.
# NOTE(review): y_train holds one plain string per sample, and MultiLabelBinarizer
# iterates over each sample, so every label string is split into its characters --
# the classes printed by this cell are single letters, not the two sentiment values.
# Passing one-element lists (e.g. [[label] for label in y_train]) would give one
# column per sentiment, but the cells further down are written around the current
# 10-column encoding, so the behaviour is intentionally left unchanged here.
mlb = MultiLabelBinarizer()
y_train_encoded = mlb.fit_transform(y_train)

# Show the learned classes and the matrix shape, each followed by a divider line.
for caption, value in (("Classes:", mlb.classes_),
                       ("Shape of encoded labels:", y_train_encoded.shape)):
    print(caption, value)
    print('------------------------------------------------------------------------------------------------------')

# One encoded row as a spot check.
print(y_train_encoded[5])

Classes: ['a' 'e' 'g' 'i' 'n' 'o' 'p' 's' 't' 'v']
------------------------------------------------------------------------------------------------------
Shape of encoded labels: (36000, 10)
------------------------------------------------------------------------------------------------------
[0 1 0 1 0 1 1 1 1 1]


In [5]:
# Encode the validation labels with the binarizer that was fitted on the training
# labels (transform only, no refit), so the columns line up with y_train_encoded.
# NOTE(review): y_val holds plain strings, so each label is encoded character by
# character -- compare the raw label and its encoded row printed below.
y_val_encoded = mlb.transform(y_val)

# Show the classes and the matrix shape, each followed by a divider line.
for caption, value in (("Classes:", mlb.classes_),
                       ("Shape of encoded labels:", y_val_encoded.shape)):
    print(caption, value)
    print('------------------------------------------------------------------------------------------------------')

# Raw label of one validation example next to its encoded row.
print(y_val[5])
print(y_val_encoded[5])

Classes: ['a' 'e' 'g' 'i' 'n' 'o' 'p' 's' 't' 'v']
------------------------------------------------------------------------------------------------------
Shape of encoded labels: (4000, 10)
------------------------------------------------------------------------------------------------------
negative
[1 1 1 1 1 0 0 0 1 1]


In [6]:
# Encode the test labels with the binarizer that was fitted on the training
# labels (transform only, no refit).
# NOTE(review): y_test holds plain strings, so each label is encoded character by
# character, exactly as for the training and validation labels.
y_test_encoded = mlb.transform(y_test)

# Show the classes and the matrix shape, each followed by a divider line.
for caption, value in (("Classes:", mlb.classes_),
                       ("Shape of encoded labels:", y_test_encoded.shape)):
    print(caption, value)
    print('------------------------------------------------------------------------------------------------------')

# One encoded row as a spot check.
print(y_test_encoded[5])

Classes: ['a' 'e' 'g' 'i' 'n' 'o' 'p' 's' 't' 'v']
------------------------------------------------------------------------------------------------------
Shape of encoded labels: (10000, 10)
------------------------------------------------------------------------------------------------------
[1 1 1 1 1 0 0 0 1 1]


### TRAIN A BASELINE MODEL (LOGISTIC REGRESSION) IN ORDER TO COMPARE IT WITH OUR MAIN MODEL

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Baseline inputs: the raw review strings of the training and validation splits.
X_train_final = X_train
X_val_final = X_val

# TF-IDF features. The vocabulary is learned on the training split only and is
# capped at 5000 terms; the validation split is transformed with that vocabulary.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vectorized = vectorizer.fit_transform(X_train_final)
X_val_vectorized = vectorizer.transform(X_val_final)

# One-vs-rest wrapper: one logistic-regression classifier per column of the
# encoded label matrix.
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train_vectorized, y_train_encoded)

# Report on the data the model was fitted on (an optimistic reference point).
y_train_pred = model.predict(X_train_vectorized)
print('Classification Report for the training set')
print(classification_report(y_train_encoded, y_train_pred, target_names=mlb.classes_))
print('---------------------------------------------------------------------------------')

# Report on the held-out validation split.
y_val_pred = model.predict(X_val_vectorized)
# Fixed header: it used to repeat "training set" although this report is
# computed on the validation predictions.
print('Classification Report for the validation set')
print(classification_report(y_val_encoded, y_val_pred, target_names=mlb.classes_))



Classification Report for the training set
              precision    recall  f1-score   support

           a       0.92      0.90      0.91     18000
           e       1.00      1.00      1.00     36000
           g       0.92      0.90      0.91     18000
           i       1.00      1.00      1.00     36000
           n       0.92      0.90      0.91     18000
           o       0.91      0.92      0.91     18000
           p       0.91      0.92      0.91     18000
           s       0.91      0.92      0.91     18000
           t       1.00      1.00      1.00     36000
           v       1.00      1.00      1.00     36000

   micro avg       0.96      0.96      0.96    252000
   macro avg       0.95      0.95      0.95    252000
weighted avg       0.96      0.96      0.96    252000
 samples avg       0.96      0.96      0.96    252000

---------------------------------------------------------------------------------
Classification Report for the training set
              preci

### INSTALL/IMPORT THE TWO MOST IMPORTANT LIBRARIES HERE

In [8]:
# %%capture
# !pip install transformers==4.38.1
# # Check the version of our package *transformer*
import transformers

In [9]:
# %%capture
# !pip install torch==2.2.0 torchvision==0.17.0 torchaudio==2.2.0 --index-url https://download.pytorch.org/whl/cu118
# !pip install torchmetrics torchtext --index-url https://download.pytorch.org/whl/cu118
import torch

In [10]:
# Print the versions of the two libraries imported above, so that results can
# be tied to the environment they were produced in.
print("Transformers version:", transformers.__version__)
print("Torch version:", torch.__version__)

Transformers version: 4.49.0
Torch version: 2.6.0+cpu


### TOKENIZE THE DATASETS VIA PRETRAINED TRANSFORMER MODEL

In [11]:
import transformers
from transformers import BertTokenizer

# Instantiate the BERT tokenizer with WordPiece tokenization.
# 'bert-base-uncased' is the same checkpoint name the classification model is
# loaded from further down in the notebook.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Fixed sequence length: inputs are padded up to it and truncated down to it.
MAX_SEQUENCE_LENGTH = 512


def tokenize_text(data, tokenizer, max_length=MAX_SEQUENCE_LENGTH):
    """Run ``tokenizer`` over ``data`` with fixed-length padding and truncation.

    Args:
        data: Text input, handed to the tokenizer as its first positional argument.
        tokenizer: Callable tokenizer that accepts the keyword options below.
        max_length: Value forwarded as ``max_length``; defaults to
            ``MAX_SEQUENCE_LENGTH``.

    Returns:
        Whatever ``tokenizer`` returns when called with ``padding='max_length'``,
        ``truncation=True`` and ``return_tensors='pt'``.
    """
    encode_options = {
        'padding': 'max_length',
        'max_length': max_length,
        'truncation': True,
        'return_tensors': 'pt',
    }
    return tokenizer(data, **encode_options)


# Tokenize the validation reviews with the default MAX_SEQUENCE_LENGTH.
bert_val = tokenize_text(X_val, bert_tokenizer)
# Bare name as the last expression: the notebook displays the tokenizer output.
bert_val

{'input_ids': tensor([[ 101, 1045, 2428,  ...,    0,    0,    0],
        [ 101, 3100, 1010,  ...,    0,    0,    0],
        [ 101, 2158, 1010,  ...,    0,    0,    0],
        ...,
        [ 101, 2728, 2091,  ...,    0,    0,    0],
        [ 101, 1044, 1012,  ...,    0,    0,    0],
        [ 101, 1045, 2347,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [12]:
# Tokenize the training reviews with the default MAX_SEQUENCE_LENGTH.
bert_train = tokenize_text(X_train, bert_tokenizer)
# Bare name as the last expression: the notebook displays the tokenizer output.
bert_train

{'input_ids': tensor([[ 101, 1996, 3185,  ...,    0,    0,    0],
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2295, 1045,  ..., 2009, 1005,  102],
        ...,
        [ 101, 2023, 3185,  ...,    0,    0,    0],
        [ 101, 3374, 1010,  ...,    0,    0,    0],
        [ 101, 1045, 2428,  ..., 2136, 2988,  102]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [13]:
# Tokenize the test reviews with the default MAX_SEQUENCE_LENGTH.
bert_test = tokenize_text(X_test, bert_tokenizer)
# Bare name as the last expression: the notebook displays the tokenizer output.
bert_test

{'input_ids': tensor([[ 101, 2748, 1010,  ...,    0,    0,    0],
        [ 101, 1996, 2466,  ...,    0,    0,    0],
        [ 101, 1037, 2136,  ...,    0,    0,    0],
        ...,
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 6506, 9413,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

### LOAD THE PRETRAINED TRANSFORMER MODEL OF YOUR CHOICE

In [14]:
from transformers import BertForSequenceClassification

# Load the pretrained BERT checkpoint with a sequence-classification head.
# num_labels is the length of `labels`, which the data-loading cell sets to the
# unique sentiment values of the dataset.
# NOTE(review): the training cell further down can assign a different classifier
# layer to this model -- confirm which output size is actually intended.
model_name = 'bert-base-uncased'
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(labels))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Prefer CUDA when PyTorch reports a usable GPU, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Place the model on the chosen device before any batches are fed to it.
bert_model = bert_model.to(device)

Using device: cpu


### TRANSFORM THE DATA INTO DATA LOADERS

In [16]:
import torch
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset

# Mini-batch size shared by the training and validation loaders.
batch_size = 16

# Turn the encoded label matrices into tensors so they can be stored next to the
# token tensors. The names are rebound, so after this cell they refer to tensors.
y_train_encoded = torch.tensor(y_train_encoded)
y_val_encoded = torch.tensor(y_val_encoded)

# Each dataset item is (input_ids, attention_mask, label row); token_type_ids
# from the tokenizer output are not included.
train_dataset = TensorDataset(
    bert_train['input_ids'],
    bert_train['attention_mask'],
    y_train_encoded,
)
val_dataset = TensorDataset(
    bert_val['input_ids'],
    bert_val['attention_mask'],
    y_val_encoded,
)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

### FREEZE THE EMBEDDINGS AND ENCODER LAYERS 0-5, FINE-TUNE THE REMAINING LAYERS - YOU CAN MODIFY THAT

In [17]:
# Parameter-name prefixes to freeze: the embeddings plus encoder layers 0-5.
# Every prefix ends with "." so that it matches exactly one sub-module. Without
# the dot, a prefix such as 'bert.encoder.layer.2' would also match layers 20-23
# of a deeper checkpoint through startswith(). For the 12-layer model loaded in
# this notebook the set of frozen parameters is the same as before.
param_freeze = ['bert.embeddings.', 'bert.encoder.layer.0.',
                'bert.encoder.layer.1.', 'bert.encoder.layer.2.',
                'bert.encoder.layer.3.', 'bert.encoder.layer.4.',
                'bert.encoder.layer.5.']

# A parameter stays trainable unless its name starts with one of the prefixes
# above, so the upper encoder layers and everything outside the listed
# sub-modules (e.g. the classification head) remain trainable.
for name, param in bert_model.named_parameters():
    param.requires_grad = not name.startswith(tuple(param_freeze))


# Print the names of trainable parameters
for name, param in bert_model.named_parameters():
    if param.requires_grad:
        print(name)

bert.encoder.layer.6.attention.self.query.weight
bert.encoder.layer.6.attention.self.query.bias
bert.encoder.layer.6.attention.self.key.weight
bert.encoder.layer.6.attention.self.key.bias
bert.encoder.layer.6.attention.self.value.weight
bert.encoder.layer.6.attention.self.value.bias
bert.encoder.layer.6.attention.output.dense.weight
bert.encoder.layer.6.attention.output.dense.bias
bert.encoder.layer.6.attention.output.LayerNorm.weight
bert.encoder.layer.6.attention.output.LayerNorm.bias
bert.encoder.layer.6.intermediate.dense.weight
bert.encoder.layer.6.intermediate.dense.bias
bert.encoder.layer.6.output.dense.weight
bert.encoder.layer.6.output.dense.bias
bert.encoder.layer.6.output.LayerNorm.weight
bert.encoder.layer.6.output.LayerNorm.bias
bert.encoder.layer.7.attention.self.query.weight
bert.encoder.layer.7.attention.self.query.bias
bert.encoder.layer.7.attention.self.key.weight
bert.encoder.layer.7.attention.self.key.bias
bert.encoder.layer.7.attention.self.value.weight
bert.encode

In [18]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Only parameters whose ``requires_grad`` flag is set are counted; a model with
    no trainable parameters yields 0.
    """
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

# Print the number of trainable parameters, i.e. those whose requires_grad flag
# is still set on bert_model at this point in the notebook.
print("Number of trainable parameters:", count_parameters(bert_model))

Number of trainable parameters: 43119362


### CUSTOM F1 SCORE CALCULATION

In [19]:
from sklearn.metrics import f1_score
import torch

def f1_metric(logits, labels, threshold=0.5, zero_division=1):
    """
    Macro-averaged F1 score for one batch of multi-label predictions.

    The logits go through a sigmoid, are thresholded into 0/1 predictions and are
    scored per class with scikit-learn's ``f1_score(average=None)``; the per-class
    scores are then averaged with equal weight.

    Args:
    - logits (torch.Tensor): Predicted logits, expected as batch_size x num_classes.
    - labels (torch.Tensor): True 0/1 labels, expected in the same layout as ``logits``.
    - threshold (float): A probability strictly above this value counts as a
      positive prediction (default: 0.5).
    - zero_division (int): Forwarded to ``f1_score``; the value used when a score
      would otherwise divide by zero. Use 0 or 1 (default: 1).

    Returns:
    - f1 (float): Unweighted mean of the per-class F1 scores.
    """
    probabilities = torch.sigmoid(logits)
    # Comparison yields a boolean tensor; move to CPU so scikit-learn can read it.
    predicted = (probabilities > threshold).cpu().numpy()
    targets = labels.cpu().numpy()

    per_class_scores = f1_score(targets, predicted, average=None, zero_division=zero_division)
    return per_class_scores.mean()


### MAIN TRAINING SECTION - READ AGAIN CAREFULLY THE CODE

In [None]:
from IPython.display import display, update_display
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score

# Define the number of epochs
num_epochs = 20

# Where the best checkpoint is written.
# NOTE(review): this looks like a Colab Google Drive mount point -- change it
# when the notebook runs somewhere that directory does not exist.
best_model_path = '/content/drive/MyDrive/best_model.pth'

# Adjust the model's output layer, if not already adjusted.
# The head is sized from the label tensor stored in the training dataset (its
# last tensor) instead of a hard-coded number, and it is only replaced when the
# size differs, so re-running this cell does not throw away a trained head.
# This has to happen BEFORE the optimizer is created: an optimizer only updates
# the parameter tensors it was handed at construction, so a layer swapped in
# afterwards would never be trained.
num_label_columns = train_loader.dataset.tensors[-1].shape[1]
if bert_model.classifier.out_features != num_label_columns:
    bert_model.classifier = torch.nn.Linear(bert_model.config.hidden_size, num_label_columns)
bert_model.to(device)

# Prepare optimizer (built after the head is final, see above)
optimizer = Adam(bert_model.parameters(), lr=1e-5)

# Set up the ReduceLROnPlateau scheduler: shrink the learning rate by 10x when
# the validation loss has not improved for 3 epochs.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

# Unweighted binary cross-entropy on logits, one independent term per label column
loss_function = torch.nn.BCEWithLogitsLoss()

# Initialize a display object for updating output in-place
display_id = 'batch_update'
display_obj = display("", display_id=display_id)

# Initialize lists to store history
train_losses = []
train_f1_scores = []
val_losses = []
val_f1_scores = []
patience = 0  # consecutive epochs without any validation improvement

# Initialize variables to track the best validation metrics and model weights
best_val_loss = float('inf')
best_val_f1 = float('-inf')
best_model_weights = None

# Training loop
for epoch in range(num_epochs):
    bert_model.train()
    total_loss = 0
    total_f1 = 0
    for cnt, batch in enumerate(train_loader, start=1):
        optimizer.zero_grad()  # Reset gradients to zero for each batch
        # Move the tensors to the selected device. The label tensor gets its own
        # name so the module-level `labels` (unique sentiment values) is not overwritten.
        input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)

        # Forward pass; the loss is computed here rather than inside the model
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # BCEWithLogitsLoss needs float targets with the same shape as the logits
        loss = loss_function(logits, batch_labels.float())
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_f1 += f1_metric(logits, batch_labels)
        # Update the displayed output for this batch (running averages so far)
        output = f'For batch "{cnt}": Training - Loss: {total_loss/cnt:.3f}, F1 Score: {total_f1/cnt:.3f}'
        update_display(output, display_id=display_id)
    avg_train_loss = total_loss / len(train_loader)
    avg_train_f1 = total_f1 / len(train_loader)
    train_losses.append(avg_train_loss)
    train_f1_scores.append(avg_train_f1)
    print(f'\nEpoch {epoch+1}/{num_epochs}: Training - Loss: {avg_train_loss:.3f}, F1 Score: {avg_train_f1:.3f}')

    # Validation loop (no gradients needed)
    bert_model.eval()
    total_val_loss = 0
    total_val_f1 = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            val_loss = loss_function(logits, batch_labels.float())

            total_val_loss += val_loss.item()
            total_val_f1 += f1_metric(logits, batch_labels)

    avg_val_loss = total_val_loss / len(val_loader)
    avg_val_f1 = total_val_f1 / len(val_loader)
    val_losses.append(avg_val_loss)
    val_f1_scores.append(avg_val_f1)

    # Printed before the early-stopping check so the final epoch's metrics are
    # shown even when the loop stops here.
    print(f'Validation - Loss: {avg_val_loss:.3f}, F1 Score: {avg_val_f1:.3f}')

    # An epoch counts as an improvement when either metric beats its best so far.
    loss_improved = avg_val_loss < best_val_loss
    f1_improved = avg_val_f1 > best_val_f1
    if loss_improved or f1_improved:
        # Track each best value separately: an epoch that improves only one
        # metric must not overwrite the other metric's best with a worse value.
        best_val_loss = min(best_val_loss, avg_val_loss)
        best_val_f1 = max(best_val_f1, avg_val_f1)
        # state_dict() returns references to the live parameters; clone them so
        # the snapshot is not changed by later optimizer steps.
        best_model_weights = {
            key: value.detach().cpu().clone()
            for key, value in bert_model.state_dict().items()
        }
        torch.save(best_model_weights, best_model_path)  # Make sure the path is correct and accessible
        print(f'Validation improved, saving model to {best_model_path}')
        patience = 0
    else:
        print('Validation did not improve')
        patience += 1
        if patience >= 5:
            print('Early stopping')
            break

    # Check the average validation loss and update the learning rate accordingly
    scheduler.step(avg_val_loss)


'For batch "144": Training - Loss: 0.478, F1 Score: 0.738'

#### TRAINING CURVES FOR OUR TRANSFORMER

In [1]:
%%capture
!pip install matplotlib

import matplotlib.pyplot as plt

def plot_history(train_losses, train_f1_scores, val_losses, val_f1_scores):
    """
    Plot the per-epoch history of loss and F1 score as two separate figures.

    The x axis is the epoch number starting at 1; its length is taken from
    ``train_losses``, so all four lists are expected to hold one value per epoch.

    :param train_losses: List of training losses for each epoch
    :param train_f1_scores: List of training f1 scores for each epoch
    :param val_losses: List of validation losses for each epoch
    :param val_f1_scores: List of validation f1 scores for each epoch
    :return: None
    """
    epochs = range(1, len(train_losses) + 1)

    # Figure 1: training (blue) and validation (red) loss
    plt.plot(epochs, train_losses, 'b', label='Training loss')
    plt.plot(epochs, val_losses, 'r', label='Validation loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # Figure 2: training (blue) and validation (red) f1 scores
    plt.plot(epochs, train_f1_scores, 'b', label='Training f1-score')
    plt.plot(epochs, val_f1_scores, 'r', label='Validation f1-score')
    # Fixed title: the original contained a stray "=" before "f1-scores".
    plt.title('Training and Validation f1-scores')
    plt.xlabel('Epochs')
    plt.ylabel('f1_score')
    plt.legend()
    plt.show()


In [2]:
# Collect the per-epoch metrics recorded by the training loop, one column per metric.
# These lists only exist in a kernel where the training cell has already run.
# NOTE(review): the validation F1 column is stored under the key 'f1_scores'
# (no 'val_' prefix, unlike the other validation column).
history_dict = dict(zip(
    ('train_losses', 'train_f1_scores', 'val_losses', 'f1_scores'),
    (train_losses, train_f1_scores, val_losses, val_f1_scores),
))
history_df = pd.DataFrame(history_dict)

# Write the table without the index column, then draw the curves.
# NOTE(review): the target looks like a Colab Google Drive mount point -- confirm
# the directory exists where this notebook runs.
history_df.to_csv('/content/drive/MyDrive/history_results.csv', index=False)
plot_history(train_losses, train_f1_scores, val_losses, val_f1_scores)

NameError: name 'train_losses' is not defined

#### EVALUATE THE TRAINING SET

In [None]:
# numpy is used below but is otherwise first imported in the next evaluation
# cell, so a top-to-bottom run would fail here with a NameError.
import numpy as np

# Evaluation on training data
# NOTE: the accumulator names say "test" but they hold training-set results here.
bert_model.eval()
predictions_test = []
true_labels_test = []

with torch.no_grad():
    for batch in train_loader:
        # The label tensor gets its own name so the module-level `labels`
        # (unique sentiment values) is not overwritten by a batch tensor.
        input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)

        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Apply sigmoid to the logits and then threshold to get binary predictions.
        # The cut-off here is 0.3, whereas f1_metric defaults to 0.5.
        predictions = torch.sigmoid(logits).cpu().numpy() > 0.3

        # Extend the lists with the per-example rows of this batch
        predictions_test.extend(predictions)
        true_labels_test.extend(batch_labels.cpu().numpy())

# Stack the per-example rows into 2-D arrays for sklearn's classification_report
true_labels_test = np.array(true_labels_test)
predictions_test = np.array(predictions_test)

# Evaluate at the individual label level
print('Classification Report for the training set')
print(classification_report(true_labels_test, predictions_test, target_names=mlb.classes_))


#### EVALUATE THE DEVELOPMENT SET

In [None]:
from sklearn.metrics import classification_report
import torch
import numpy as np

# Evaluation on development data
# NOTE: the accumulator names say "test" but they hold validation-set results here.
bert_model.eval()
predictions_test = []
true_labels_test = []

with torch.no_grad():
    for batch in val_loader:
        # The label tensor gets its own name so the module-level `labels`
        # (unique sentiment values) is not overwritten by a batch tensor.
        input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)

        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Apply sigmoid to the logits and then threshold to get binary predictions.
        # The cut-off here is 0.3, whereas f1_metric defaults to 0.5.
        probabilities = torch.sigmoid(logits).cpu().numpy()
        predictions = probabilities > 0.3

        # Extend the lists with the per-example rows of this batch
        predictions_test.extend(predictions)
        true_labels_test.extend(batch_labels.cpu().numpy())

# Stack the per-example rows into 2-D arrays for sklearn's classification_report
true_labels_test = np.array(true_labels_test)
predictions_test = np.array(predictions_test)

# Evaluate at the individual label level
print('Classification Report for the development set')
print(classification_report(true_labels_test, predictions_test, target_names=mlb.classes_))
