In [45]:
import pandas as pd

# Read both splits from disk; the CSV files have no header row
train_df = pd.read_csv('/home/administrator/personal/data/train.csv', header=None)
validation_df = pd.read_csv('/home/administrator/personal/data/validation.csv', header=None)

# Both frames get the same two column names: text first, label second
train_df.columns = validation_df.columns = ['text', 'label']

# Sanity check: show the first rows of each split, training first
print(train_df.head(), validation_df.head(), sep='\n')

                                                text label
0    We extend to natural deduction the approach ...    LO
1    Over the last decade, the IEEE 802.11 has em...    NI
2    Motivated by the problem of storing coloured...    DS
3    We consider the downlink of a cellular syste...    NI
4    Meroitic is the still undeciphered language ...    CL
                                                text label
0    Manne et al. designed the first algorithm co...    DC
1    We consider the challenge of creating guidel...    SE
2    Network virtualization techniques allow for ...    NI
3    In the Min $k$-Cut problem, input is an edge...    DS
4    We introduce the notion of being Weihrauch-c...    LO


In [40]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Fetch the NLTK resources used below, in order:
# tokenizer models, stopword list, lemmatizer data
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Stop-word set and lemmatizer are built lazily on first use and then shared by
# every call, instead of being rebuilt once per row.
_STOP_WORDS = None
_LEMMATIZER = None


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first call."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


# Preprocessing function
def preprocess_text(text):
    """Normalise one document into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, delete every character that is not an ASCII
    letter or whitespace, tokenize, drop English stopwords, lemmatize.

    Note that characters are deleted rather than replaced by a space, so
    hyphenated terms are merged (e.g. 'k-cut' becomes 'kcut').

    Args:
        text: The raw document. Non-string values (e.g. None or the NaN that
            pandas produces for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned text as a single string; '' for non-string input.
    """
    # Missing values would otherwise fail on .lower(); map them to empty text.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer = _get_text_resources()

    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and non-alphabetic characters
    words = word_tokenize(text)  # Tokenize the text into words

    # Drop stopwords first, then lemmatize what remains, and re-join with spaces
    return ' '.join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)

# Clean the 'text' column of both splits with the same preprocessing function;
# each frame's column is overwritten with the cleaned strings
for frame in (train_df, validation_df):
    frame['text'] = frame['text'].apply(preprocess_text)

# Show the first rows of each split to confirm the cleaned text looks as expected
print(train_df.head(), validation_df.head(), sep='\n')


[nltk_data] Downloading package punkt to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/administrator/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                text label
0  extend natural deduction approach linear neste...    LO
1  last decade ieee emerged popular protocol wire...    NI
2  motivated problem storing coloured de bruijn g...    DS
3  consider downlink cellular system address prob...    NI
4  meroitic still undeciphered language ancient c...    CL
                                                text label
0  manne et al designed first algorithm computing...    DC
1  consider challenge creating guideline evaluate...    SE
2  network virtualization technique allow coexist...    NI
3  min kcut problem input edge weighted graph g i...    DS
4  introduce notion weihrauchcomplete layerwise c...    LO


In [41]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder

# Encode labels first so the classifier head can be sized from the data
label_encoder = LabelEncoder()  # Initialize the label encoder
train_labels_encoded = label_encoder.fit_transform(train_df['label'])  # Fit on training labels and map them to integers
# Reuse the mapping learned on the training split; scikit-learn documents that
# transform raises ValueError for labels not seen during fit
validation_labels_encoded = label_encoder.transform(validation_df['label'])

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # Initialize the BERT tokenizer
# Size the classification head from the number of distinct training labels
# rather than a hard-coded count, so it cannot drift out of sync with the data
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)

# Custom Dataset Class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Tokenization happens lazily in ``__getitem__``, so nothing is encoded until
    an item is requested.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; it is invoked directly with the text and
            the padding/truncation keyword arguments shown in ``__getitem__``.
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts  # Sequence of texts
        self.labels = labels  # Corresponding labels
        self.tokenizer = tokenizer  # Tokenizer used to encode each text
        self.max_len = max_len  # Maximum sequence length for the model input

    def __len__(self):
        """Return the number of samples (the number of texts)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it together with its label.

        Returns:
            A dict with keys 'input_ids' and 'attention_mask' (the tokenizer's
            tensors, flattened to 1-D) and 'labels' (a ``torch.long`` scalar).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # Add special tokens ([CLS], [SEP]) to the text
            max_length=self.max_len,  # Target length for padding/truncation
            return_token_type_ids=False,  # Token type IDs are not used
            padding='max_length',  # Pad every sequence up to max_length
            truncation=True,  # Cut sequences longer than max_length
            return_attention_mask=True,  # Mask distinguishes real tokens from padding
            return_tensors='pt',  # Return PyTorch tensors
        )
        return {
            # flatten() collapses the tensors to 1-D (presumably dropping a
            # leading batch dimension of 1 added by return_tensors='pt')
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Wrap each split in a CustomDataset; sequences are padded/truncated to 128 tokens
train_dataset = CustomDataset(
    texts=train_df['text'].tolist(),
    labels=train_labels_encoded,
    tokenizer=tokenizer,
    max_len=128,
)
validation_dataset = CustomDataset(
    texts=validation_df['text'].tolist(),
    labels=validation_labels_encoded,
    tokenizer=tokenizer,
    max_len=128,
)

# Batch both datasets in groups of 16; only the training loader shuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(dataset=validation_dataset, batch_size=16)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
from torch.optim import AdamW  # Import AdamW optimizer from PyTorch
from transformers import get_linear_schedule_with_warmup  # Import learning rate scheduler from Hugging Face's Transformers
from sklearn.metrics import accuracy_score  # Import accuracy_score from scikit-learn for calculating accuracy

# Set up training parameters
# Single source of truth for the epoch count: it sizes the learning-rate
# schedule AND bounds the loop below, so the two can never disagree.
NUM_EPOCHS = 3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Use the GPU if available, otherwise the CPU
model = model.to(device)  # Move the model to the chosen device
optimizer = AdamW(model.parameters(), lr=2e-5)  # AdamW over all model parameters
total_steps = len(train_loader) * NUM_EPOCHS  # Batches per epoch * number of epochs
# Linear schedule with zero warmup steps, spanning total_steps optimizer steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()  # Set the model to training mode

    # Per-epoch running totals, kept as plain Python numbers so their type
    # does not change after the first batch
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: passing labels makes the model return the loss as well
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()
        preds = logits.argmax(dim=1)  # Predicted class = index of the largest logit
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)

        loss.backward()  # Backward pass: compute gradients
        optimizer.step()  # Update model parameters
        scheduler.step()  # Advance the learning-rate schedule once per batch
        optimizer.zero_grad()  # Clear gradients before the next batch

    avg_loss = total_loss / len(train_loader)  # Mean batch loss over the epoch
    accuracy = correct_predictions / total_samples  # Training accuracy over the epoch

    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')


Epoch 1, Loss: 0.4883, Accuracy: 0.8489
Epoch 2, Loss: 0.2260, Accuracy: 0.9289
Epoch 3, Loss: 0.1305, Accuracy: 0.9627


In [43]:
from sklearn.metrics import classification_report  # Import classification_report to generate a detailed performance report

# Switch the model to evaluation mode before scoring the validation split
model.eval()

predictions = []  # Per-batch predicted class ids
true_labels = []  # Per-batch ground-truth class ids

# Gradients are not needed for inference
with torch.no_grad():
    for batch in validation_loader:
        # Move the batch to the same device as the model
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass without labels: only the logits are needed here
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = logits.argmax(dim=1)  # Predicted class = index of the largest logit

        predictions.append(preds)
        true_labels.append(labels)

# Join the per-batch tensors, then move them to the CPU as NumPy arrays
predictions = torch.cat(predictions).cpu().numpy()
true_labels = torch.cat(true_labels).cpu().numpy()

# Map the integer ids back to the original label strings
predictions_decoded, true_labels_decoded = (
    label_encoder.inverse_transform(ids) for ids in (predictions, true_labels)
)

# Build the report as a dict so the weighted-average F1 can be read out of it
report = classification_report(true_labels_decoded, predictions_decoded, output_dict=True)
weighted_f1_score = report['weighted avg']['f1-score']

# Headline metric first, then the full per-class table in text form
print(f"Weighted F1 Score on the validation set: {weighted_f1_score:.4f}")
print("\nFull Classification Report:\n")
print(classification_report(true_labels_decoded, predictions_decoded))


Weighted F1 Score on the validation set: 0.9127

Full Classification Report:

              precision    recall  f1-score   support

          CL       0.98      0.98      0.98      1866
          CR       0.91      0.92      0.91      1835
          DC       0.84      0.80      0.82      1355
          DS       0.92      0.94      0.93      1774
          LO       0.92      0.91      0.91      1217
          NI       0.92      0.91      0.91      1826
          SE       0.88      0.91      0.90      1327

    accuracy                           0.91     11200
   macro avg       0.91      0.91      0.91     11200
weighted avg       0.91      0.91      0.91     11200

