In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
import numpy as np
import pandas as pd

df_n = np.load('drive/MyDrive/cleaned_train_dataset_megafinal_processed.npy', allow_pickle=True)
df = pd.DataFrame(df_n)

In [3]:
df[2].apply(len).mean()

45782.897051941974

In [4]:
df_eval_n = np.load('drive/MyDrive/cleaned_eval_dataset_training.npy', allow_pickle=True)
df_eval = pd.DataFrame(df_eval_n)

In [5]:
df_eval[1].apply(len).mean()

44984.56007751938

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from tqdm.auto import tqdm  # Added tqdm import

import torch.optim as optim
import torch.nn as nn
import random

class TweetSentenceClassifier:
    def __init__(self, model_name='distilbert-base-uncased', num_labels=2):
        """
        Initialize the Tweet Sentence Classifier

        :param model_name: Pretrained model to use as base
        :param num_labels: Number of classification categories
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # If num_labels is not specified, it will be inferred from the dataset
        self.num_labels = num_labels
        self.model = None
        self.model_name = model_name

        # Infer number of labels if not specified
        if self.num_labels is None:
            self.num_labels = len(df[label_column].unique())

        # Initialize model with correct number of labels
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=self.num_labels
        ).to(self.device)

    def prepare_dataset(self, df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """

        # Tokenization function
        def tokenize_function(text):
            return self.tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=4096
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df

    def prepare_general_dataset(self, df, text_column, label_column, test_size=0.2, random_state=42):
        """
        Prepare general dataset for training

        :param df: Pandas DataFrame containing tweets
        :param text_column: Name of column with tweet text
        :param label_column: Name of column with labels
        :param test_size: Proportion of dataset to use for testing
        :param random_state: Random seed for reproducibility
        :return: Tuple of train and test datasets
        """


        # Tokenization function
        def tokenize_function(text):
            return self.tokenizer(
                text,
                truncation=True,
                max_length=len(text)*10
            )

        # Split the data
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state,
        )

        # Tokenize datasets
        train_df['tokenized'] = train_df[text_column].apply(tokenize_function)
        test_df['tokenized'] = test_df[text_column].apply(tokenize_function)

        return train_df, test_df


    def prepare_loaders_from_generals(self, train_df, test_df, label_column, max_length=4096):
        def process_tokens(tokenized_items):
            processed_input_ids = []
            processed_attention_masks = []

            for item in tokenized_items:
                input_ids = item['input_ids']
                attention_mask = item['attention_mask']

                # If sequence is shorter than 512, pad
                if len(input_ids) <= max_length:
                    pad_length = max_length - len(input_ids)
                    padded_input_ids = input_ids + [self.tokenizer.pad_token_id] * pad_length
                    padded_attention_mask = attention_mask + [0] * pad_length
                    processed_input_ids.append(padded_input_ids)
                    processed_attention_masks.append(padded_attention_mask)

                # If sequence is longer than 512, choose random start
                else:
                    # Calculate the maximum possible start index
                    max_start_index = len(input_ids) - max_length

                    # Choose a random start index
                    start_index = random.randint(0, max_start_index)

                    # Extract 512 consecutive tokens
                    windowed_input_ids = input_ids[start_index:start_index + max_length]
                    windowed_attention_mask = attention_mask[start_index:start_index + max_length]

                    processed_input_ids.append(windowed_input_ids)
                    processed_attention_masks.append(windowed_attention_mask)

            return processed_input_ids, processed_attention_masks

        # Ensure random seed for reproducibility
        # random.seed(42)

        # Process train and test data
        train_input_ids, train_attention_mask = process_tokens(train_df['tokenized'])
        test_input_ids, test_attention_mask = process_tokens(test_df['tokenized'])

        # Convert to tensors
        train_input_ids = torch.tensor(train_input_ids)
        train_attention_mask = torch.tensor(train_attention_mask)
        # train_labels = torch.tensor(np.array(train_df[label_column].to_list(), dtype=np.int8))

        test_input_ids = torch.tensor(test_input_ids)
        test_attention_mask = torch.tensor(test_attention_mask)
        # test_labels = torch.tensor(np.array(test_df[label_column].to_list(),dtype=np.int8))

        train_labels = torch.tensor(np.array(train_df[label_column].to_list(), dtype=np.int64), dtype=torch.long)
        test_labels = torch.tensor(np.array(test_df[label_column].to_list(), dtype=np.int64), dtype=torch.long)


        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            train_input_ids,
            train_attention_mask,
            train_labels
        )
        test_dataset = torch.utils.data.TensorDataset(
            test_input_ids,
            test_attention_mask,
            test_labels
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=16,
            shuffle=False
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=16,
            shuffle=False
        )

        return train_loader, test_loader


    def compute_metrics(self, pred):
        """
        Compute evaluation metrics

        :param pred: Predictions from the model
        :return: Dictionary of metrics
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)

        # Compute classification report
        report = classification_report(labels, preds, output_dict=True)

        return {
            'accuracy': report['accuracy'],
            'macro_f1': report['macro avg']['f1-score'],
            'weighted_f1': report['weighted avg']['f1-score']
        }

    def custom_loader_train(self, train_df, test_df, label_column, epochs=1, learning_rate=2e-5):
        """
        Custom training method with tqdm progress bars

        :param train_df: Training dataframe with tokenized column
        :param test_df: Test dataframe with tokenized column
        :param label_column: Name of the label column
        :param epochs: Number of training epochs
        :param learning_rate: Learning rate for optimizer
        """

        # test_input_ids = torch.tensor([x['input_ids'] for x in test_df['tokenized']])
        # test_attention_mask = torch.tensor([x['attention_mask'] for x in test_df['tokenized']])
        # test_labels = torch.tensor(test_df[label_column].values)

        # # Create data loaders

        # test_dataset = torch.utils.data.TensorDataset(
        #     test_input_ids,
        #     test_attention_mask,
        #     test_labels
        # )

        # test_loader = torch.utils.data.DataLoader(
        #     test_dataset,
        #     batch_size=32,
        #     shuffle=False
        # )



        # Prepare optimizer and loss
        optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bars
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            train_loader, test_loader = self.prepare_loaders_from_generals(train_df, test_df, label_column)


            # Wrap train_loader with tqdm for progress bar
            train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Training)", leave=False)
            for batch in train_progress_bar:
                # Unpack batch
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

                # Backward pass
                loss.backward()
                optimizer.step()

                # Compute accuracy
                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()

                # Update progress bar
                train_progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Accuracy': f'{100 * train_correct / train_total:.2f}%'
                })

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            all_preds = []
            all_labels = []

            # Wrap test_loader with tqdm for progress bar
            val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} (Validation)", leave=False)
            with torch.no_grad():
                for batch in val_progress_bar:
                    input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss
                    logits = outputs.logits

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                    # Update progress bar
                    val_progress_bar.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Accuracy': f'{100 * val_correct / val_total:.2f}%'
                    })

            # Print epoch statistics
            # print(f"Epoch {epoch+1}/{epochs}")
            # print(f"Train Loss: {train_loss/len(train_loader):.4f}")
            # print(f"Train Accuracy: {100 * train_correct / train_total:.2f}%")
            # print(f"Val Loss: {val_loss/len(test_loader):.4f}")
            # print(f"Val Accuracy: {100 * val_correct / val_total:.2f}%")

            # Compute detailed classification report
            # report = classification_report(all_labels, all_preds)
            # print("Classification Report:")
            # print(report)

        return self.model

    def custom_train(self, train_df, test_df, label_column, epochs=1, learning_rate=2e-5):
        """
        Custom training method with tqdm progress bars

        :param train_df: Training dataframe with tokenized column
        :param test_df: Test dataframe with tokenized column
        :param label_column: Name of the label column
        :param epochs: Number of training epochs
        :param learning_rate: Learning rate for optimizer
        """
        # Prepare data tensors
        train_input_ids = torch.tensor([x['input_ids'] for x in train_df['tokenized']])
        train_attention_mask = torch.tensor([x['attention_mask'] for x in train_df['tokenized']])
        train_labels = torch.tensor(train_df[label_column].values)

        test_input_ids = torch.tensor([x['input_ids'] for x in test_df['tokenized']])
        test_attention_mask = torch.tensor([x['attention_mask'] for x in test_df['tokenized']])
        test_labels = torch.tensor(test_df[label_column].values)

        # Create data loaders
        train_dataset = torch.utils.data.TensorDataset(
            train_input_ids,
            train_attention_mask,
            train_labels
        )
        test_dataset = torch.utils.data.TensorDataset(
            test_input_ids,
            test_attention_mask,
            test_labels
        )

        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=32,
            shuffle=True
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=32,
            shuffle=False
        )

        # Prepare optimizer and loss
        optimizer = optim.AdamW(
            self.model.parameters(),
            lr=learning_rate
        )
        criterion = nn.CrossEntropyLoss()

        # Training loop with tqdm progress bars
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            train_loss = 0
            train_correct = 0
            train_total = 0

            # Wrap train_loader with tqdm for progress bar
            train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} (Training)", leave=False)
            for batch in train_progress_bar:
                # Unpack batch
                input_ids, attention_mask, labels = [b.to(self.device) for b in batch]

                # Zero gradients
                optimizer.zero_grad()

                # Forward pass
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
                logits = outputs.logits

                # Backward pass
                loss.backward()
                optimizer.step()

                # Compute accuracy
                train_loss += loss.item()
                _, predicted = torch.max(logits, 1)
                train_total += labels.size(0)
                train_correct += (predicted == labels).sum().item()

                # Update progress bar
                train_progress_bar.set_postfix({
                    'Loss': f'{loss.item():.4f}',
                    'Accuracy': f'{100 * train_correct / train_total:.2f}%'
                })

            # Validation phase
            self.model.eval()
            val_loss = 0
            val_correct = 0
            val_total = 0
            all_preds = []
            all_labels = []

            # Wrap test_loader with tqdm for progress bar
            val_progress_bar = tqdm(test_loader, desc=f"Epoch {epoch+1}/{epochs} (Validation)", leave=False)
            with torch.no_grad():
                for batch in val_progress_bar:
                    input_ids, attention_mask, labels = [b.to(self.device) for b in batch]
                    print(labels.shape)
                    outputs = self.model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    loss = outputs.loss
                    logits = outputs.logits

                    val_loss += loss.item()
                    _, predicted = torch.max(logits, 1)
                    val_total += labels.size(0)
                    val_correct += (predicted == labels).sum().item()

                    all_preds.extend(predicted.cpu().numpy())
                    all_labels.extend(labels.cpu().numpy())

                    # Update progress bar
                    val_progress_bar.set_postfix({
                        'Loss': f'{loss.item():.4f}',
                        'Accuracy': f'{100 * val_correct / val_total:.2f}%'
                    })

            # Print epoch statistics
            print(f"Epoch {epoch+1}/{epochs}")
            print(f"Train Loss: {train_loss/len(train_loader):.4f}")
            print(f"Train Accuracy: {100 * train_correct / train_total:.2f}%")
            print(f"Val Loss: {val_loss/len(test_loader):.4f}")
            print(f"Val Accuracy: {100 * val_correct / val_total:.2f}%")

            # Compute detailed classification report
            # report = classification_report(all_labels, all_preds)
            print("Classification Report:")
            # print(report)

        return self.model

    def predict(self, texts, start=0):
        """
        Make predictions on new texts

        :param texts: List of text strings to classify
        :return: Predictions and their probabilities
        """
        # Tokenize inputs
        inputs = self.tokenizer(
            texts[start:],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        # Get model predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions.cpu().numpy(), probs.cpu().numpy()


    def freeze(self):
        for name, param in self.model.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

    def predict_iterate(self, texts):
        """
        Make predictions on new texts

        :param texts: List of text strings to classify
        :return: Predictions and their probabilities
        """
        # Tokenize inputs
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(self.device)

        # Get model predictions
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=1)
            predictions = torch.argmax(probs, dim=1)

        return predictions.cpu().numpy(), probs.cpu().numpy()

In [7]:
# Initialize classifier
classifier = TweetSentenceClassifier(model_name='allenai/longformer-base-4096', num_labels=2)  # Specify number of labels

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
classifier.freeze()

In [9]:
# Prepare dataset (adjust column names as needed)
train_dataset, test_dataset = classifier.prepare_general_dataset(
    df,
    text_column=2,
    label_column=1
)

In [23]:
train_dataset['tokenized'].apply(lambda x: len(x['input_ids'])).mean()

1345.9375

In [26]:
classifier.model(input_ids=torch.tensor(train_dataset['tokenized'][0]['input_ids']).unsqueeze(0).to('cuda'),
                 labels=torch.tensor(1).unsqueeze(0).to('cuda'),)

Input ids are automatically padded to be a multiple of `config.attention_window`: 512


LongformerSequenceClassifierOutput(loss=tensor(0.7026, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0362, -0.0551]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None, global_attentions=None)

In [16]:
# import torch

# def hook_fn(module, input, output):
#     print(f"Module name: {module}, Output shape: {output}")

# # Assuming 'model' is your PyTorch model
# for name, module in classifier.model.named_modules():
#     module.register_forward_hook(hook_fn)

In [10]:
x = classifier.prepare_loaders_from_generals(train_dataset, test_dataset, 1)[0]

In [10]:
# Train the model
model = classifier.custom_loader_train(train_dataset, test_dataset, 1, 10, learning_rate=1e-3)

Epoch 1/10 (Training):   0%|          | 0/107 [00:00<?, ?it/s]

Initializing global attention on CLS token...


Epoch 1/10 (Validation):   0%|          | 0/27 [00:00<?, ?it/s]

Epoch 2/10 (Training):   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 2/10 (Validation):   0%|          | 0/27 [00:00<?, ?it/s]

Epoch 3/10 (Training):   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 3/10 (Validation):   0%|          | 0/27 [00:00<?, ?it/s]

Epoch 4/10 (Training):   0%|          | 0/107 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:
torch.cuda.empty_cache()

In [16]:
from collections import Counter


Counter(classifier.predict(df_eval[1].apply(lambda x: x[0:]).to_list())[0])

OutOfMemoryError: CUDA out of memory. Tried to allocate 6.06 GiB. GPU 0 has a total capacity of 14.75 GiB of which 2.14 GiB is free. Process 3505 has 12.61 GiB memory in use. Of the allocated memory 12.23 GiB is allocated by PyTorch, and 256.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [17]:
torch.save(classifier.model.state_dict(), 'drive/MyDrive/maybe_poor_bert.pth')

In [1]:
import numpy as np

# Define batch size
batch_size = 32

# Split the data into batches
batches = np.array_split(df_eval[1].apply(lambda x: x[0:]).to_list(), len(df_eval[1]) // batch_size + 1)

# Process each batch individually
predictions = []
for batch in batches:
    predictions.extend(classifier.predict(batch))

# Now, you can count the occurrences
from collections import Counter
counter = Counter(predictions)
print(counter)

NameError: name 'df_eval' is not defined