<a href="https://colab.research.google.com/github/henrycgbaker/nlp_research_note/blob/main/nlp_research_note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import kagglehub
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import spacy
import spacy.cli
from sklearn.feature_extraction.text import CountVectorizer
import fasttext.util as fasttext_util
import fasttext
from sklearn.metrics import f1_score
from collections import Counter
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import tqdm
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
from sklearn.metrics import f1_score, accuracy_score
import warnings
import sys

warnings.filterwarnings("ignore")

In [7]:
# Hertie server
'''
import os
# Set the environment variables
os.environ['HOME_CONFIG'] = './/workspace/workspace'
os.environ['KAGGLE_CONFIG'] = './workspace/workspace/.kaggle'
os.environ['SPACY_CACHE'] = '/workspace/workspace/cache'

# Optionally, check if the environment variables were set correctly
print(os.getenv('HOME_CONFIG'))
print(os.getenv('KAGGLE_CONFIG'))
print(os.getenv('SPACY_CACHE'))
'''

"\nimport os\n# Set the environment variables\nos.environ['HOME_CONFIG'] = './/workspace/workspace'\nos.environ['KAGGLE_CONFIG'] = './workspace/workspace/.kaggle'\nos.environ['SPACY_CACHE'] = '/workspace/workspace/cache'\n\n# Optionally, check if the environment variables were set correctly\nprint(os.getenv('HOME_CONFIG'))\nprint(os.getenv('KAGGLE_CONFIG'))\nprint(os.getenv('SPACY_CACHE'))\n"

In [8]:
# download pretrained embeddings -----------------------------------------------
# for local
fasttext.util.download_model('en', if_exists='ignore')
ft_path = "./cc.en.300.bin"

# for Colab
# !pip install datasets fasttext evaluate
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/Othercomputers/My MacBook Pro/Documents/repositories/nlp/nlp_research_note')
#ft_path = "./cc.en.300.bin"

ft = fasttext.load_model(ft_path)

# download spacy model for tokenization ----------------------------------------
cache_path = './cache/'
os.makedirs(cache_path, exist_ok=True)
os.environ['SPACY_DATA'] = cache_path
spacy.cli.download("en_core_web_sm")

# load helper functions & scripts ----------------------------------------------
sys.path.append('./aux_scripts')
from  misinfo_tokenizer import (get_trained_tokenizer,
                                batch_tokenize,
                                vocab_mapping,
                                custom_analyzer
                                )
from data_loader_helpers import (Collator,
                                 embedding_mapping_fasttext
                                 )

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


---
# Import & process Hugging Face `misinfo` dataset

In [9]:
hf_cache_dir = os.getenv("HF_DATASETS_CACHE", "./cache/huggingface/datasets")
dataset_path = os.path.join(hf_cache_dir, "roupenminassian", "twitter-misinformation")

if os.path.exists(dataset_path):
    print(f"Dataset found in cache: {dataset_path}")
    ds = load_dataset("roupenminassian/twitter-misinformation", cache_dir=hf_cache_dir)
else:
    print(f"Dataset not found in cache. Downloading...")
    ds = load_dataset("roupenminassian/twitter-misinformation")

print(f'Cache Directory: \n{hf_cache_dir}')
print(f'\nExternal Structure: \n{ds.shape}')
print(f'\nInternal Structure: \n{ds["train"]}')

Dataset not found in cache. Downloading...
Cache Directory: 
./cache/huggingface/datasets

External Structure: 
{'train': (92394, 4), 'test': (10267, 4)}

Internal Structure: 
Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'text', 'label'],
    num_rows: 92394
})


In [10]:
# DATA PARTITIONING =====================================================================
ds_cloned = ds.copy()

ds_cloned['train'] = ds_cloned['train'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])
ds_cloned['test'] = ds_cloned['test'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])

df_misinfo_train = pd.DataFrame(ds_cloned['train'], columns=["text", "label"])
df_misinfo_test = pd.DataFrame(ds_cloned['test'], columns=["text", "label"])

print("PRE-BALANCING:\n")
print(f"Train shape {df_misinfo_train.shape} \n")
print("Training positive vs negative examples: \n", df_misinfo_train.value_counts("label")/df_misinfo_train.shape[0])
print("\nTesting positive vs negative examples: \n",df_misinfo_test.value_counts("label")/df_misinfo_test.shape[0])

# balance train split -------------------------------------------------------------------

balancer = RandomUnderSampler(random_state=42, sampling_strategy = 'majority')
df_misinfo_train_balanced = pd.concat(balancer.fit_resample(X = df_misinfo_train.iloc[:,[0]],
                                                           y = df_misinfo_train.iloc[:,[1]]),
                                     axis=1).sample(frac = 1).reset_index(drop=True)
df_misinfo_train_balanced.value_counts("label")/df_misinfo_train_balanced.shape[0]

print("\nPost-BALANCING:\n")
print(f"Train shape {df_misinfo_train_balanced.shape} \n")
df_misinfo_train_balanced.head()

PRE-BALANCING:

Train shape (92394, 2) 

Training positive vs negative examples: 
 label
0    0.652737
1    0.347263
Name: count, dtype: float64

Testing positive vs negative examples: 
 label
0    0.659686
1    0.340314
Name: count, dtype: float64

Post-BALANCING:

Train shape (64170, 2) 



Unnamed: 0,text,label
0,While a lot of the nation seems to not give a ...,1
1,#WHO Admits That #BillGates-Backed \n💥VACCINES...,1
2,fine....*sounds a bit annoyed* can I try flyin...,0
3,Set off by a comment about his small hands on ...,1
4,MANILA (Reuters) - The Philippine capital s po...,0


In [None]:
# TOKENISATION ==========================================================================

tokenizer_file = './cache/misinfo_tokenizer.pkl'
train_tokens_file = './cache/misinfo_train_tokens.pkl'
test_tokens_file = './cache/misinfo_test_tokens.pkl'

if os.path.exists(train_tokens_file) and os.path.exists(test_tokens_file):
    print("Tokenized text pkl files found: loading data...")
    # Load pre-saved tokenized data
    with open(train_tokens_file, 'rb') as f:
        misinfo_train_tokens = pickle.load(f)
    with open(test_tokens_file, 'rb') as f:
        misinfo_test_tokens = pickle.load(f)

else:
    print("Pickle files not found. Running tokenization...")

    # 1) Get the trained tokenizer (will create if it doesn't exist)
    #    'df_misinfo_train["text"]' is used to fit the vocabulary
    misinfo_tokenizer = get_trained_tokenizer(
        df_misinfo_train["text"],
        tokenizer_file=tokenizer_file,
        min_df=3
    )

    # 2) Build an analyzer from the trained tokenizer
    #    Alternatively, you can directly use your custom_analyzer
    misinfo_tokenizer_analyzer = misinfo_tokenizer.build_analyzer()

    # 3) Tokenize train data in batches
    print("Tokenizing Train Data in Batches...")
    misinfo_train_tokens = batch_tokenize(
        df_misinfo_train["text"],
        32,
        misinfo_tokenizer_analyzer
    )

    # 4) Tokenize test data in batches (using a lambda to leverage custom_analyzer)
    print("Tokenizing Test Data in Batches...")
    misinfo_test_tokens = batch_tokenize(
        df_misinfo_test["text"],
        32,
        custom_analyzer
    )

    # 5) Save the tokenized train and test data
    with open(train_tokens_file, 'wb') as f:
        pickle.dump(misinfo_train_tokens, f)
    with open(test_tokens_file, 'wb') as f:
        pickle.dump(misinfo_test_tokens, f)

print("Done! Train tokens count:", len(misinfo_train_tokens))
print("Test tokens count:", len(misinfo_test_tokens))

Pickle files not found. Running tokenization...
Tokenizer file './cache/misinfo_tokenizer.pkl' found. Loading it...
Tokenizing Train Data in Batches...
Tokenizing batch 20 of 2888...
Tokenizing batch 40 of 2888...
Tokenizing batch 60 of 2888...
Tokenizing batch 80 of 2888...
Tokenizing batch 100 of 2888...
Tokenizing batch 120 of 2888...
Tokenizing batch 140 of 2888...
Tokenizing batch 160 of 2888...


KeyboardInterrupt: 

In [35]:
# STEP 1: INPUT PIPELINE ================================================================

# vocabulary indexing -------------------------------------------------------------------
print ("Vocab indexing...")

#getting vocab_mapping() from misinfo_tokenizer.py

vocab_idx = vocab_mapping(tokenized_text=misinfo_train_tokens)

# create data loaders -------------------------------------------------------------------

print("Creating data loaders...")

max_seq_length = 300
batch_size = 32

train_dl = DataLoader(
    dataset=list(zip(misinfo_train_tokens, df_misinfo_train_balanced["label"])),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=Collator(vocab_idx, max_seq_length)
)

test_dl = DataLoader(
    dataset=list(zip(misinfo_test_tokens, df_misinfo_test["label"])),
    batch_size=batch_size,
    shuffle=False,
    collate_fn=Collator(vocab_idx, max_seq_length)
)

print("Created data loaders!")

# map pretrained fasttext embeddings to vocabulary indices ------------------------------
print("Mapping pretrained fasttext embeddings to vocabulary indices...")

# Define the file path for the pickle file
embeddings_file_path = "./cache/mapped_pretrained_embeddings.pkl"

# for Hertie GPU:
#pickle_file_path = "/workspace/workspace/mapped_pretrained_embeddings.pkl"

# Check if the pickle file already exists
if os.path.exists(embeddings_file_path):
    # If the file exists, load it from the pickle file
    with open(embeddings_file_path, 'rb') as f:
        embedding_tensor = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {embeddings_file_path}. Shape: {embedding_tensor.shape}")
else:
    # If the file does not exist, proceed with creating the embeddings and save them
    # Load pre-trained FastText model
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")


    # Map pretrained FastText embeddings to vocabulary indices
    mapped_pretrained_embeddings = embedding_mapping_fasttext(vocabulary=vocab_idx,
                                                              pre_trained_embeddings=ft)
    print("Mapped!")

    # Convert mapped embeddings to a tensor
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings)

    # Save the embeddings to a pickle file
    with open(embeddings_file_path, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {embeddings_file_path}. Shape: {embedding_tensor.shape}")

Vocab indexing...
Creating data loaders...
Created data loaders!
Mapping pretrained fasttext embeddings to vocabulary indices...
Emebddings pre-exists: loaded embeddings from ./cache/mapped_pretrained_embeddings.pkl. Shape: torch.Size([217732, 300])


In [13]:
def train(model, num_epochs, train_dl, test_dl, use_lengths=False):
    """Trains and evaluates a binary classification model (CNN, RNN, LSTM, etc.).

    Args:
        model (nn.Module): PyTorch model
        num_epochs (int): Number of epochs
        train_dl (DataLoader): DataLoader for training
        test_dl (DataLoader): DataLoader for testing/validation
        use_lengths (bool): Whether to compute and pass lengths to the model (for LSTMs, RNNs, etc.)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Create an optimizer (Adam by default)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    f1_hist_train = [0] * num_epochs
    loss_hist_test = [0] * num_epochs
    accuracy_hist_test = [0] * num_epochs
    f1_hist_test = [0] * num_epochs

    for epoch in range(num_epochs):
        ### ------------------ TRAINING LOOP ------------------ ###
        model.train()
        all_train_preds = []
        all_train_labels = []

        print(f"Epoch {epoch + 1}/{num_epochs} Training...")

        for batch_idx, (x_batch, y_batch) in enumerate(train_dl):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            # (1) Optionally compute sequence lengths, if required:
            if use_lengths:
                lengths = torch.sum(x_batch != 0, dim=1)
                pred = model(x_batch, lengths)[:, 0]
            else:
                pred = model(x_batch)[:, 0]

            # (2) Compute loss
            loss = nn.BCEWithLogitsLoss()(pred, y_batch.float())

            # (3) Backprop and update
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # (4) Bookkeeping: accumulate metrics
            loss_hist_train[epoch] += loss.item() * y_batch.size(0)

            # Binary accuracy: (pred >= 0.5) vs actual
            is_correct = ((pred >= 0.5).float() == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum()

            # Store predictions & labels for F1
            all_train_preds.extend((pred >= 0.5).cpu().numpy())
            all_train_labels.extend(y_batch.cpu().numpy())

            # (5) Print batch progress
            if (batch_idx + 1) % 1000 == 0 or (batch_idx + 1) == len(train_dl):
                print(f"    Batch {batch_idx+1}/{len(train_dl)}: Loss: {loss.item():.4f}")

        # Compute average training metrics for the epoch
        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)
        f1_hist_train[epoch] = f1_score(all_train_labels, all_train_preds)

        ### ------------------ EVALUATION LOOP ------------------ ###
        model.eval()
        all_test_preds = []
        all_test_labels = []

        print(f"Epoch {epoch + 1}/{num_epochs} Evaluating...")

        with torch.no_grad():
            for batch_idx, (x_batch, y_batch) in enumerate(test_dl):
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # Optionally compute lengths for test set
                if use_lengths:
                    lengths = torch.sum(x_batch != 0, dim=1)
                    pred = model(x_batch, lengths)[:, 0]
                else:
                    pred = model(x_batch)[:, 0]

                loss = nn.BCEWithLogitsLoss()(pred, y_batch.float())

                # Bookkeeping
                loss_hist_test[epoch] += loss.item() * y_batch.size(0)
                is_correct = ((pred >= 0.5).float() == y_batch).float()
                accuracy_hist_test[epoch] += is_correct.sum()
                all_test_preds.extend((pred >= 0.5).cpu().numpy())
                all_test_labels.extend(y_batch.cpu().numpy())

                # Print batch progress
                if (batch_idx + 1) % 1000 == 0 or (batch_idx + 1) == len(test_dl):
                    print(f"    Batch {batch_idx + 1}/{len(test_dl)}: Loss: {loss.item():.4f}")

        # Compute average test metrics for the epoch
        loss_hist_test[epoch] /= len(test_dl.dataset)
        accuracy_hist_test[epoch] /= len(test_dl.dataset)
        f1_hist_test[epoch] = f1_score(all_test_labels, all_test_preds)

        # Print epoch summary
        print(f"Epoch {epoch + 1}/{num_epochs} Summary:")
        print(f"    Train - Accuracy: {accuracy_hist_train[epoch]:.3f}, F1: {f1_hist_train[epoch]:.3f}")
        print(f"    Test  - Accuracy: {accuracy_hist_test[epoch]:.3f}, F1: {f1_hist_test[epoch]:.3f}")

    return (loss_hist_train, loss_hist_test, accuracy_hist_train,
            accuracy_hist_test, f1_hist_train, f1_hist_test)

In [14]:
# STEP 4: MODEL BUILDING ================================================================
# CNN-based text classification model

class TextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layers
        convolution_layer = nn.Conv1d(in_channels=embedding_tensor.size(1),
                                      out_channels=128,
                                      kernel_size=3,
                                      padding="same")
        activation_layer = nn.ReLU()
        pooling_layer = nn.AdaptiveAvgPool1d(1)
        h_layers = [convolution_layer, activation_layer, pooling_layer]
        self.hidden_layers = nn.ModuleList(h_layers)
        # classification layer
        self.classification_layer = nn.Linear(in_features=128, out_features=1)

    # define forward pass
    def forward(self, x):
        x = self.embedding_layer(x).permute(0, 2, 1)

        for layer in self.hidden_layers:
            x = layer(x)

        x = x.squeeze(2)

        x = self.classification_layer(x)
        return x

model_cnn = TextClassificationModel(embedding_tensor=embedding_tensor)

In [15]:
# Train the model
num_epochs = 10
hist_cnn = train(model_cnn, num_epochs=10, train_dl=train_dl, test_dl=test_dl, use_lengths=False)
torch.save(model_cnn, "./models/cnn_model_full.pth")

Epoch 1/10 Training...


KeyboardInterrupt: 

In [None]:
# EXTENSION 1: RNN =====================================================================

class RNNTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.rnn_layer = nn.RNN(input_size=embedding_tensor.size(1),
                                hidden_size=32,
                                num_layers=1, # increase to stack RNNs
                                batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, h_t = self.rnn_layer(x) # o_t includes the outputs,
                                     # h_t the hidden state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_rnn = RNNTextClassificationModel(embedding_tensor=embedding_tensor)
hist_rnn = train(model_rnn, num_epochs, train_dl, test_dl, use_lengths=True) # fluctuating f1 scores, exploding gradients
torch.save(model_rnn, "./models/rnn_model_full.pth")

Epoch 1/10 Training...
    Batch 1000/2006: Loss: 0.6791


KeyboardInterrupt: 

In [23]:
# EXTENSION 2: LSTM =====================================================================

class LSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                  hidden_size=32,
                                  num_layers=1,
                                  batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.lstm_layer(x) # c_t the cell state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_lstm = LSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist_lstm = train(model_lstm, num_epochs=10, train_dl=train_dl, test_dl=test_dl, use_lengths=True)
torch.save(model_lstm, "./models/lstm_model_full.pth")

Epoch 1/10 Training...


KeyboardInterrupt: 

In [24]:
# EXTENSION 2.5: STACKING LSTM LAYERS WITH DIFFERENT HIDDEN SIZES =========================

class StackedLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer_1 = nn.LSTM(input_size=embedding_tensor.size(1),
                                    hidden_size=64,
                                    num_layers=1,
                                    batch_first=True)
        self.lstm_layer_2 = nn.LSTM(input_size=64,
                                    hidden_size=32,
                                    num_layers=1,
                                    batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t_1, (h_t_1, c_t_1) = self.lstm_layer_1(x)
        o_t_2, (h_t_2, c_t_2) = self.lstm_layer_2(o_t_1)
        x = h_t_2[-1, :, :]
        x = self.classification_layer(x)
        return x

model_lstm_stacked = StackedLSTMTextClassificationModel(embedding_tensor=embedding_tensor)
hist = train(model_lstm_stacked, num_epochs, train_dl, test_dl, use_lengths=True)
torch.save(model_lstm_stacked, "./models/lstm_stacked_model_full.pth")

Epoch 1/10 Training...


KeyboardInterrupt: 

In [25]:
# EXTENSION 4: BI-DIRECTIONAL LSTM ======================================================

class BidirectionalLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.bid_lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                      hidden_size=32,
                                      num_layers=1,
                                      batch_first=True,
                                      bidirectional=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32*2, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.bid_lstm_layer(x)
        x = torch.cat((h_t[-2, :, :],
                       h_t[-1, :, :]), dim=1)
        x = self.classification_layer(x)
        return x

model_bi_lstm = BidirectionalLSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist = train(model_bi_lstm, num_epochs, train_dl, test_dl, use_lengths=True)
torch.save(model_bi_lstm, "./models/bi_lstm_model_full.pth")

Epoch 1/10 Training...


KeyboardInterrupt: 

# Transformer & Transfer Learning

In [16]:
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel

# Load checkpoint and tokenizer
checkpoint = "google/bert_uncased_L-2_H-128_A-2"
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_uncased = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Define the features of the dataset
features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=2, names=['factual', 'misinfo']),
})

# Convert train and test data to Hugging Face Dataset
dataset_train = Dataset.from_pandas(df_misinfo_train, features=features)
dataset_test = Dataset.from_pandas(df_misinfo_test, features=features)

# Display the first few rows of the training dataset
print(dataset_train[2]) 

# Check the unique values of the 'label' column to ensure the classes are correct
unique_labels = set(dataset_train['label'])
print("Unique label values in training data:", unique_labels)

# Check the mapping of integer labels to class names
print("Class name mapping:", dataset_train.features['label'].int2str)

# Create a Hugging Face DatasetDict
dataset_dict = DatasetDict({
    'train': dataset_train,
    'test': dataset_test
})

# Print the DatasetDict to check its contents
print(f'\n {dataset_dict}')

print(dataset_train[2]) 
print(dataset_dict['train'][2])

{'text': 'The only reality show Donald Trump should have ever been featured in is The Biggest Loser because he just got his ass handed to him in court.Two years ago, Trump National Doral Miami golf resort signed a contract worth $200,000 for a local business called The Paint Spot to provide paint used to renovate the golf course.Well, guess who tried to stiff The Paint Spot of the final $34,863 payment in the deal?Yeah, that would be Republican nominee Donald J. Trump.Trump and his company refused to honor the contract by not paying the final payment, saying that they ve  paid enough  for the paint. In other words, Trump negotiated a deal that ended up costing him more in the end, just like the kinds of deals he wants to negotiate for America with the rest of the world.Anyway, Paint Spot owner Juan Carlos Enriquez filed suit against Trump in court, and Judge Jorge Cueto just slapped Trump and his company with a $300,000 hit to cover The Paint Shop s attorney and court fees, nearly ten 

In [18]:
# tokenize ------------------------------------------------------------------------------
def tokenize_function(dataset):
    return bert_tokenizer(dataset["text"], truncation=True, padding="max_length", max_length=512)
    # truncates at 512 for the chosen checkpoint

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets

tokenized_datasets['train'][0]['text']
tokenized_datasets['train'][0]['label']
tokenized_datasets['train'][0]['input_ids']
tokenized_datasets['train'][0]['attention_mask']

# fine-tune -----------------------------------------------------------------------------
training_args = TrainingArguments(output_dir="./transformer_results",
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  num_train_epochs=30,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='f1',
                                  disable_tqdm=False,
                                  use_cpu=False)

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    bert_uncased,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Save the model and tokenizer after training
trainer.save_model("./models/transformer_results")  
bert_tokenizer.save_pretrained("./models/transformer_results")  

Map: 100%|██████████| 92394/92394 [00:22<00:00, 4055.69 examples/s]
Map: 100%|██████████| 10267/10267 [00:02<00:00, 4863.68 examples/s]
  0%|          | 45/86640 [00:20<6:38:39,  3.62it/s] 

KeyboardInterrupt: 

In [None]:
# predict -------------------------------------------------------------------------------
bert_uncased.eval()

# Helper function to process data in batches
def batch_predict(model, tokenizer, texts, batch_size=16, max_length=512):
    all_preds = []
    # Check if GPU is available and move model to GPU
    if torch.cuda.is_available():
        model = model.cuda()

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            end = min(start + batch_size, len(texts))
            batch_texts = texts[start:end]

            # Tokenize the batch of texts
            tokenized_batch = tokenizer(batch_texts, truncation=True, padding="max_length",
                                        max_length=max_length, return_tensors="pt")

            # Move tensors to GPU if available
            if torch.cuda.is_available():
                tokenized_batch = {key: value.cuda() for key, value in tokenized_batch.items()}

            # Get predictions
            outputs = model(**tokenized_batch)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_labels = torch.argmax(predictions, dim=1)
            all_preds.extend(predicted_labels.cpu().numpy())

    return all_preds

# Prepare your dataset
disinfo_test_texts = df_misinfo_test["text"].to_list()
true_labels = df_misinfo_test["label"].to_list()

# Make predictions in batches
predicted_labels = batch_predict(bert_uncased, bert_tokenizer, disinfo_test_texts, batch_size=16)

# Evaluate the performance
f1 = f1_score(true_labels, predicted_labels)
acc = accuracy_score(true_labels, predicted_labels)
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")

---
# Climate Tweets

In [25]:
# reading climate df
input_path_climate = "/Users/henrybaker/.cache/kagglehub/datasets/die9origephit/climate-change-tweets/versions/1/Climate change_2022-1-17_2022-7-19.csv"

output_path_climate = "/Users/henrybaker/Documents/repositories/NLP/nlp_project/data/climate-change-tweets.csv"

df_climate = pd.read_csv(input_path_climate)
print(f"Loading dataset from '{input_path_climate}'...")
df_climate.head()

Loading dataset from '/Users/henrybaker/.cache/kagglehub/datasets/die9origephit/climate-change-tweets/versions/1/Climate change_2022-1-17_2022-7-19.csv'...


Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [26]:
df_climate_inference = df_climate[['Embedded_text']].rename(columns={'Embedded_text': 'text'})
df_climate_inference['label'] = None

print(f"Train shape {df_climate_inference.shape} \n")
df_climate_inference.head()

Train shape (9050, 2) 



Unnamed: 0,text,label
0,The only solution I’ve ever heard the Left pro...,
1,Climate change doesn’t cause volcanic eruption...,
2,Vaccinated tennis ball boy collapses in the te...,
3,North America has experienced an average winte...,
4,They're gonna do the same with Climate Change ...,


In [31]:
# TOKENISATION ==========================================================================

# Check if the pickle files already exist
# local
climate_tokens_file = './cache/climate_tokens.pkl'

# for Hertie GPU:
# climate_tokens_file = '/workspace/workspace/cache/climate_tokens.pkl'


if os.path.exists(climate_tokens_file):
    print("Tokenized climate tweets pkl files found: loading data...")
    # Load the pre-saved tokenized data
    with open(climate_tokens_file, 'rb') as f:
        climate_tokens = pickle.load(f)
else:
    print("Pickle files not found. Running tokenization on climate tweets...")

    print("Loading spaCy model...")
    nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "ner", "lemmatizer", "attribute_ruler"])

    misinfo_tokenizer = get_trained_tokenizer(
        df_misinfo_train["text"],
        tokenizer_file=tokenizer_file,
        min_df=3
    )
    
    misinfo_tokenizer_analyzer = misinfo_tokenizer.build_analyzer()
  
    print("Tokenizing Climate Data in Batches...")
    climate_tokens = batch_tokenize(
        df_climate_inference, 
        32,
        misinfo_tokenizer_analyzer,
    )

    # Save tokenized train and test data
    with open(climate_tokens_file, 'wb') as f:
        pickle.dump(climate_tokens, f)

Pickle files not found. Running tokenization on climate tweets...
Loading spaCy model...
Tokenizer file './cache/misinfo_tokenizer.pkl' found. Loading it...
Tokenizing Climate Data in Batches...
Tokenizing batch 20 of 283...
Tokenizing batch 40 of 283...
Tokenizing batch 60 of 283...
Tokenizing batch 80 of 283...
Tokenizing batch 100 of 283...
Tokenizing batch 120 of 283...
Tokenizing batch 140 of 283...
Tokenizing batch 160 of 283...
Tokenizing batch 180 of 283...
Tokenizing batch 200 of 283...
Tokenizing batch 220 of 283...
Tokenizing batch 240 of 283...
Tokenizing batch 260 of 283...
Tokenizing batch 280 of 283...
Tokenizing batch 283 of 283...


In [37]:
climate_tokens[:2]

[['text'], ['label']]

In [34]:
# STEP 1: INPUT PIPELINE ================================================================

# vocabulary indexing -------------------------------------------------------------------
print ("Vocab indexing")

vocab_idx_climate = vocab_mapping(tokenized_text=climate_tokens)

# create data loaders -------------------------------------------------------------------

print("Creating data loaders")
climate_dl = DataLoader(
    dataset=list(zip(climate_tokens, climate_tokens["label"])),
    batch_size=32,
    shuffle=True,
    collate_fn=Collator(vocab_idx, max_seq_length)
)

print("Created data loaders!")

# map pretrained fasttext embeddings to vocabulary indices ------------------------------

# Define the file path for the pickle file
# for local:
embeddings_file_path = "./cache/mapped_pretrained_embeddings.pkl"
# for Hertie GPU:
#pickle_file_path = "/workspace/workspace/mapped_pretrained_embeddings.pkl"

# Check if the pickle file already exists
if os.path.exists(embeddings_file_path):
    # If the file exists, load it from the pickle file
    with open(embeddings_file_path, 'rb') as f:
        embedding_tensor = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {embeddings_file_path}. Shape: {embedding_tensor.shape}")
else:
    # If the file does not exist, proceed with creating the embeddings and save them
    # Load pre-trained FastText model
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")

    # Map pretrained FastText embeddings to vocabulary indices
    mapped_pretrained_embeddings = embedding_mapping_fasttext(vocabulary=vocab_idx_climate,
                                                              pre_trained_embeddings=ft)

    # Convert mapped embeddings to a tensor
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings)

    # Save the embeddings to a pickle file
    with open(embeddings_file_path, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {embeddings_file_path}. Shape: {embedding_tensor.shape}")

Vocab indexing
Creating data loaders


TypeError: list indices must be integers or slices, not str