<a href="https://colab.research.google.com/github/henrycgbaker/nlp_research_note/blob/main/research_note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
# !pip install datasets fasttext evaluate
import os
import subprocess
import kagglehub
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.under_sampling import RandomUnderSampler
import spacy
import spacy.cli
import fasttext.util as fasttext_util
import fasttext
from collections import Counter
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import tqdm
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
import warnings
import sys
from tqdm import tqdm
import matplotlib.pyplot as plt

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda" if torch.cuda.is_available() else "cpu")
warnings.filterwarnings("ignore")

In [18]:
# load helper functions & scripts ----------------------------------------------
'''
sys.path.append('./aux_scripts')
from  misinfo_tokenizer import (get_trained_tokenizer,
                                batch_tokenize,
                                #vocab_mapping,
                                custom_analyzer
                                )
from data_loader_helpers import (#Collator,
                                 embedding_mapping_fasttext
                                 )
'''

"\nsys.path.append('./aux_scripts')\nfrom  misinfo_tokenizer import (get_trained_tokenizer,\n                                batch_tokenize,\n                                #vocab_mapping,\n                                custom_analyzer\n                                )\nfrom data_loader_helpers import (#Collator,\n                                 embedding_mapping_fasttext\n                                 )\n"

In [19]:
# DATA
DATA_DIR = "./data"
TRAIN_DATA_FILE = "./data/train_data.csv"
TEST_DATA_FILE = "./data/test_data.csv"

if os.path.exists(TRAIN_DATA_FILE) and os.path.exists(TEST_DATA_FILE):
    print("Loading data")
    train_df = pd.read_csv("./data/train_data.csv")
    test_df = pd.read_csv("./data/test_data.csv")
else:
    os.makedirs(DATA_DIR, exist_ok=True)
    print("calling data proicessing script")
    
    # Calling the Jupyter notebook script
    subprocess.run(['jupyter', 'nbconvert', '--execute', '--to', 'notebook', '--inplace', '_1_data_processing.ipynb'])

Loading data


---
# Import `Model Development Data`
See `data_processing script`

In [20]:
# download spacy model for tokenization ----------------------------------------
cache_path = './cache/'
os.makedirs(cache_path, exist_ok=True)
os.environ['SPACY_DATA'] = cache_path
spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# DEFINE TOKENIZATION FLOW =====================================================================

nlp = spacy.load("en_core_web_sm", 
                 disable=["tok2vec", "tagger", "parser", "ner", "lemmatizer", "attribute_ruler"])

def custom_tokenizer(text):
    tokenized_text = nlp(text)
    return [tok.text for tok in tokenized_text]

def custom_analyzer(text, trained_tokenizer):
    """
    Uses the custom_tokenizer, then replaces out-of-vocabulary tokens with <unk>.
    """
    tokens = custom_tokenizer(text)
    vocab = trained_tokenizer.vocabulary_
    return [token if token in vocab else "<unk>" for token in tokens]

def get_trained_tokenizer(text_series, rnn_tokenizer_file=None, min_df=3):
    """
    1) Checks if a previously fitted tokenizer exists in tokenizer_file.
    2) If not, create a new CountVectorizer, fit it on 'text_series'.
    3) Save the fitted tokenizer if tokenizer_file is provided.
    4) Return the tokenizer.
    """
    # Ensure the directory exists, but check if it's a file first
    if rnn_tokenizer_file:
        tokenizer_dir = os.path.dirname(rnn_tokenizer_file)
        if os.path.exists(tokenizer_dir):
            if os.path.isfile(tokenizer_dir):
                raise FileExistsError(f"The path '{tokenizer_dir}' exists but is a file, not a directory.")
        else:
            os.makedirs(tokenizer_dir, exist_ok=True)

    # If a tokenizer file path is given and exists, load it
    if rnn_tokenizer_file and os.path.exists(rnn_tokenizer_file):
        print(f"Tokenizer file '{rnn_tokenizer_file}' found. Loading it...")
        with open(rnn_tokenizer_file, 'rb') as f:
            tokenizer = pickle.load(f)
    else:
        # Otherwise, create a new one and fit
        print("No pre-fitted tokenizer found or no file specified. Creating a new one...")
        tokenizer = CountVectorizer(
            analyzer="word",
            tokenizer=custom_tokenizer,  # We define custom_tokenizer for splitting
            lowercase=False,
            min_df=min_df
        )
        tokenizer.fit(text_series)
        
        # Save the tokenizer if a path was provided
        if rnn_tokenizer_file:
            print(f"Saving fitted tokenizer to '{rnn_tokenizer_file}'...")
            with open(rnn_tokenizer_file, 'wb') as f:
                pickle.dump(tokenizer, f)

    return tokenizer

def batch_tokenize(text_series, batch_size, analyzer_func):
    """
    Tokenizes a Pandas Series of text in batches to avoid memory issues.
    """
    tokenized_result = []
    total = len(text_series)
    num_batches = (total // batch_size) + (1 if total % batch_size != 0 else 0)
    
    for batch_idx in range(0, total, batch_size):
        
        # Print progress every 200 batches or at the last batch
        if (batch_idx // batch_size + 1) % 200 == 0 or (batch_idx + batch_size >= total):
            print(f'Tokenizing batch {batch_idx // batch_size + 1} of {num_batches}...')
        
        batch_texts = text_series[batch_idx : batch_idx + batch_size]
        for text in batch_texts:
            tokenized_result.append(analyzer_func(text))
    
    return tokenized_result

# TOKENIZATION ==========================================================================
TOKENIZER_DIR = './cache' 
RNN_TOKENIZER_PKL_PATH = os.path.join(TOKENIZER_DIR, 'rnn_tokenizer_file.pkl')  
train_tokenised_file = os.path.join(TOKENIZER_DIR, 'misinfo_train_tokenised.pkl')
test_tokenised_file = os.path.join(TOKENIZER_DIR, 'misinfo_test_tokenised.pkl')

if os.path.exists(train_tokenised_file) and os.path.exists(test_tokenised_file):
    print("Tokenized text pkl files found: loading data...")
    # Load pre-saved tokenized data
    with open(train_tokenised_file, 'rb') as f:
        misinfo_train_tokenised = pickle.load(f)
    with open(test_tokenised_file, 'rb') as f:
        misinfo_test_tokenised = pickle.load(f)

else:
    print("Pickle files not found. Running tokenization...")

    # 1) Train tokenizer
    misinfo_tokenizer = get_trained_tokenizer(
        train_df["text"],
        rnn_tokenizer_file=RNN_TOKENIZER_PKL_PATH,  # Path fixed
        min_df=3
    )

    # Build the default analyzer from our tokenizer
    misinfo_tokenizer_analyzer = misinfo_tokenizer.build_analyzer()

    # 2) Tokenize train data in batches using the built analyzer (trained on train set)
    print("Tokenizing Train Data in Batches...")
    misinfo_train_tokenised = batch_tokenize(
        train_df["text"],
        32,
        misinfo_tokenizer_analyzer
    )
    
    # 3) Tokenize test data in batches using custom_analyzer (which replaces OOV tokens with <unk>)
    print("Tokenizing Test Data in Batches...")
    misinfo_test_tokenised = batch_tokenize(
        test_df["text"],
        32,
        lambda text: custom_analyzer(text, trained_tokenizer=misinfo_tokenizer)
    )

    # save the tokenized data
    os.makedirs(TOKENIZER_DIR, exist_ok=True)  # This will now work, creating only if directory doesn't exist
    
    with open(train_tokenised_file, 'wb') as f:
        pickle.dump(misinfo_train_tokenised, f)
    
    with open(test_tokenised_file, 'wb') as f:
        pickle.dump(misinfo_test_tokenised, f)

print("Train inputs tokenised:", len(misinfo_train_tokenised))
print("Test inputs tokenised:", len(misinfo_test_tokenised))


Tokenized text pkl files found: loading data...
Train inputs tokenised: 5393
Test inputs tokenised: 1034


In [22]:
# STEP 1: INPUT PIPELINE ================================================================

# vocabulary indexing -------------------------------------------------------------------

def vocab_mapping(tokenized_text):
    token_counts = Counter()
    for text in tokenized_text:
        token_counts.update(text)
    special_tokens = ["<pad>", "<unk>"]
    vocab_tokens = special_tokens + [token for token, freq in token_counts.most_common()]
    vocab = {token: idx for idx, token in enumerate(vocab_tokens)}
    return vocab

vocab_idx = vocab_mapping(tokenized_text=misinfo_train_tokenised)

print(f"Vocab size: {len(vocab_idx)}")
print(f"Vocab example: {list(vocab_idx.items())[:10]}")

Vocab size: 74703
Vocab example: [('<pad>', 0), ('<unk>', 1), (',', 2), ('the', 3), ('.', 4), ('to', 5), ('of', 6), ('and', 7), ('a', 8), ('in', 9)]


In [23]:
# create data loaders -------------------------------------------------------------------

def collate_fn(data, include_lengths=True):
    text_list, label_list, lengths = [], [], []
    for _text, _label in data:
        # Integer encoding with truncation
        processed_text = torch.tensor([vocab_idx[token] for token in _text][:max_seq_length],
                                      dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(_label)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    # Padding
    padded_text_list = nn.utils.rnn.pad_sequence(text_list,
                                                 batch_first=True,
                                                 padding_value=0)
    if include_lengths:
        return padded_text_list, label_list, lengths
    else:
        return padded_text_list, label_list

max_seq_length = 300 # too long for full RNNs
batch_size = 32

# standard dls with collate_fn
train_dl = DataLoader(dataset=list(zip(misinfo_train_tokenised,
                                         train_df["label"])),
                        batch_size=32, shuffle=True, 
                        collate_fn=lambda x: collate_fn(x, include_lengths=True))

test_dl = DataLoader(dataset=list(zip(misinfo_test_tokenised,
                                         test_df["label"])),
                        batch_size=32, shuffle=False, 
                        collate_fn=lambda x: collate_fn(x, include_lengths=True))

# dls w/o collate_fn for CNNs:
train_dl_cnn = DataLoader(dataset=list(zip(misinfo_train_tokenised,
                                         train_df["label"])),
                        batch_size=32, shuffle=True,
                        collate_fn=lambda x: collate_fn(x, include_lengths=False))

test_dl_cnn = DataLoader(dataset=list(zip(misinfo_test_tokenised,
                                         test_df["label"])),
                         batch_size=32, shuffle=True,
                        collate_fn=lambda x: collate_fn(x, include_lengths=False))

In [24]:
# EMBEDDING MAPPING =====================================================================

# download pretrained embeddings -----------------------------------------------
# for local
#fasttext.util.download_model('en', if_exists='ignore')
ft_path = "./cc.en.300.bin"

# for Colab
# !pip install datasets fasttext evaluate
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/Othercomputers/My MacBook Pro/Documents/repositories/nlp/nlp_research_note')
#ft_path = "./cc.en.300.bin"

ft = fasttext.load_model(ft_path)
# map pretrained fasttext embeddings to vocabulary indices ------------------------------

EMBEDDINGS_FILE_PATH = "./cache/mapped_pretrained_embeddings.pkl"

def embedding_mapping_fasttext(vocabulary, pre_trained_embeddings):
    vocab_size = len(vocabulary)
    embedding_dim = pre_trained_embeddings.get_dimension()
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for idx, word in enumerate(vocabulary):
        embedding_matrix[idx] = pre_trained_embeddings.get_word_vector(word)
    return embedding_matrix

if os.path.exists(EMBEDDINGS_FILE_PATH):
    with open(EMBEDDINGS_FILE_PATH, 'rb') as f:
        embedding_tensor = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {EMBEDDINGS_FILE_PATH}. Shape: {embedding_tensor.shape}")
else:
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")

    mapped_pretrained_embeddings = embedding_mapping_fasttext(vocabulary=vocab_idx,
                                                              pre_trained_embeddings=ft)
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings)

    # Save embeddings
    with open(EMBEDDINGS_FILE_PATH, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {EMBEDDINGS_FILE_PATH}. Shape: {embedding_tensor.shape}")

Emebddings pre-exists: loaded embeddings from ./cache/mapped_pretrained_embeddings.pkl. Shape: torch.Size([74703, 300])


In [45]:
def train(model, num_epochs, train_dl, test_dl, use_lengths=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.BCEWithLogitsLoss()

    device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model.to(device)

    metrics = {
        "loss_train": [],
        "loss_test": [],
        "accuracy_train": [],
        "accuracy_test": [],
        "f1_train": [],
        "f1_test": []
    }

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss, train_correct = 0, 0
        all_train_preds, all_train_labels = [], []

        print(f"Epoch {epoch + 1}/{num_epochs} Training...")
        for batch in tqdm(train_dl, desc="Training", leave=False):
            if use_lengths:
                x_batch, y_batch, lengths = batch
                x_batch, y_batch, lengths = x_batch.to(device), y_batch.to(device), lengths.to(device)
                pred = model(x_batch, lengths)[:, 0]  # Include lengths for RNNs/LSTMs
            else:
                x_batch, y_batch = batch
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                pred = model(x_batch)[:, 0]

            # Compute loss
            loss = loss_fn(pred, y_batch.float())

            # Backpropagation
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Metrics
            train_loss += loss.item() * y_batch.size(0)
            preds = (pred >= 0.5).float()
            train_correct += (preds == y_batch).float().sum().item()
            all_train_preds.extend(preds.cpu().numpy())
            all_train_labels.extend(y_batch.cpu().numpy())

        metrics["loss_train"].append(train_loss / len(train_dl.dataset))
        metrics["accuracy_train"].append(train_correct / len(train_dl.dataset))
        metrics["f1_train"].append(f1_score(all_train_labels, all_train_preds))

        # Evaluation phase
        model.eval()
        test_loss, test_correct = 0, 0
        all_test_preds, all_test_labels = [], []
        print(f"Epoch {epoch + 1}/{num_epochs} Evaluating...")
        with torch.no_grad():
            for batch in tqdm(test_dl, desc="Evaluating", leave=False):
                if use_lengths:
                    x_batch, y_batch, lengths = batch
                    x_batch, y_batch, lengths = x_batch.to(device), y_batch.to(device), lengths.to(device)
                    pred = model(x_batch, lengths)[:, 0]
                else:
                    x_batch, y_batch = batch
                    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                    pred = model(x_batch)[:, 0]

                # Compute loss
                loss = loss_fn(pred, y_batch.float())

                # Metrics
                test_loss += loss.item() * y_batch.size(0)
                preds = (pred >= 0.5).float()
                test_correct += (preds == y_batch).float().sum().item()
                all_test_preds.extend(preds.cpu().numpy())
                all_test_labels.extend(y_batch.cpu().numpy())

        metrics["loss_test"].append(test_loss / len(test_dl.dataset))
        metrics["accuracy_test"].append(test_correct / len(test_dl.dataset))
        metrics["f1_test"].append(f1_score(all_test_labels, all_test_preds))

        # Print summary
        print(f"Epoch {epoch + 1}/{num_epochs} Summary:")
        print(f"    Train - Loss: {metrics['loss_train'][-1]:.4f}, Accuracy: {metrics['accuracy_train'][-1]:.3f}, F1: {metrics['f1_train'][-1]:.3f}")
        print(f"    Test  - Loss: {metrics['loss_test'][-1]:.4f}, Accuracy: {metrics['accuracy_test'][-1]:.3f}, F1: {metrics['f1_test'][-1]:.3f}")

    return metrics

In [48]:
# MODEL BUILDING ================================================================
# see aux_scripts

sys.path.append('./aux_scripts')
from model_architectures import CNNTextClassifier, RNNTextClassifier, StackedLSTMTextClassifier, BidirectionalLSTMTextClassifier, BidirectionalGRUTextClassifier, LSTMTextClassifier

In [46]:

# TRAINING ================================================================

# CNN

num_epochs = 10

from model_architectures import CNNTextClassifier
model_cnn = CNNTextClassifier(embedding_tensor=embedding_tensor)

if os.path.exists("./models/cnn_model.pth"):
    model_cnn = torch.load("./models/cnn_model.pth")
    print("Model loaded from disk.")
else:
    os.makedirs("./models/train_hist", exist_ok=True)
    print("Training the model...")
    num_epochs = 10
    hist_cnn = train(model_cnn, num_epochs, train_dl_cnn, test_dl_cnn, use_lengths=False)
    print("Training the model...")
    with open("./models/train_hist/cnn_hist.pkl", "wb") as f:
        pickle.dump(hist_cnn, f)
    torch.save(model_cnn, "./models/cnn_model.pth")

Training the model...
Using device: mps
Epoch 1/10 Training...


                                                          

KeyboardInterrupt: 

In [47]:
# Vanilla RNN =====================================================================

model_rnn = RNNTextClassifier(embedding_tensor=embedding_tensor)

if os.path.exists("./models/rnn_model.pth"):
    model_rnn = torch.load("./models/rnn_model.pth")
    print("Model loaded from disk.")
else:
    os.makedirs("./models/", exist_ok=True)
    print("Training the model...")
    hist_rnn = train(model_rnn, num_epochs, train_dl, test_dl, use_lengths=True) 
    with open("./models/train_hist/rnn_hist.pkl", "wb") as f:
        pickle.dump(hist_rnn, f)

    torch.save(model_rnn, "./models/rnn_model.pth")

Training the model...
Using device: mps
Epoch 1/10 Training...


                                                          

KeyboardInterrupt: 

In [50]:
# LSTM =====================================================================

model_lstm = LSTMTextClassifier(embedding_tensor=embedding_tensor)

if os.path.exists("./models/lstm_model.pth"):
    model_lstm = torch.load("./models/lstm_model.pth")
    print("Model loaded from disk.")
else:
    os.makedirs("./models/", exist_ok=True)
    print("Training the model...")
    hist_lstm = train(model_lstm, num_epochs=10, train_dl=train_dl, test_dl=test_dl, use_lengths=True)
    with open("./models/train_hist/lstm_hist.pkl", "wb") as f:
        pickle.dump(hist_rnn, f)
    torch.save(model_lstm, "./models/lstm_model.pth")

Training the model...
Using device: mps
Epoch 1/10 Training...


                                                         

KeyboardInterrupt: 

In [51]:
# STACKING LSTM LAYERS WITH DIFFERENT HIDDEN SIZES =========================

model_lstm_stacked = StackedLSTMTextClassifier(embedding_tensor=embedding_tensor)

if os.path.exists("./models/lstm_stacked_model.pth"):
    model_lstm_stacked = torch.load("./models/lstm_stacked_model.pth")
    print("Model loaded from disk.")
else:
    os.makedirs("./models/", exist_ok=True)
    print("Training the model...")
    hist_lstm_stacked = train(model_lstm_stacked, num_epochs, train_dl, test_dl, use_lengths=True)
    with open("./models/train_hist/lstm_stacked_hist.pkl", "wb") as f:
        pickle.dump(hist_lstm_stacked, f)
    torch.save(model_lstm_stacked, "./models/lstm_stacked_model.pth")

Training the model...
Using device: mps
Epoch 1/10 Training...


                                                         

KeyboardInterrupt: 

In [53]:
# EXTENSION 4: BI-DIRECTIONAL LSTM ======================================================

model_bi_lstm = BidirectionalLSTMTextClassifier(embedding_tensor=embedding_tensor)

if os.path.exists("./models/bi_lstm_model.pth"):
    model_bi_lstm = torch.load("./models/bi_lstm_model.pth")
    print("Model loaded from disk.")
else:
    os.makedirs("./models/", exist_ok=True)
    print("Training the model...")
    hist_bi_lstm = train(model_bi_lstm, num_epochs, train_dl, test_dl, use_lengths=True)
    with open("./models/train_hist/bi_lstm_hist.pkl", "wb") as f:
        pickle.dump(hist_bi_lstm, f)
    torch.save(model_bi_lstm, "./models/bi_lstm_model.pth")

Training the model...
Using device: mps
Epoch 1/10 Training...


                                                         

KeyboardInterrupt: 

In [None]:
# BIDIRECTIONAL GRU  ======================================================

model_gru = BidirectionalGRUTextClassifier(embedding_tensor=embedding_tensor)

if os.path.exists("./models/bidirectional_gru_model.pth"):
    model_gru = torch.load("./models/bidirectional_gru_model.pth")
    print("Model loaded from disk.")
else:
    os.makedirs("./models/", exist_ok=True)
    print("Training the model...")
    hist_gru = train(model_gru, num_epochs, train_dl, test_dl, use_lengths=True)
    with open("./models/train_hist/bidirectional_gru_hist.pkl", "wb") as f:
        pickle.dump(hist_gru, f)
    torch.save(model_gru, "./models/bidirectional_gru_model.pth")


Training the model...
Using device: mps
Epoch 1/10 Training...


                                                           

Epoch 1/10 Evaluating...


                                                           

Epoch 1/10 Summary:
    Train - Loss: 0.6407, Accuracy: 0.583, F1: 0.277
    Test  - Loss: 0.6029, Accuracy: 0.649, F1: 0.596
Epoch 2/10 Training...


                                                           

Epoch 2/10 Evaluating...


                                                           

Epoch 2/10 Summary:
    Train - Loss: 0.5904, Accuracy: 0.637, F1: 0.473
    Test  - Loss: 0.5985, Accuracy: 0.590, F1: 0.456
Epoch 3/10 Training...


                                                           

Epoch 3/10 Evaluating...


                                                           

Epoch 3/10 Summary:
    Train - Loss: 0.5694, Accuracy: 0.655, F1: 0.512
    Test  - Loss: 0.5696, Accuracy: 0.613, F1: 0.462
Epoch 4/10 Training...


                                                           

Epoch 4/10 Evaluating...


                                                           

Epoch 4/10 Summary:
    Train - Loss: 0.5500, Accuracy: 0.658, F1: 0.508
    Test  - Loss: 0.5760, Accuracy: 0.611, F1: 0.463
Epoch 5/10 Training...


Training:  70%|██████▉   | 118/169 [03:36<01:29,  1.75s/it]

# Transformer & Transfer Learning

In [None]:
# Load checkpoint and tokenizer
checkpoint = "google/bert_uncased_L-2_H-128_A-2"
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_uncased = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

transformer_model = bert_uncased
transformer_tokenizer = bert_tokenizer

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# define features
features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=2, names=['factual', 'misinfo']),
})

# convert to Hugging Face Dataset
dataset_train = Dataset.from_pandas(train_df, features=features)
dataset_test = Dataset.from_pandas(test_df, features=features)

# Check the unique values of the 'label' column to ensure the classes are correct
unique_labels = set(dataset_train['label'])
print("Unique label values in training data:", unique_labels)

# Create a Hugging Face DatasetDict
dataset_dict = DatasetDict({
    'train': dataset_train,
    'test': dataset_test
})

Unique label values in training data: {0, 1}


In [17]:
# tokenising the texts ================================================================

TRANSFORMER_TOKENIZED_TEXT_DIR = "./data/input_text_transformer_tokenized"

if os.path.exists(TRANSFORMER_TOKENIZED_TEXT_DIR):
    tokenized_datasets = DatasetDict.load_from_disk(TRANSFORMER_TOKENIZED_TEXT_DIR)
    print("Loaded existing tokenized text")
    
else:    
    os.makedirs(TRANSFORMER_TOKENIZED_TEXT_DIR, exist_ok=True)
    print("Tokenizing...")                                      
    
    def tokenize_function(dataset):
        return transformer_tokenizer(dataset["text"], truncation=True, padding="max_length", max_length=512)
        # truncates at 512 for the chosen checkpoint

    tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
    tokenized_datasets

    tokenized_datasets['train'][0]['text']
    tokenized_datasets['train'][0]['label']
    tokenized_datasets['train'][0]['input_ids']
    tokenized_datasets['train'][0]['attention_mask']
    
    tokenized_datasets.save_to_disk(TRANSFORMER_TOKENIZED_TEXT_DIR)

# fine-tuning the model ================================================================

TRANSFORMER_MODEL_DIR = "./models/transformer_results"
TRANSFORMER_MODEL_FILE = os.path.join(TRANSFORMER_MODEL_DIR, "model.safetensors")
TRANSFORMER_TOKENIZER_DIR = "./models/transformer_results"


if os.path.exists(TRANSFORMER_MODEL_FILE):
    transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_TOKENIZER_DIR)
    transformer_model = AutoModelForSequenceClassification.from_pretrained(TRANSFORMER_MODEL_DIR)
    print("Model loaded from disk.")
else:
    os.makedirs(TRANSFORMER_MODEL_DIR, exist_ok=True)
    print("Fine tuning...")

    training_args = TrainingArguments(output_dir=TRANSFORMER_MODEL_DIR,
                                    eval_strategy="epoch",
                                    save_strategy="epoch",
                                    per_device_train_batch_size=32,
                                    per_device_eval_batch_size=32,
                                    num_train_epochs=30,
                                    load_best_model_at_end=True,
                                    metric_for_best_model='f1',
                                    disable_tqdm=False,
                                    use_cpu=False)

    def compute_metrics(eval_preds):
        metric = evaluate.load("glue", "mrpc")
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels)

    trainer = Trainer(
        transformer_model,
        training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    trainer.train()

    # Save the model and tokenizer after training
    trainer.save_model(TRANSFORMER_MODEL_DIR)  
    transformer_tokenizer.save_pretrained(TRANSFORMER_TOKENIZER_DIR)  

Loaded existing tokenized text
Model loaded from disk.


In [None]:
# test transformer
def batch_predict(model, tokenizer, texts, batch_size=16, device='mps', max_length=512):
    """
    Predict labels for a batch of texts using the specified model and tokenizer.

    Parameters:
        model: The pre-trained model 
        tokenizer: The tokenizer associated with the pre-trained model.
        texts: List of input texts to predict.
        batch_size: Number of samples per batch.
        device: Device to use ('mps', 'cuda', or 'cpu').
        max_length: Maximum sequence length for tokenization.

    Returns:
        List of predicted labels.
    """
    model.to(device)
    model.eval()

    predictions = []

    with torch.no_grad():  # No gradients needed for prediction
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]

            # Tokenize the batch with truncation and padding
            tokenized_batch = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )

            # Move tokenized inputs to the same device as the model
            tokenized_batch = {key: value.to(device) for key, value in tokenized_batch.items()}

            # Get model outputs
            outputs = model(**tokenized_batch)

            # Apply softmax to logits and determine predicted labels
            preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_labels = torch.argmax(preds, dim=1)

            # Collect predictions
            predictions.extend(predicted_labels.cpu().numpy())  # Move predictions to CPU before storing

    return predictions

# USAGE
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Prepare dataset
test_texts = dataset_test["text"]
true_labels = dataset_test["label"]

# predictions in batches
predicted_labels = batch_predict(bert_uncased, bert_tokenizer, test_texts, batch_size=16, device=device)

# Evaluate the performance
f1 = f1_score(true_labels, predicted_labels)
acc = accuracy_score(true_labels, predicted_labels)
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")

Using device: mps
F1 Score: 0.675208199871877
Accuracy: 0.5096711798839458
