<a href="https://colab.research.google.com/github/henrycgbaker/nlp_research_note/blob/main/research_note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import kagglehub
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import spacy
import spacy.cli
from sklearn.feature_extraction.text import CountVectorizer
import fasttext.util as fasttext_util
import fasttext
from sklearn.metrics import f1_score
from collections import Counter
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import tqdm
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
from sklearn.metrics import f1_score, accuracy_score
import warnings
import sys
from sklearn.metrics import f1_score
from tqdm import tqdm

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hertie server
'''
import os
# Set the environment variables
os.environ['HOME_CONFIG'] = './/workspace/workspace'
os.environ['KAGGLE_CONFIG'] = './workspace/workspace/.kaggle'
os.environ['SPACY_CACHE'] = '/workspace/workspace/cache'

# Optionally, check if the environment variables were set correctly
print(os.getenv('HOME_CONFIG'))
print(os.getenv('KAGGLE_CONFIG'))
print(os.getenv('SPACY_CACHE'))
'''

"\nimport os\n# Set the environment variables\nos.environ['HOME_CONFIG'] = './/workspace/workspace'\nos.environ['KAGGLE_CONFIG'] = './workspace/workspace/.kaggle'\nos.environ['SPACY_CACHE'] = '/workspace/workspace/cache'\n\n# Optionally, check if the environment variables were set correctly\nprint(os.getenv('HOME_CONFIG'))\nprint(os.getenv('KAGGLE_CONFIG'))\nprint(os.getenv('SPACY_CACHE'))\n"

In [3]:

# download pretrained embeddings -----------------------------------------------
# for local
#fasttext.util.download_model('en', if_exists='ignore')
ft_path = "./cc.en.300.bin"

# for Colab
# !pip install datasets fasttext evaluate
#from google.colab import drive
#drive.mount('/content/drive')
#os.chdir('/content/drive/Othercomputers/My MacBook Pro/Documents/repositories/nlp/nlp_research_note')
#ft_path = "./cc.en.300.bin"

ft = fasttext.load_model(ft_path)

# download spacy model for tokenization ----------------------------------------
cache_path = './cache/'
os.makedirs(cache_path, exist_ok=True)
os.environ['SPACY_DATA'] = cache_path
spacy.cli.download("en_core_web_sm")

# load helper functions & scripts ----------------------------------------------
'''
sys.path.append('./aux_scripts')
from  misinfo_tokenizer import (get_trained_tokenizer,
                                batch_tokenize,
                                #vocab_mapping,
                                custom_analyzer
                                )
from data_loader_helpers import (#Collator,
                                 embedding_mapping_fasttext
                                 )
'''

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


"\nsys.path.append('./aux_scripts')\nfrom  misinfo_tokenizer import (get_trained_tokenizer,\n                                batch_tokenize,\n                                #vocab_mapping,\n                                custom_analyzer\n                                )\nfrom data_loader_helpers import (#Collator,\n                                 embedding_mapping_fasttext\n                                 )\n"

---
# Import & process Hugging Face `misinfo` dataset

In [4]:
hf_cache_dir = os.getenv("HF_DATASETS_CACHE", "./cache/huggingface/datasets")
dataset_path = os.path.join(hf_cache_dir, "roupenminassian", "twitter-misinformation")

if os.path.exists(dataset_path):
    print(f"Dataset found in cache: {dataset_path}")
    ds = load_dataset("roupenminassian/twitter-misinformation", cache_dir=hf_cache_dir)
else:
    print(f"Dataset not found in cache. Downloading...")
    ds = load_dataset("roupenminassian/twitter-misinformation")

print(f'Cache Directory: \n{hf_cache_dir}')
print(f'\nExternal Structure: \n{ds.shape}')
print(f'\nInternal Structure: \n{ds["train"]}')

Dataset not found in cache. Downloading...
Cache Directory: 
./cache/huggingface/datasets

External Structure: 
{'train': (92394, 4), 'test': (10267, 4)}

Internal Structure: 
Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'text', 'label'],
    num_rows: 92394
})


In [22]:
# DATA PARTITIONING =====================================================================
ds_cloned = ds.copy()

ds_cloned['train'] = ds_cloned['train'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])
ds_cloned['test'] = ds_cloned['test'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])

df_misinfo_train = pd.DataFrame(ds_cloned['train'], columns=["text", "label"])
df_misinfo_test = pd.DataFrame(ds_cloned['test'], columns=["text", "label"])

df_misinfo_train = df_misinfo_train.sample(n=5000, random_state=42) # REMOVE THIS
df_misinfo_test = df_misinfo_test.sample(n=5000, random_state=42) # REMOVE THIS

print(f"Train shape {df_misinfo_train.shape} \n")
print('\n0: factual, 1: misinformation\n')
print("Training positive vs negative examples: \n", df_misinfo_train.value_counts("label")/df_misinfo_train.shape[0])
print("\nTesting positive vs negative examples: \n",df_misinfo_test.value_counts("label")/df_misinfo_test.shape[0])

df_misinfo_train.head()

Train shape (5000, 2) 


0: factual, 1: misinformation

Training positive vs negative examples: 
 label
0    0.646
1    0.354
Name: count, dtype: float64

Testing positive vs negative examples: 
 label
0    0.6548
1    0.3452
Name: count, dtype: float64


Unnamed: 0,text,label
62905,"A sudden there was a flood on the road, and th...",0
48977,"No food, no FEMA: Hurricane Michael’s survivor...",0
20691,"President Trump visits Florida hospital, prai...",1
32672,"During my 2nd week at @sacbee_news, I covered ...",0
70612,"Irma is a 5 category hurricane, and your prior...",0


In [6]:
# DEFINE TOKENIZATION FLOW =====================================================================

nlp = spacy.load("en_core_web_sm", 
                 disable=["tok2vec", "tagger", "parser", "ner", "lemmatizer", "attribute_ruler"])

def custom_tokenizer(text):
    tokenized_text = nlp(text)
    return [tok.text for tok in tokenized_text]

def custom_analyzer(text, trained_tokenizer):
    """
    Uses the custom_tokenizer, then replaces out-of-vocabulary tokens with <unk>.
    """
    tokens = custom_tokenizer(text)
    vocab = trained_tokenizer.vocabulary_
    return [token if token in vocab else "<unk>" for token in tokens]

def get_trained_tokenizer(text_series, tokenizer_file=None, min_df=3):
    """
    1) Checks if a previously fitted tokenizer exists in tokenizer_file.
    2) If not, create a new CountVectorizer, fit it on 'text_series'.
    3) Save the fitted tokenizer if tokenizer_file is provided.
    4) Return the tokenizer.
    """
    # If a tokenizer file path is given and exists, load it
    if tokenizer_file and os.path.exists(tokenizer_file):
        print(f"Tokenizer file '{tokenizer_file}' found. Loading it...")
        with open(tokenizer_file, 'rb') as f:
            tokenizer = pickle.load(f)
    else:
        # Otherwise, create a new one and fit
        print("No pre-fitted tokenizer found or no file specified. Creating a new one...")
        tokenizer = CountVectorizer(
            analyzer="word",
            tokenizer=custom_tokenizer,  # We define custom_tokenizer for splitting
            lowercase=False,
            min_df=min_df
        )
        tokenizer.fit(text_series)
        
        # Save the tokenizer if a path was provided
        if tokenizer_file:
            print(f"Saving fitted tokenizer to '{tokenizer_file}'...")
            with open(tokenizer_file, 'wb') as f:
                pickle.dump(tokenizer, f)

    return tokenizer

def batch_tokenize(text_series, batch_size, analyzer_func):
    """
    Tokenizes a Pandas Series of text in batches to avoid memory issues.
    """
    tokenized_result = []
    total = len(text_series)
    num_batches = (total // batch_size) + (1 if total % batch_size != 0 else 0)
    
    for batch_idx in range(0, total, batch_size):
        
        # Print progress every 200 batches or at the last batch
        if (batch_idx // batch_size + 1) % 200 == 0 or (batch_idx + batch_size >= total):
            print(f'Tokenizing batch {batch_idx // batch_size + 1} of {num_batches}...')
        
        batch_texts = text_series[batch_idx : batch_idx + batch_size]
        for text in batch_texts:
            tokenized_result.append(analyzer_func(text))
    
    return tokenized_result

# TOKENIZATION ==========================================================================

TOKENIZER_DIR = './cache/misinfo_tokenizer.pkl'
TRAIN_TOKENISED_DIR = './cache/misinfo_train_tokenised.pkl'
TEST_TOKENISED_DIR = './cache/misinfo_test_tokenised.pkl'

if os.path.exists(TRAIN_TOKENISED_DIR) and os.path.exists(TEST_TOKENISED_DIR):
    print("Tokenized text pkl files found: loading data...")
    # Load pre-saved tokenized data
    with open(TRAIN_TOKENISED_DIR, 'rb') as f:
        misinfo_train_tokenised = pickle.load(f)
    with open(TEST_TOKENISED_DIR, 'rb') as f:
        misinfo_test_tokenised = pickle.load(f)

else:
    print("Pickle files not found. Running tokenization...")

    # 1) Train tokenizer
    misinfo_tokenizer = get_trained_tokenizer(
        df_misinfo_train["text"],
        tokenizer_file=TOKENIZER_DIR,
        min_df=3
    )

    # Build the default analyzer from our tokenizer
    misinfo_tokenizer_analyzer = misinfo_tokenizer.build_analyzer()

    # 2) Tokenize train data in batches using the built analyzer (trained on train set)
    print("Tokenizing Train Data in Batches...")
    misinfo_train_tokenised = batch_tokenize(
        df_misinfo_train["text"],
        32,
        misinfo_tokenizer_analyzer
    )
    
    # 3) Tokenize test data in batches using custom_analyzer (which replaces OOV tokens with <unk>)
    print("Tokenizing Test Data in Batches...")
    misinfo_test_tokenised = batch_tokenize(
        df_misinfo_test["text"],
        32,
        lambda text: custom_analyzer(text, trained_tokenizer=misinfo_tokenizer)
    )

    # Optionally, save the tokenized data
    with open(TRAIN_TOKENISED_DIR, 'wb') as f:
        pickle.dump(misinfo_train_tokenised, f)
    with open(TEST_TOKENISED_DIR, 'wb') as f:
        pickle.dump(misinfo_test_tokenised, f)

print("Train inputs tokenised:", len(misinfo_train_tokenised))
print("Test inputs tokenised:", len(misinfo_test_tokenised))

Tokenized text pkl files found: loading data...
Train inputs tokenised: 5000
Test inputs tokenised: 5000


In [7]:
# STEP 1: INPUT PIPELINE ================================================================

# vocabulary indexing -------------------------------------------------------------------

def vocab_mapping(tokenized_text):
    token_counts = Counter()
    for text in tokenized_text:
        token_counts.update(text)
    special_tokens = ["<pad>", "<unk>"]
    vocab_tokens = special_tokens + [token for token, freq in token_counts.most_common()]
    vocab = {token: idx for idx, token in enumerate(vocab_tokens)}
    return vocab

vocab_idx = vocab_mapping(tokenized_text=misinfo_train_tokenised)

print(f"Vocab size: {len(vocab_idx)}")
print(f"Vocab example: {list(vocab_idx.items())[:10]}")

Vocab size: 46425
Vocab example: [('<pad>', 0), ('<unk>', 1), ('the', 2), (',', 3), ('.', 4), ('to', 5), ('of', 6), ('and', 7), ('a', 8), ('in', 9)]


In [14]:
# create data loaders -------------------------------------------------------------------

def collate_fn(data):
    text_list, label_list, lengths = [], [], []
    for _text, _label in data:
        # integer encoding with truncation
        processed_text = torch.tensor([vocab_idx[token] for token in _text][:max_seq_length],
                                      dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(_label)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list)
    lengths = torch.tensor(lengths)
    # padding
    padded_text_list = nn.utils.rnn.pad_sequence(text_list,
                                                 batch_first=True,
                                                 padding_value=0)
    return padded_text_list, label_list, lengths

max_seq_length = 300 # too long for classic RNN
batch_size = 32

train_dl = DataLoader(dataset=list(zip(misinfo_train_tokenised,
                                         df_misinfo_train["label"])),
                        batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(dataset=list(zip(misinfo_test_tokenised,
                                         df_misinfo_test["label"])),
                        batch_size=32, shuffle=False, collate_fn=collate_fn)

In [15]:
# EMBEDDING MAPPING =====================================================================

# map pretrained fasttext embeddings to vocabulary indices ------------------------------

EMBEDDINGS_FILE_PATH = "./cache/mapped_pretrained_embeddings.pkl"

def embedding_mapping_fasttext(vocabulary, pre_trained_embeddings):
    vocab_size = len(vocabulary)
    embedding_dim = pre_trained_embeddings.get_dimension()
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for idx, word in enumerate(vocabulary):
        embedding_matrix[idx] = pre_trained_embeddings.get_word_vector(word)
    return embedding_matrix

if os.path.exists(EMBEDDINGS_FILE_PATH):
    with open(EMBEDDINGS_FILE_PATH, 'rb') as f:
        embedding_tensor = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {EMBEDDINGS_FILE_PATH}. Shape: {embedding_tensor.shape}")
else:
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")

    mapped_pretrained_embeddings = embedding_mapping_fasttext(vocabulary=vocab_idx,
                                                              pre_trained_embeddings=ft)
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings)

    # Save embeddings
    with open(EMBEDDINGS_FILE_PATH, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {EMBEDDINGS_FILE_PATH}. Shape: {embedding_tensor.shape}")


Emebddings pre-exists: loaded embeddings from ./cache/mapped_pretrained_embeddings.pkl. Shape: torch.Size([46425, 300])


In [10]:
def train(model, num_epochs, train_dl, test_dl, use_lengths=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.BCEWithLogitsLoss()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    metrics = {
        "loss_train": [],
        "loss_test": [],
        "accuracy_train": [],
        "accuracy_test": [],
        "f1_train": [],
        "f1_test": []
    }

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss, train_correct = 0, 0
        all_train_preds, all_train_labels = [], []

        print(f"Epoch {epoch + 1}/{num_epochs} Training...")
        for batch in tqdm(train_dl, desc="Training", leave=False):
            if use_lengths:
                x_batch, y_batch, lengths = batch
                x_batch, y_batch, lengths = x_batch.to(device), y_batch.to(device), lengths.to(device)
                pred = model(x_batch, lengths)[:, 0]  # Include lengths for RNNs/LSTMs
            else:
                x_batch, y_batch = batch
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                pred = model(x_batch)[:, 0]

            # Compute loss
            loss = loss_fn(pred, y_batch.float())

            # Backpropagation
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Metrics
            train_loss += loss.item() * y_batch.size(0)
            preds = (pred >= 0.5).float()
            train_correct += (preds == y_batch).float().sum().item()
            all_train_preds.extend(preds.cpu().numpy())
            all_train_labels.extend(y_batch.cpu().numpy())

        metrics["loss_train"].append(train_loss / len(train_dl.dataset))
        metrics["accuracy_train"].append(train_correct / len(train_dl.dataset))
        metrics["f1_train"].append(f1_score(all_train_labels, all_train_preds))

        # Evaluation phase
        model.eval()
        test_loss, test_correct = 0, 0
        all_test_preds, all_test_labels = [], []
        print(f"Epoch {epoch + 1}/{num_epochs} Evaluating...")
        with torch.no_grad():
            for batch in tqdm(test_dl, desc="Evaluating", leave=False):
                if use_lengths:
                    x_batch, y_batch, lengths = batch
                    x_batch, y_batch, lengths = x_batch.to(device), y_batch.to(device), lengths.to(device)
                    pred = model(x_batch, lengths)[:, 0]
                else:
                    x_batch, y_batch = batch
                    x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                    pred = model(x_batch)[:, 0]

                # Compute loss
                loss = loss_fn(pred, y_batch.float())

                # Metrics
                test_loss += loss.item() * y_batch.size(0)
                preds = (pred >= 0.5).float()
                test_correct += (preds == y_batch).float().sum().item()
                all_test_preds.extend(preds.cpu().numpy())
                all_test_labels.extend(y_batch.cpu().numpy())

        metrics["loss_test"].append(test_loss / len(test_dl.dataset))
        metrics["accuracy_test"].append(test_correct / len(test_dl.dataset))
        metrics["f1_test"].append(f1_score(all_test_labels, all_test_preds))

        # Print summary
        print(f"Epoch {epoch + 1}/{num_epochs} Summary:")
        print(f"    Train - Loss: {metrics['loss_train'][-1]:.4f}, Accuracy: {metrics['accuracy_train'][-1]:.3f}, F1: {metrics['f1_train'][-1]:.3f}")
        print(f"    Test  - Loss: {metrics['loss_test'][-1]:.4f}, Accuracy: {metrics['accuracy_test'][-1]:.3f}, F1: {metrics['f1_test'][-1]:.3f}")

    return metrics

In [11]:
# MODEL BUILDING ================================================================

# CNN-based text classification model

class TextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layers
        convolution_layer = nn.Conv1d(in_channels=embedding_tensor.size(1),
                                      out_channels=128,
                                      kernel_size=3,
                                      padding="same")
        activation_layer = nn.ReLU()
        pooling_layer = nn.AdaptiveAvgPool1d(1)
        h_layers = [convolution_layer, activation_layer, pooling_layer]
        self.hidden_layers = nn.ModuleList(h_layers)
        # classification layer
        self.classification_layer = nn.Linear(in_features=128, out_features=1)

    # define forward pass
    def forward(self, x):
        x = self.embedding_layer(x).permute(0, 2, 1)

        for layer in self.hidden_layers:
            x = layer(x)

        x = x.squeeze(2)

        x = self.classification_layer(x)
        return x

model_cnn = TextClassificationModel(embedding_tensor=embedding_tensor)

In [12]:
# Train the model
num_epochs = 10
hist_cnn = train(model_cnn, num_epochs=10, train_dl=train_dl, test_dl=test_dl, use_lengths=False)
torch.save(model_cnn, "./models/cnn_model_full.pth")

Epoch 1/10 Training...


                                                           

Epoch 1/10 Evaluating...


                                                             

Epoch 1/10 Summary:
    Train - Loss: 0.5440, Accuracy: 0.743, F1: 0.438
    Test  - Loss: 0.4818, Accuracy: 0.782, F1: 0.542
Epoch 2/10 Training...


                                                          

KeyboardInterrupt: 

In [16]:
# EXTENSION 1: RNN =====================================================================

class RNNTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.rnn_layer = nn.RNN(input_size=embedding_tensor.size(1),
                                hidden_size=32,
                                num_layers=1, # increase to stack RNNs
                                batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, h_t = self.rnn_layer(x) # o_t includes the outputs,
                                     # h_t the hidden state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_rnn = RNNTextClassificationModel(embedding_tensor=embedding_tensor)
hist_rnn = train(model_rnn, num_epochs, train_dl, test_dl, use_lengths=True) # fluctuating f1 scores, exploding gradients
torch.save(model_rnn, "./models/rnn_model_full.pth")

Epoch 1/10 Training...


                                                           

Epoch 1/10 Evaluating...


                                                              

Epoch 1/10 Summary:
    Train - Loss: 0.5834, Accuracy: 0.664, F1: 0.130
    Test  - Loss: 0.4936, Accuracy: 0.734, F1: 0.424
Epoch 2/10 Training...


                                                           

Epoch 2/10 Evaluating...


                                                              

Epoch 2/10 Summary:
    Train - Loss: 0.4494, Accuracy: 0.779, F1: 0.630
    Test  - Loss: 0.5444, Accuracy: 0.758, F1: 0.708
Epoch 3/10 Training...


                                                           

Epoch 3/10 Evaluating...


                                                              

Epoch 3/10 Summary:
    Train - Loss: 0.4080, Accuracy: 0.811, F1: 0.702
    Test  - Loss: 0.3665, Accuracy: 0.841, F1: 0.744
Epoch 4/10 Training...


                                                           

Epoch 4/10 Evaluating...


                                                              

KeyboardInterrupt: 

In [17]:
# EXTENSION 2: LSTM =====================================================================

class LSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                  hidden_size=32,
                                  num_layers=1,
                                  batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.lstm_layer(x) # c_t the cell state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_lstm = LSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist_lstm = train(model_lstm, num_epochs=10, train_dl=train_dl, test_dl=test_dl, use_lengths=True)
torch.save(model_lstm, "./models/lstm_model_full.pth")

Epoch 1/10 Training...


                                                           

Epoch 1/10 Evaluating...


                                                             

Epoch 1/10 Summary:
    Train - Loss: 0.5794, Accuracy: 0.650, F1: 0.031
    Test  - Loss: 0.4691, Accuracy: 0.694, F1: 0.235
Epoch 2/10 Training...


                                                           

Epoch 2/10 Evaluating...


                                                             

Epoch 2/10 Summary:
    Train - Loss: 0.4758, Accuracy: 0.748, F1: 0.557
    Test  - Loss: 0.4292, Accuracy: 0.806, F1: 0.690
Epoch 3/10 Training...


                                                          

KeyboardInterrupt: 

In [18]:
# EXTENSION 2.5: STACKING LSTM LAYERS WITH DIFFERENT HIDDEN SIZES =========================

class StackedLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer_1 = nn.LSTM(input_size=embedding_tensor.size(1),
                                    hidden_size=64,
                                    num_layers=1,
                                    batch_first=True)
        self.lstm_layer_2 = nn.LSTM(input_size=64,
                                    hidden_size=32,
                                    num_layers=1,
                                    batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t_1, (h_t_1, c_t_1) = self.lstm_layer_1(x)
        o_t_2, (h_t_2, c_t_2) = self.lstm_layer_2(o_t_1)
        x = h_t_2[-1, :, :]
        x = self.classification_layer(x)
        return x

model_lstm_stacked = StackedLSTMTextClassificationModel(embedding_tensor=embedding_tensor)
hist = train(model_lstm_stacked, num_epochs, train_dl, test_dl, use_lengths=True)
torch.save(model_lstm_stacked, "./models/lstm_stacked_model_full.pth")

Epoch 1/10 Training...


                                                           

Epoch 1/10 Evaluating...


                                                             

Epoch 1/10 Summary:
    Train - Loss: 0.5663, Accuracy: 0.646, F1: 0.001
    Test  - Loss: 0.4683, Accuracy: 0.663, F1: 0.046
Epoch 2/10 Training...


                                                           

Epoch 2/10 Evaluating...


                                                             

Epoch 2/10 Summary:
    Train - Loss: 0.4871, Accuracy: 0.756, F1: 0.537
    Test  - Loss: 0.4791, Accuracy: 0.784, F1: 0.581
Epoch 3/10 Training...


                                                          

KeyboardInterrupt: 

In [19]:
# EXTENSION 4: BI-DIRECTIONAL LSTM ======================================================

class BidirectionalLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.bid_lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                      hidden_size=32,
                                      num_layers=1,
                                      batch_first=True,
                                      bidirectional=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32*2, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.bid_lstm_layer(x)
        x = torch.cat((h_t[-2, :, :],
                       h_t[-1, :, :]), dim=1)
        x = self.classification_layer(x)
        return x

model_bi_lstm = BidirectionalLSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist = train(model_bi_lstm, num_epochs, train_dl, test_dl, use_lengths=True)
torch.save(model_bi_lstm, "./models/bi_lstm_model_full.pth")

Epoch 1/10 Training...


                                                           

Epoch 1/10 Evaluating...


                                                             

Epoch 1/10 Summary:
    Train - Loss: 0.4312, Accuracy: 0.752, F1: 0.479
    Test  - Loss: 0.2345, Accuracy: 0.902, F1: 0.855
Epoch 2/10 Training...


                                                           

Epoch 2/10 Evaluating...


                                                             

Epoch 2/10 Summary:
    Train - Loss: 0.1677, Accuracy: 0.924, F1: 0.886
    Test  - Loss: 0.1813, Accuracy: 0.913, F1: 0.861
Epoch 3/10 Training...


                                                          

KeyboardInterrupt: 

# Transformer & Transfer Learning

In [20]:
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel

# Load checkpoint and tokenizer
checkpoint = "google/bert_uncased_L-2_H-128_A-2"
bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
bert_uncased = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Define the features of the dataset
features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=2, names=['factual', 'misinfo']),
})

df_misinfo_train = df_misinfo_train.reset_index(drop=True)
df_misinfo_test = df_misinfo_test.reset_index(drop=True)

# Convert train and test data to Hugging Face Dataset
dataset_train = Dataset.from_pandas(df_misinfo_train, features=features)
dataset_test = Dataset.from_pandas(df_misinfo_test, features=features)

# Display the first few rows of the training dataset
print(dataset_train[2]) 

# Check the unique values of the 'label' column to ensure the classes are correct
unique_labels = set(dataset_train['label'])
print("Unique label values in training data:", unique_labels)

# Check the mapping of integer labels to class names
print("Class name mapping:", dataset_train.features['label'].int2str)

# Create a Hugging Face DatasetDict
dataset_dict = DatasetDict({
    'train': dataset_train,
    'test': dataset_test
})

# Print the DatasetDict to check its contents
print(f'\n {dataset_dict}')

print(dataset_train[2]) 
print(dataset_dict['train'][2])

{'text': ' President Trump visits Florida hospital, praises first responders following school shooting:  It s very sad something like that could happen, but the job the doctors did, the nurses, the hospital, the first responders, law enforcement   really incredible. A White House statement said that the Trumps were visiting  to pay their respects and thank the medical professionals for their life-saving assistance  in response to shooting.NEW: "The job they ve done is incredible," Pres. Trump says of doctors, first responders as he and first lady Melania Trump meet Parkland shooting victims at Broward Health North Hospital https://t.co/n6Ltn0H0nn pic.twitter.com/gKN8aHbRz4  CBS News (@CBSNews) February 17, 2018POTUS AND FLOTUS THEN MET WITH FLORIDA LAW ENFORCEMENT:After President Trump and First Lady Melania visited with victims, families and the incredible medical teams at Broward Health North   they headed to thank the amazing law enforcement officers at the @BrowardSheriff s Departm

In [25]:
# tokenize ------------------------------------------------------------------------------
def tokenize_function(dataset):
    return bert_tokenizer(dataset["text"], truncation=True, padding="max_length", max_length=512)
    # truncates at 512 for the chosen checkpoint

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets

tokenized_datasets['train'][0]['text']
tokenized_datasets['train'][0]['label']
tokenized_datasets['train'][0]['input_ids']
tokenized_datasets['train'][0]['attention_mask']

# fine-tune -----------------------------------------------------------------------------
training_args = TrainingArguments(output_dir="./transformer_results",
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  num_train_epochs=30,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='f1',
                                  disable_tqdm=False,
                                  use_cpu=False)

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    bert_uncased,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# Save the model and tokenizer after training
trainer.save_model("./models/transformer_results")  
bert_tokenizer.save_pretrained("./models/transformer_results")  

Map: 100%|██████████| 5000/5000 [00:01<00:00, 2928.85 examples/s]
Map: 100%|██████████| 5000/5000 [00:01<00:00, 4676.48 examples/s]
  3%|▎         | 157/4710 [00:46<48:53,  1.55it/s]
Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 1.49MB/s]
                                                  
  3%|▎         | 157/4710 [01:04<48:53,  1.55it/s]

{'eval_loss': 0.22444546222686768, 'eval_accuracy': 0.9278, 'eval_f1': 0.8903068975995139, 'eval_runtime': 17.516, 'eval_samples_per_second': 285.454, 'eval_steps_per_second': 8.963, 'epoch': 1.0}


                                                    
  7%|▋         | 314/4710 [02:06<20:15,  3.62it/s]

{'eval_loss': 0.10185230523347855, 'eval_accuracy': 0.9654, 'eval_f1': 0.9505007153075823, 'eval_runtime': 18.0443, 'eval_samples_per_second': 277.096, 'eval_steps_per_second': 8.701, 'epoch': 2.0}


 10%|▉         | 456/4710 [02:46<19:51,  3.57it/s]  

KeyboardInterrupt: 

In [26]:
# predict -------------------------------------------------------------------------------
bert_uncased.eval()

# Helper function to process data in batches
def batch_predict(model, tokenizer, texts, batch_size=16, max_length=512):
    all_preds = []
    # Check if GPU is available and move model to GPU
    if torch.cuda.is_available():
        model = model.cuda()

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            end = min(start + batch_size, len(texts))
            batch_texts = texts[start:end]

            # Tokenize the batch of texts
            tokenized_batch = tokenizer(batch_texts, truncation=True, padding="max_length",
                                        max_length=max_length, return_tensors="pt")

            # Move tensors to GPU if available
            if torch.cuda.is_available():
                tokenized_batch = {key: value.cuda() for key, value in tokenized_batch.items()}

            # Get predictions
            outputs = model(**tokenized_batch)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_labels = torch.argmax(predictions, dim=1)
            all_preds.extend(predicted_labels.cpu().numpy())

    return all_preds

# Prepare your dataset
disinfo_test_texts = df_misinfo_test["text"].to_list()
true_labels = df_misinfo_test["label"].to_list()

# Make predictions in batches
predicted_labels = batch_predict(bert_uncased, bert_tokenizer, disinfo_test_texts, batch_size=16)

# Evaluate the performance
f1 = f1_score(true_labels, predicted_labels)
acc = accuracy_score(true_labels, predicted_labels)
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")

RuntimeError: Placeholder storage has not been allocated on MPS device!

---
# Climate Tweets

In [27]:
# reading climate df
input_path_climate = "/Users/henrybaker/.cache/kagglehub/datasets/die9origephit/climate-change-tweets/versions/1/Climate change_2022-1-17_2022-7-19.csv"

output_path_climate = "/Users/henrybaker/Documents/repositories/NLP/nlp_project/data/climate-change-tweets.csv"

df_climate = pd.read_csv(input_path_climate)
print(f"Loading dataset from '{input_path_climate}'...")
df_climate.head()

Loading dataset from '/Users/henrybaker/.cache/kagglehub/datasets/die9origephit/climate-change-tweets/versions/1/Climate change_2022-1-17_2022-7-19.csv'...


Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,Lauren Boebert,@laurenboebert,2022-01-17T23:32:38.000Z,Lauren Boebert\n@laurenboebert\n·\nJan 18,The only solution I’ve ever heard the Left pro...,,1683,2259,11.7K,[],https://twitter.com/laurenboebert/status/14832...
1,Catherine,@catherine___c,2022-01-17T22:54:02.000Z,Catherine\n@catherine___c\n·\nJan 17,Climate change doesn’t cause volcanic eruption...,,158,64,762,[],https://twitter.com/catherine___c/status/14832...
2,king Keith,@KaConfessor,2022-01-17T23:51:41.000Z,king Keith\n@KaConfessor\n·\nJan 18,Vaccinated tennis ball boy collapses in the te...,,24,118,159,['https://pbs.twimg.com/ext_tw_video_thumb/148...,https://twitter.com/KaConfessor/status/1483225...
3,PETRIFIED CLIMATE PARENT,@climate_parent,2022-01-17T21:42:04.000Z,PETRIFIED CLIMATE PARENT\n@climate_parent\n·\n...,North America has experienced an average winte...,,15,50,158,[],https://twitter.com/climate_parent/status/1483...
4,Thomas Speight,@Thomas_Sp8,2022-01-17T21:10:40.000Z,Thomas Speight\n@Thomas_Sp8\n·\nJan 17,They're gonna do the same with Climate Change ...,🅾,4,24,127,['https://pbs.twimg.com/profile_images/1544171...,https://twitter.com/Thomas_Sp8/status/14831850...


In [28]:
df_climate_inference = df_climate[['Embedded_text']].rename(columns={'Embedded_text': 'text'})
df_climate_inference['label'] = None

print(f"Train shape {df_climate_inference.shape} \n")
df_climate_inference.head()

Train shape (9050, 2) 



Unnamed: 0,text,label
0,The only solution I’ve ever heard the Left pro...,
1,Climate change doesn’t cause volcanic eruption...,
2,Vaccinated tennis ball boy collapses in the te...,
3,North America has experienced an average winte...,
4,They're gonna do the same with Climate Change ...,


In [None]:
# TOKENISATION ==========================================================================

# Check if the pickle files already exist
# local
CLIMATE_TOKENISED_DIR = './cache/climate_tokenised.pkl'

if os.path.exists(CLIMATE_TOKENISED_DIR):
    print("Tokenized climate tweets pkl files found: loading data...")
    # Load the pre-saved tokenized data
    with open(CLIMATE_TOKENISED_DIR, 'rb') as f:
        climate_tokenised = pickle.load(f)
else:
    print("Pickle files not found. Tokenizating climate tweets...")


    climate_tokenised = batch_tokenize(
        df_climate_inference,
        32,
        lambda text: custom_analyzer(text, trained_tokenizer=misinfo_tokenizer) # NB using misinfo tokenizer - have to be the same
    )

    # Save tokenized train and test data
    with open(CLIMATE_TOKENISED_DIR, 'wb') as f:
        pickle.dump(climate_tokenised, f)

In [None]:
# STEP 1: INPUT PIPELINE ================================================================

# I THINK A LOT OF THIS NOT NEEDED

vocab_idx = vocab_mapping(tokenized_text=climate_tokenised) # is this correct?? ithink not
 
climate_dl = DataLoader(
    dataset=list(zip(climate_tokenised,climate_tokenised["label"])), # THIS SHOULD BE BLANK...
    batch_size=32, 
    shuffle=False, 
    collate_fn=collate_fn)

EMBEDDINGS_FILE_PATH_CLIMATE = "./cache/mapped_pretrained_embeddings_climate.pkl"

if os.path.exists(EMBEDDINGS_FILE_PATH_CLIMATE):
    with open(EMBEDDINGS_FILE_PATH_CLIMATE, 'rb') as f:
        embedding_tensor_climate = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {EMBEDDINGS_FILE_PATH_CLIMATE}. Shape: {embedding_tensor_climate.shape}")
else:
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")

    mapped_pretrained_embeddings_climate = embedding_mapping_fasttext(vocabulary=vocab_idx,
                                                              pre_trained_embeddings=ft)
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings_climate)

    # Save embeddings
    with open(EMBEDDINGS_FILE_PATH_CLIMATE, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {EMBEDDINGS_FILE_PATH_CLIMATE}. Shape: {embedding_tensor_climate.shape}")

