<a href="https://colab.research.google.com/github/henrycgbaker/nlp_research_note/blob/main/nlp_research_note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Hertie server
'''
import os
# Set the environment variables
os.environ['HOME_CONFIG'] = './/workspace/workspace'
os.environ['KAGGLE_CONFIG'] = './workspace/workspace/.kaggle'
os.environ['SPACY_CACHE'] = '/workspace/workspace/cache'

# Optionally, check if the environment variables were set correctly
print(os.getenv('HOME_CONFIG'))
print(os.getenv('KAGGLE_CONFIG'))
print(os.getenv('SPACY_CACHE'))
'''

"\nimport os\n# Set the environment variables\nos.environ['HOME_CONFIG'] = './/workspace/workspace'\nos.environ['KAGGLE_CONFIG'] = './workspace/workspace/.kaggle'\nos.environ['SPACY_CACHE'] = '/workspace/workspace/cache'\n\n# Optionally, check if the environment variables were set correctly\nprint(os.getenv('HOME_CONFIG'))\nprint(os.getenv('KAGGLE_CONFIG'))\nprint(os.getenv('SPACY_CACHE'))\n"

In [2]:
import os
import kagglehub
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import spacy
import spacy.cli
from sklearn.feature_extraction.text import CountVectorizer
import fasttext.util as fasttext_util
import fasttext
from sklearn.metrics import f1_score
from collections import Counter
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import tqdm
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
from sklearn.metrics import f1_score, accuracy_score
import warnings
import sys

# For Google Colab (if needed)
# from google.colab import drive

# from other modules:
import sys
sys.path.append('./aux_scripts')

from misinfo_tokenizer import (
    get_trained_tokenizer,
    batch_tokenize,
    custom_analyzer,
    vocab_mapping
)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
warnings.filterwarnings("ignore")
sys.path.append('/scripts/')

In [4]:
# download pretrained embeddings --------------------------------------------------------
# for local
fasttext.util.download_model('en', if_exists='ignore')
model_path = "./cc.en.300.bin"

# for Gdrive
# drive.mount('/content/drive')
# model_path = "/content/drive/MyDrive/cc.en.300.bin"
# ft = fasttext.load_model(model_path)

# download spacy model for tokenization -------------------------------------------------
cache_path = './cache/'
os.makedirs(cache_path, exist_ok=True)
os.environ['SPACY_DATA'] = cache_path
spacy.cli.download("en_core_web_sm")

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


---
# Import & process Hugging Face `misinfo` dataset

In [5]:
ds = load_dataset("roupenminassian/twitter-misinformation")
hf_cache_dir = os.getenv("HF_DATASETS_CACHE", "~/.cache/huggingface/datasets")

print(f' Directory: \n {hf_cache_dir}')
print(f'\n External Structure: \n {ds.shape}')
print(f'\n Internal Structure: \n {ds["train"]}') 

 Directory: 
 ~/.cache/huggingface/datasets

 External Structure: 
 {'train': (92394, 4), 'test': (10267, 4)}

 Internal Structure: 
 Dataset({
    features: ['Unnamed: 0.1', 'Unnamed: 0', 'text', 'label'],
    num_rows: 92394
})


In [6]:
# DATA PARTITIONING =====================================================================
ds_cloned = ds.copy()

ds_cloned['train'] = ds_cloned['train'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])
ds_cloned['test'] = ds_cloned['test'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])

df_misinfo_train = pd.DataFrame(ds_cloned['train'], columns=["text", "label"])
df_misinfo_test = pd.DataFrame(ds_cloned['test'], columns=["text", "label"])

print("PRE-BALANCING:\n")
print(f"Train shape {df_misinfo_train.shape} \n")
print("Training positive vs negative examples: \n", df_misinfo_train.value_counts("label")/df_misinfo_train.shape[0])
print("\nTesting positive vs negative examples: \n",df_misinfo_test.value_counts("label")/df_misinfo_test.shape[0])

# balance train split -------------------------------------------------------------------

balancer = RandomUnderSampler(random_state=42, sampling_strategy = 'majority')
df_misinfo_train_balanced = pd.concat(balancer.fit_resample(X = df_misinfo_train.iloc[:,[0]],
                                                           y = df_misinfo_train.iloc[:,[1]]),
                                     axis=1).sample(frac = 1).reset_index(drop=True)
df_misinfo_train_balanced.value_counts("label")/df_misinfo_train_balanced.shape[0]

print("\nPost-BALANCING:\n")
print(f"Train shape {df_misinfo_train_balanced.shape} \n")
df_misinfo_train_balanced.head()

PRE-BALANCING:

Train shape (92394, 2) 

Training positive vs negative examples: 
 label
0    0.652737
1    0.347263
Name: count, dtype: float64

Testing positive vs negative examples: 
 label
0    0.659686
1    0.340314
Name: count, dtype: float64

Post-BALANCING:

Train shape (64170, 2) 



Unnamed: 0,text,label
0,It s time to stop hitting the snooze button Am...,1
1,WASHINGTON (Reuters) - U.S. government officia...,0
2,Donald Trump s mouth gets him into trouble a l...,1
3,"We are devout Catholics, but my daughter goes ...",1
4,So tell me why my stepdad thought it would he ...,0


In [7]:
# TOKENISATION ==========================================================================

train_tokens_file = './cache/misinfo_train_tokens.pkl'
test_tokens_file = './cache/misinfo_test_tokens.pkl'
tokenizer_file = './cache/misinfo_tokenizer.pkl' 

if os.path.exists(train_tokens_file) and os.path.exists(test_tokens_file):
    print("Tokenized text pkl files found: loading data...")
    # Load pre-saved tokenized data
    with open(train_tokens_file, 'rb') as f:
        misinfo_train_tokens = pickle.load(f)
    with open(test_tokens_file, 'rb') as f:
        misinfo_test_tokens = pickle.load(f)

else:
    print("Pickle files not found. Running tokenization...")

    # 1) Get the trained tokenizer (will create if it doesn't exist)
    #    'df_misinfo_train["text"]' is used to fit the vocabulary
    misinfo_tokenizer = get_trained_tokenizer(
        df_misinfo_train["text"],
        tokenizer_file=tokenizer_file,
        min_df=3
    )

    # 2) Build an analyzer from the trained tokenizer
    #    Alternatively, you can directly use your custom_analyzer
    misinfo_tokenizer_analyzer = misinfo_tokenizer.build_analyzer()

    # 3) Tokenize train data in batches
    print("Tokenizing Train Data in Batches...")
    misinfo_train_tokens = batch_tokenize(
        df_misinfo_train["text"],
        misinfo_tokenizer_analyzer
    )

    # 4) Tokenize test data in batches (using a lambda to leverage custom_analyzer)
    print("Tokenizing Test Data in Batches...")
    misinfo_test_tokens = batch_tokenize(
        df_misinfo_test["text"],
        lambda x: custom_analyzer(x, misinfo_tokenizer)
    )

    # 5) Save the tokenized train and test data
    with open(train_tokens_file, 'wb') as f:
        pickle.dump(misinfo_train_tokens, f)
    with open(test_tokens_file, 'wb') as f:
        pickle.dump(misinfo_test_tokens, f)

print("Done! Train tokens count:", len(misinfo_train_tokens))
print("Test tokens count:", len(misinfo_test_tokens))

Tokenized text pkl files found: loading data...
Done! Train tokens count: 92394
Test tokens count: 10267


In [8]:
# STEP 1: INPUT PIPELINE ================================================================

# vocabulary indexing -------------------------------------------------------------------
print ("Vocab indexing...")

#getting vocab_mapping() from misinfo_tokenizer.py
vocab_idx = vocab_mapping(tokenized_text=misinfo_train_tokens)

# create data loaders -------------------------------------------------------------------

print("Creating data loaders...")

# getting collate_fn() from data_loader_helpers.py
from data_loader_helpers import collate_fn

max_seq_length = 300
batch_size = 32

train_dl = DataLoader(dataset=list(zip(misinfo_train_tokens,
                                         df_misinfo_train_balanced["label"])), 
                        batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(dataset=list(zip(misinfo_test_tokens,
                                         df_misinfo_test["label"])),
                        batch_size=32, shuffle=False, collate_fn=collate_fn)

print("Created data loaders!")

# map pretrained fasttext embeddings to vocabulary indices ------------------------------

# getting embedding_mapping_fasttext() from data_loader_helpers.py
from data_loader_helpers import embedding_mapping_fasttext

# Define the file path for the pickle file
# for local:
embeddings_file_path = "./cache/mapped_pretrained_embeddings.pkl"

# for Hertie GPU:
#pickle_file_path = "/workspace/workspace/mapped_pretrained_embeddings.pkl"

# Check if the pickle file already exists
if os.path.exists(embeddings_file_path):
    # If the file exists, load it from the pickle file
    with open(embeddings_file_path, 'rb') as f:
        embedding_tensor = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {embeddings_file_path}. Shape: {embedding_tensor.shape}")
else:
    # If the file does not exist, proceed with creating the embeddings and save them
    # Load pre-trained FastText model
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")
    
    ft = fasttext.load_model(model_path)

    # Map pretrained FastText embeddings to vocabulary indices
    mapped_pretrained_embeddings = embedding_mapping_fasttext(vocabulary=vocab_idx,
                                                              pre_trained_embeddings=ft)

    # Convert mapped embeddings to a tensor
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings)

    # Save the embeddings to a pickle file
    with open(embeddings_file_path, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {embeddings_file_path}. Shape: {embedding_tensor.shape}")

Vocab indexing...
Creating data loaders...
Created data loaders!
Emebddings pre-exists: loaded embeddings from ./cache/mapped_pretrained_embeddings.pkl. Shape: torch.Size([217732, 300])


In [9]:
# STEP 2: LOSS FUNCTION AND OPTIMIZER SPECIFICATION =====================================
loss_fn = nn.BCEWithLogitsLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001) # moved this within train function

# STEP 3: MODEL TRAINING AND EVALUATION =================================================

def train(model, num_epochs, train_dl, test_dl):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Ensure the optimizer uses the correct device (it should automatically use the same device as the model)
    optimizer = torch.optim.Adam(model.parameters())  # Assuming Adam optimizer here

    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    f1_hist_train = [0] * num_epochs
    loss_hist_test = [0] * num_epochs
    accuracy_hist_test = [0] * num_epochs
    f1_hist_test = [0] * num_epochs

    # train model
    for epoch in range(num_epochs):
        model.train()  # set training mode
        all_train_preds = []
        all_train_labels = []
        print(f"Epoch {epoch + 1}/{num_epochs} Training...")
        for batch_idx, (x_batch, y_batch) in enumerate(train_dl):
            # Move data to device (GPU or CPU)
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            # forward pass
            # pred = model(x_batch)[:, 0]  # FOR CNN & RNN
            loss = loss_fn(pred, y_batch.float())  # compute loss
            lengths = torch.sum(x_batch != 0, dim=1)
            pred = model(x_batch, lengths)[:, 0] # FOR LTSM

            # backward pass
            loss.backward()  # compute gradients
            optimizer.step()  # update parameters
            optimizer.zero_grad()  # reset gradients

            # evaluate train
            loss_hist_train[epoch] += loss.item() * y_batch.size(0)
            is_correct = ((pred >= 0.5).float() == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum()
            all_train_preds.extend((pred >= 0.5).cpu().numpy())
            all_train_labels.extend(y_batch.cpu().numpy())

            # Print batch progress
            if (batch_idx + 1) % 1000 == 0 or (batch_idx + 1) == len(train_dl):
                print(f"    Batch {batch_idx + 1}/{len(train_dl)}: "
                      f"Loss: {loss.item():.4f}")

        # record epoch progress
        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)
        f1_hist_train[epoch] = f1_score(all_train_labels, all_train_preds)

        # evaluate model
        model.eval()  # set evaluation mode
        all_test_preds = []
        all_test_labels = []
        print(f"Epoch {epoch + 1}/{num_epochs} Evaluating...")
        with torch.no_grad():
            for batch_idx, (x_batch, y_batch) in enumerate(test_dl):
                # Move data to device (GPU or CPU)
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # forward pass
                pred = model(x_batch)[:, 0]
                loss = loss_fn(pred, y_batch.float())

                # evaluate test
                loss_hist_test[epoch] += loss.item() * y_batch.size(0)
                is_correct = ((pred >= 0.5).float() == y_batch).float()
                accuracy_hist_test[epoch] += is_correct.sum()
                all_test_preds.extend((pred >= 0.5).cpu().numpy())
                all_test_labels.extend(y_batch.cpu().numpy())

                # Print batch progress
                if (batch_idx + 1) % 1000 == 0 or (batch_idx + 1) == len(test_dl):
                    print(f"    Batch {batch_idx + 1}/{len(test_dl)}: "
                          f"Loss: {loss.item():.4f}")

        # record epoch progress
        loss_hist_test[epoch] /= len(test_dl.dataset)
        accuracy_hist_test[epoch] /= len(test_dl.dataset)
        f1_hist_test[epoch] = f1_score(all_test_labels, all_test_preds)

        # Print epoch summary
        print(f"Epoch {epoch + 1}/{num_epochs} Summary:")
        print(f"    Train - Accuracy: {accuracy_hist_train[epoch]:.3f}, F1: {f1_hist_train[epoch]:.3f}")
        print(f"    Test  - Accuracy: {accuracy_hist_test[epoch]:.3f}, F1: {f1_hist_test[epoch]:.3f}")

    return [loss_hist_train, loss_hist_test, accuracy_hist_train,
            accuracy_hist_test, f1_hist_train, f1_hist_test]

In [10]:
import torch

In [11]:
# STEP 4: MODEL BUILDING ================================================================
# CNN-based text classification model

class TextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layers
        convolution_layer = nn.Conv1d(in_channels=embedding_tensor.size(1),
                                      out_channels=128,
                                      kernel_size=3,
                                      padding="same")
        activation_layer = nn.ReLU()
        pooling_layer = nn.AdaptiveAvgPool1d(1)
        h_layers = [convolution_layer, activation_layer, pooling_layer]
        self.hidden_layers = nn.ModuleList(h_layers)
        # classification layer
        self.classification_layer = nn.Linear(in_features=128, out_features=1)

    # define forward pass
    def forward(self, x):
        x = self.embedding_layer(x).permute(0, 2, 1)

        for layer in self.hidden_layers:
            x = layer(x)

        x = x.squeeze(2)

        x = self.classification_layer(x)
        return x

model_cnn = TextClassificationModel(embedding_tensor=embedding_tensor)

# Train the model
num_epochs = 10
hist_cnn = train(model_cnn, num_epochs, train_dl, test_dl)
torch.save(model_cnn, "cnn_model_full.pth")

Epoch 1/10 Training...


NameError: name 'torch' is not defined

In [34]:
# EXTENSION 1: RNN =====================================================================

class RNNTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.rnn_layer = nn.RNN(input_size=embedding_tensor.size(1),
                                hidden_size=32,
                                num_layers=1, # increase to stack RNNs
                                batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, h_t = self.rnn_layer(x) # o_t includes the outputs,
                                     # h_t the hidden state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_rnn = RNNTextClassificationModel(embedding_tensor=embedding_tensor)

hist_rnn = train(model_rnn, num_epochs, train_dl, test_dl) # fluctuating f1 scores, exploding gradients
torch.save(model_rnn, "rnn_model_full.pth")

Epoch 1/10 Training...
    Batch 1000/2888: Loss: 0.1073
    Batch 2000/2888: Loss: 0.0263
    Batch 2888/2888: Loss: 0.1515
Epoch 1/10 Evaluating...
    Batch 321/321: Loss: 0.0536
Epoch 1/10 Summary:
    Train - Accuracy: 0.961, F1: 0.943
    Test  - Accuracy: 0.959, F1: 0.938
Epoch 2/10 Training...
    Batch 1000/2888: Loss: 0.1465
    Batch 2000/2888: Loss: 0.1635
    Batch 2888/2888: Loss: 0.0102
Epoch 2/10 Evaluating...
    Batch 321/321: Loss: 0.0463
Epoch 2/10 Summary:
    Train - Accuracy: 0.962, F1: 0.945
    Test  - Accuracy: 0.960, F1: 0.941
Epoch 3/10 Training...
    Batch 1000/2888: Loss: 0.1117
    Batch 2000/2888: Loss: 0.1026
    Batch 2888/2888: Loss: 0.0168
Epoch 3/10 Evaluating...
    Batch 321/321: Loss: 0.0470
Epoch 3/10 Summary:
    Train - Accuracy: 0.964, F1: 0.948
    Test  - Accuracy: 0.962, F1: 0.944
Epoch 4/10 Training...
    Batch 1000/2888: Loss: 0.0475
    Batch 2000/2888: Loss: 0.1583
    Batch 2888/2888: Loss: 0.1525
Epoch 4/10 Evaluating...
    Batch 

In [40]:
# EXTENSION 2: LSTM =====================================================================

class LSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                  hidden_size=32,
                                  num_layers=1,
                                  batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.lstm_layer(x) # c_t the cell state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_lstm = LSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist = train(model_lstm, num_epochs, train_dl, test_dl) # better but not great
torch.save(model_lstm, "lstm_model_full.pth")

Epoch 1/10 Training...


UnboundLocalError: local variable 'pred' referenced before assignment

In [38]:
# EXTENSION 2.5: STACKING LSTM LAYERS WITH DIFFERENT HIDDEN SIZES =========================

class StackedLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer_1 = nn.LSTM(input_size=embedding_tensor.size(1),
                                    hidden_size=64,
                                    num_layers=1,
                                    batch_first=True)
        self.lstm_layer_2 = nn.LSTM(input_size=64,
                                    hidden_size=32,
                                    num_layers=1,
                                    batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t_1, (h_t_1, c_t_1) = self.lstm_layer_1(x)
        o_t_2, (h_t_2, c_t_2) = self.lstm_layer_2(o_t_1)
        x = h_t_2[-1, :, :]
        x = self.classification_layer(x)
        return x

model_lstm_stacked = StackedLSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist = train(model_lstm_stacked, num_epochs, train_dl, test_dl)
torch.save(model_lstm_stacked, "lstm_stacked_model_full.pth")

Epoch 1/10 Training...


TypeError: StackedLSTMTextClassificationModel.forward() missing 1 required positional argument: 'lengths'

In [None]:
# EXTENSION 4: BI-DIRECTIONAL LSTM ======================================================

class BidirectionalLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.bid_lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                      hidden_size=32,
                                      num_layers=1,
                                      batch_first=True,
                                      bidirectional=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32*2, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.bid_lstm_layer(x)
        x = torch.cat((h_t[-2, :, :],
                       h_t[-1, :, :]), dim=1)
        x = self.classification_layer(x)
        return x

model_bidi_lstm = BidirectionalLSTMTextClassificationModel(embedding_tensor=embedding_tensor)

hist = train(model_bidi_lstm, num_epochs, train_dl, test_dl)
torch.save(model_bidi_lstm, "bidi_lstm_model_full.pth")

# Transformer

In [11]:
# Load checkpoint and tokenizer
checkpoint = "google/bert_uncased_L-2_H-128_A-2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Set model to evaluation mode
model.eval()

# Helper function to process data in batches
def batch_predict(model, tokenizer, texts, batch_size=16, max_length=512):
    all_preds = []
    # Check if GPU is available and move model to GPU
    if torch.cuda.is_available():
        model = model.cuda()

    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            end = min(start + batch_size, len(texts))
            batch_texts = texts[start:end]

            # Tokenize the batch of texts
            tokenized_batch = tokenizer(batch_texts, truncation=True, padding="max_length",
                                        max_length=max_length, return_tensors="pt")

            # Move tensors to GPU if available
            if torch.cuda.is_available():
                tokenized_batch = {key: value.cuda() for key, value in tokenized_batch.items()}

            # Get predictions
            outputs = model(**tokenized_batch)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
            predicted_labels = torch.argmax(predictions, dim=1)
            all_preds.extend(predicted_labels.cpu().numpy())

    return all_preds

# Prepare your dataset
texts = df_misinfo_test["text"].to_list()
true_labels = df_misinfo_test["label"].to_list()

# Make predictions in batches
predicted_labels = batch_predict(model, tokenizer, texts, batch_size=16)

# Evaluate the performance
f1 = f1_score(true_labels, predicted_labels)
acc = accuracy_score(true_labels, predicted_labels)
print(f"F1 Score: {f1}")
print(f"Accuracy: {acc}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/bert_uncased_L-2_H-128_A-2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [1]:
print(f1)
print(acc)

NameError: name 'f1' is not defined

## Transfer Learning

In [12]:
# convert train and test data to hugging face Dataset -----------------------------------
features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=2, names=['not_housing', 'housing']),
})
dataset_train = Dataset.from_pandas(df_misinfo_train, features=features)
dataset_test = Dataset.from_pandas(df_misinfo_test, features=features)

# create a hugging face DatasetDict -----------------------------------------------------
dataset_dict = DatasetDict({
    'train': dataset_train,
    'test': dataset_test
})
print(dataset_dict)

NameError: name 'Features' is not defined

In [None]:
# tokenize ------------------------------------------------------------------------------
def tokenize_function(dataset):
    return tokenizer(dataset["text"], truncation=True, padding="max_length", max_length=512)
    # truncates at 512 for the chosen checkpoint

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets

tokenized_datasets['train'][0]['text']
tokenized_datasets['train'][0]['label']
tokenized_datasets['train'][0]['input_ids']
tokenized_datasets['train'][0]['attention_mask']

# fine-tune -----------------------------------------------------------------------------
training_args = TrainingArguments(output_dir="./results",
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  num_train_epochs=30,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='f1',
                                  disable_tqdm=True,
                                  use_cpu=True)

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# predict -------------------------------------------------------------------------------
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_texts)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predicted_labels = torch.argmax(predictions, dim=1)
true_labels = torch.tensor(df_misinfo_test["label"].to_list())
f1 = f1_score(true_labels.numpy(), predicted_labels.numpy())
acc = accuracy_score(true_labels.numpy(), predicted_labels.numpy())
f1
acc

need to save it