<a href="https://colab.research.google.com/github/henrycgbaker/nlp_research_note/blob/main/nlp_research_note.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import kagglehub
import pandas as pd
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import fasttext.util as fasttext_util
import fasttext
from sklearn.metrics import f1_score
from collections import Counter
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import tqdm
import pickle
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import evaluate
from sklearn.metrics import f1_score, accuracy_score

# for GPU
"""
os.environ['PIP_CACHE_DIR'] = '/workspace/workspace/cache'
os.environ['TRANSFORMERS_CACHE'] = '/workspace/workspace/cache'
os.environ['TORCH_CACHE'] = '/workspace/workspace/cache'
os.environ['HF_CACHE'] = '/workspace/workspace/cache'
"""

# for Google Colab

from google.colab import drive


In [2]:
# specify custom functions --------------------------------------------------------------
def custom_tokenizer(text):
    tokenized_text = nlp(text)
    return [tok.text for tok in tokenized_text]

def embedding_mapping_fasttext(vocabulary, pre_trained_embeddings):
    vocab_size = len(vocabulary)
    embedding_dim = pre_trained_embeddings.get_dimension()
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for idx, word in enumerate(vocabulary):
        embedding_matrix[idx] = pre_trained_embeddings.get_word_vector(word)
    return embedding_matrix

# download pretrained embeddings --------------------------------------------------------
# for local
#fasttext_util.download_model('en', if_exists='ignore')

# for Gdrive

drive.mount('/content/drive')
model_path = "/content/drive/MyDrive/cc.en.300.bin"
ft = fasttext.load_model(model_path)


# download spacy model for tokenization -------------------------------------------------
spacy.cli.download("en_core_web_sm")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# Import & process `rumours` dataset

In [3]:
"""
os.chdir("/Users/henrybaker/Documents/repositories/nlp/nlp_research_note")
print("Working Directory:", os.getcwd())

path_rumour = kagglehub.dataset_download("syntheticprogrammer/rumor-detection-acl-2017")
print("Path to rumour dataset files:", path_rumour)

path_climate = kagglehub.dataset_download("die9origephit/climate-change-tweets")
print("Path to dataset files:", path_climate)

# List all files in the directory
files_rumour = os.listdir(path_rumour)
files_climate = os.listdir(path_climate)

# Print the list of files
print("Files in rumour dataset directory:", files_rumour)
print("Files in climate dataset directory:", files_climate)
"""

'\nos.chdir("/Users/henrybaker/Documents/repositories/nlp/nlp_research_note")\nprint("Working Directory:", os.getcwd())\n\npath_rumour = kagglehub.dataset_download("syntheticprogrammer/rumor-detection-acl-2017")\nprint("Path to rumour dataset files:", path_rumour)\n\npath_climate = kagglehub.dataset_download("die9origephit/climate-change-tweets")\nprint("Path to dataset files:", path_climate)\n\n# List all files in the directory\nfiles_rumour = os.listdir(path_rumour)\nfiles_climate = os.listdir(path_climate)\n\n# Print the list of files\nprint("Files in rumour dataset directory:", files_rumour)\nprint("Files in climate dataset directory:", files_climate)\n'

In [4]:
"""
combined_data = []

for numb in [15, 16]:
    path_numb = files_rumour[numb - 15]  # 'twitter15' or 'twitter16' folder path
    label_file_path_numb = os.path.join(path_rumour, path_numb, 'label.txt')
    tweets_file_path_numb = os.path.join(path_rumour, path_numb, 'source_tweets.txt')

    # Read label.txt
    label_dict = {}
    with open(label_file_path_numb, 'r') as file:
        for line in file:
            label, tweet_id = line.strip().split(':')
            label_dict[tweet_id] = label

    # Read source_tweets.txt
    tweets_dict = {}
    with open(tweets_file_path_numb, 'r') as file:
        for line in file:
            tweet_id, tweet_content = line.strip().split('\t', 1)
            tweets_dict[tweet_id] = tweet_content

    # Combine labels with tweets
    for tweet_id, tweet_content in tweets_dict.items():
        if tweet_id in label_dict:
            combined_data.append((label_dict[tweet_id], tweet_content))

    print(f"twitter_{numb}:")

    for label, tweet in combined_data[:5]:
        print(f"    Label: {label}, Tweet: {tweet}")
"""

' \ncombined_data = []\n\nfor numb in [15, 16]:\n    path_numb = files_rumour[numb - 15]  # \'twitter15\' or \'twitter16\' folder path\n    label_file_path_numb = os.path.join(path_rumour, path_numb, \'label.txt\')\n    tweets_file_path_numb = os.path.join(path_rumour, path_numb, \'source_tweets.txt\')\n\n    # Read label.txt\n    label_dict = {}\n    with open(label_file_path_numb, \'r\') as file:\n        for line in file:\n            label, tweet_id = line.strip().split(\':\')\n            label_dict[tweet_id] = label\n\n    # Read source_tweets.txt\n    tweets_dict = {}\n    with open(tweets_file_path_numb, \'r\') as file:\n        for line in file:\n            tweet_id, tweet_content = line.strip().split(\'\t\', 1)\n            tweets_dict[tweet_id] = tweet_content\n\n    # Combine labels with tweets\n    for tweet_id, tweet_content in tweets_dict.items():\n        if tweet_id in label_dict:\n            combined_data.append((label_dict[tweet_id], tweet_content))\n\n    print(f"

crop URL part
chosen specifically because it is twitter

In [5]:
"""
print(f"Total number of entries in combined data: {len(combined_data)} \n")

unique_labels = set(label for label, tweet in combined_data)

print("Unique labels in combined data:")
for label in unique_labels:
    print(f"   ", label)
"""

' \nprint(f"Total number of entries in combined data: {len(combined_data)} \n")\n\nunique_labels = set(label for label, tweet in combined_data)\n\nprint("Unique labels in combined data:")\nfor label in unique_labels:\n    print(f"   ", label)\n'

In [6]:
"""
df_combined = pd.DataFrame(combined_data, columns=["Label", "Tweet"])

print(df_combined.shape)
df_combined.head()
"""

' \ndf_combined = pd.DataFrame(combined_data, columns=["Label", "Tweet"])\n\nprint(df_combined.shape)\ndf_combined.head()\n'

In [7]:
"""
grouped = df_combined.groupby("Label")

# 5 random examples for each label
for label, group in grouped:
    print(f"\nLabel: {label}")
    sample = group.sample(n=5, random_state=42)
    for _, row in sample.iterrows():
        print(f"   Tweet: {row['Tweet']}")
"""

' \ngrouped = df_combined.groupby("Label")\n\n# 5 random examples for each label\nfor label, group in grouped:\n    print(f"\nLabel: {label}")\n    sample = group.sample(n=5, random_state=42)  \n    for _, row in sample.iterrows():\n        print(f"   Tweet: {row[\'Tweet\']}")\n'

---
# Import & process Hugging Face `misinfo` dataset

In [8]:
ds = load_dataset("roupenminassian/twitter-misinformation")
hf_cache_dir = os.getenv("HF_DATASETS_CACHE", "~/.cache/huggingface/datasets")
print(hf_cache_dir)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


~/.cache/huggingface/datasets


need to balance the dataset?

In [9]:
print(ds.shape)

print(ds['train'].column_names)
print(ds['train'][:5])

{'train': (92394, 4), 'test': (10267, 4)}
['Unnamed: 0.1', 'Unnamed: 0', 'text', 'label']
{'Unnamed: 0.1': [34366, 41656, 26726, 81585, 4016], 'Unnamed: 0': [34366, 41656, 26726, 81585, 4016], 'text': ["Local Charlotte, NC news station WSOCTV is reporting that sources tell them dash cameras captured Keith Scott getting out of car and coming towards officers with a gun in his hand:#BREAKING: Sources tell Channel 9 dash camera video shows #KeithScott getting out car, coming toward officers with gun in his hand pic.twitter.com/GGuM2Ow3wk  WSOCTV (@wsoctv) September 21, 2016For a second night, protests over a deadly officer-involved shooting in Charlotte, North Carolina, turned violent, with police firing tear gas and demonstrators throwing objects and trying to damage vehicles.Keith Lamont Scott, a father of seven, was killed by police in an apartment complex parking lot Tuesday as officers looked for another man named in a warrant they were trying to serve. The shooting set off a long ni

In [10]:
ds_cloned = ds.copy()

# DATA PARTITIONING =====================================================================

ds_cloned['train'] = ds_cloned['train'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])
ds_cloned['test'] = ds_cloned['test'].remove_columns(['Unnamed: 0', 'Unnamed: 0.1'])

df_misinfo_train = pd.DataFrame(ds_cloned['train'], columns=["text", "label"])
df_misinfo_test = pd.DataFrame(ds_cloned['test'], columns=["text", "label"])


print(f"Train shape {df_misinfo_train.shape} \n")
print("Training positive vs negative examples: \n", df_misinfo_train.value_counts("label")/df_misinfo_train.shape[0])
print("\nTesting positive vs negative examples: \n",df_misinfo_test.value_counts("label")/df_misinfo_test.shape[0])

df_misinfo_train.head()

Train shape (92394, 2) 

Training positive vs negative examples: 
 label
0    0.652737
1    0.347263
Name: count, dtype: float64

Testing positive vs negative examples: 
 label
0    0.659686
1    0.340314
Name: count, dtype: float64


Unnamed: 0,text,label
0,"Local Charlotte, NC news station WSOCTV is rep...",1
1,The tsunami has started President Obama s Keny...,1
2,The only reality show Donald Trump should have...,1
3,"No Food, No FEMA: Hurricane Michael’s Survivor...",0
4,WASHINGTON (Reuters) - Here are some of the hi...,0


In [11]:
# balance train split -------------------------------------------------------------------

balancer = RandomUnderSampler(random_state=42, sampling_strategy = 'majority')
df_misinfo_train_balanced = pd.concat(balancer.fit_resample(X = df_misinfo_train.iloc[:,[0]],
                                                           y = df_misinfo_train.iloc[:,[1]]),
                                     axis=1).sample(frac = 1).reset_index(drop=True)
df_misinfo_train_balanced.value_counts("label")/df_misinfo_train_balanced.shape[0]
df_misinfo_train_balanced.shape[0]




64170

In [12]:
# TOKENIZATION ==========================================================================
# Define a function for batch processing
def batch_tokenize(data, tokenizer, batch_size=1000):
    """
    Tokenizes the input data in batches to prevent memory issues.

    Parameters:
    - data: The dataset to tokenize (as a pandas Series).
    - tokenizer: The tokenizer function.
    - batch_size: Number of rows to process in each batch.

    Returns:
    - A list of tokenized outputs.
    """
    tokenized_batches = []
    for start in range(0, len(data), batch_size):
        batch = data[start:start + batch_size]
        print(f"Tokenizing batch {start // batch_size + 1}/{len(data) // batch_size + 1}...")
        tokenized_batch = batch.map(tokenizer)
        tokenized_batches.extend(tokenized_batch)
    return tokenized_batches

def custom_analyzer(text, trained_tokenizer):
        tokens = custom_tokenizer(text)
        vocab = trained_tokenizer.vocabulary_
        return [token if token in vocab else "<unk>" for token in tokens]

# Check if the pickle files already exist
train_tokens_file = 'misinfo_train_tokens.pkl'
test_tokens_file = 'misinfo_test_tokens.pkl'

if os.path.exists(train_tokens_file) and os.path.exists(test_tokens_file):
    print("Tokenized text pkl files found: loading data...")
    # Load the pre-saved tokenized data
    with open(train_tokens_file, 'rb') as f:
        misinfo_train_tokens = pickle.load(f)

    with open(test_tokens_file, 'rb') as f:
        misinfo_test_tokens = pickle.load(f)
else:
    print("Pickle files not found. Running tokenization...")

    print("Loading spaCy model...")
    nlp = spacy.load("en_core_web_sm", disable=["tok2vec", "tagger", "parser", "ner", "lemmatizer", "attribute_ruler"])

    misinfo_tokenizer = CountVectorizer(analyzer="word",
                                        tokenizer=custom_tokenizer,
                                        lowercase=False,
                                        min_df=3)

    print("Fitting Tokenizer...")
    misinfo_tokenizer.fit(df_misinfo_train["text"])
    misinfo_tokenizer_analyzer = misinfo_tokenizer.build_analyzer()

    print("Tokenizing Train Data in Batches...")
    misinfo_train_tokens = batch_tokenize(df_misinfo_train["text"], misinfo_tokenizer_analyzer)

    print("Tokenizing Test Data in Batches...")
    misinfo_test_tokens = batch_tokenize(df_misinfo_test["text"],
                                         lambda x: custom_analyzer(x, misinfo_tokenizer))

    # Save tokenized train and test data
    with open(train_tokens_file, 'wb') as f:
        pickle.dump(misinfo_train_tokens, f)

    with open(test_tokens_file, 'wb') as f:
        pickle.dump(misinfo_test_tokens, f)


Tokenized text pkl files found: loading data...


In [13]:
# STEP 1: INPUT PIPELINE ================================================================

# vocabulary indexing -------------------------------------------------------------------
print ("vocab indexing")

def vocab_mapping(tokenized_text):
    token_counts = Counter()
    for text in tokenized_text:
        token_counts.update(text)
    special_tokens = ["<pad>", "<unk>"]
    vocab_tokens = special_tokens + [token for token, freq in token_counts.most_common()]
    vocab = {token: idx for idx, token in enumerate(vocab_tokens)}
    return vocab

vocab_idx = vocab_mapping(tokenized_text=misinfo_train_tokens)

# create data loaders -------------------------------------------------------------------

print("creating data loaders")

def collate_fn(data):
    text_list, label_list = [], []
    for _text, _label in data:
        # integer encoding with truncation
        processed_text = torch.tensor([vocab_idx[token] for token in _text][:max_seq_length],
                                      dtype=torch.int64)
        text_list.append(processed_text)
        label_list.append(_label)
    label_list = torch.tensor(label_list)
    # padding
    padded_text_list = nn.utils.rnn.pad_sequence(text_list,
                                                 batch_first=True,
                                                 padding_value=0)
    return padded_text_list, label_list

max_seq_length = 300
batch_size = 32

train_dl = DataLoader(dataset=list(zip(misinfo_train_tokens,
                                         df_misinfo_train["label"])), # was meant to be balanced
                        batch_size=32, shuffle=True, collate_fn=collate_fn)
test_dl = DataLoader(dataset=list(zip(misinfo_test_tokens,
                                         df_misinfo_test["label"])),
                        batch_size=32, shuffle=False, collate_fn=collate_fn)

print("created data loaders")

# map pretrained fasttext embeddings to vocabulary indices ------------------------------

# Define the file path for the pickle file
pickle_file_path = "mapped_pretrained_embeddings.pkl"

# Check if the pickle file already exists
if os.path.exists(pickle_file_path):
    # If the file exists, load it from the pickle file
    with open(pickle_file_path, 'rb') as f:
        embedding_tensor = pickle.load(f)
    print(f"Emebddings pre-exists: loaded embeddings from {pickle_file_path}. Shape: {embedding_tensor.shape}")
else:
    # If the file does not exist, proceed with creating the embeddings and save them
    # Load pre-trained FastText model
    print("Embeddings do not pre-exist: mapping pretrained fasttext embeddings to vocabulary indices")

    # Map pretrained FastText embeddings to vocabulary indices
    mapped_pretrained_embeddings = embedding_mapping_fasttext(vocabulary=vocab_idx,
                                                              pre_trained_embeddings=ft)

    # Convert mapped embeddings to a tensor
    embedding_tensor = torch.FloatTensor(mapped_pretrained_embeddings)

    # Save the embeddings to a pickle file
    with open(pickle_file_path, 'wb') as f:
        pickle.dump(embedding_tensor, f)
    print(f"Saved embeddings to {pickle_file_path}. Shape: {embedding_tensor.shape}")

vocab indexing
creating data loaders
created data loaders
Emebddings pre-exists: loaded embeddings from mapped_pretrained_embeddings.pkl. Shape: torch.Size([217732, 300])


In [14]:
# STEP 2: LOSS FUNCTION AND OPTIMIZER SPECIFICATION =====================================
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# STEP 3: MODEL TRAINING AND EVALUATION =================================================

def train(model, num_epochs, train_dl, test_dl):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Check if GPU is available
    model.to(device)  # Move the model to GPU or CPU

    # Ensure the optimizer uses the correct device (it should automatically use the same device as the model)
    optimizer = torch.optim.Adam(model.parameters())  # Assuming Adam optimizer here

    loss_hist_train = [0] * num_epochs
    accuracy_hist_train = [0] * num_epochs
    f1_hist_train = [0] * num_epochs
    loss_hist_test = [0] * num_epochs
    accuracy_hist_test = [0] * num_epochs
    f1_hist_test = [0] * num_epochs

    # train model
    for epoch in range(num_epochs):
        model.train()  # set training mode
        all_train_preds = []
        all_train_labels = []
        print(f"Epoch {epoch + 1}/{num_epochs} Training...")
        for batch_idx, (x_batch, y_batch) in enumerate(train_dl):
            # Move data to device (GPU or CPU)
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            # forward pass
            pred = model(x_batch)[:, 0]  # generate predictions
            loss = loss_fn(pred, y_batch.float())  # compute loss

            # backward pass
            loss.backward()  # compute gradients
            optimizer.step()  # update parameters
            optimizer.zero_grad()  # reset gradients

            # evaluate train
            loss_hist_train[epoch] += loss.item() * y_batch.size(0)
            is_correct = ((pred >= 0.5).float() == y_batch).float()
            accuracy_hist_train[epoch] += is_correct.sum()
            all_train_preds.extend((pred >= 0.5).cpu().numpy())
            all_train_labels.extend(y_batch.cpu().numpy())

            # Print batch progress
            if (batch_idx + 1) % 1000 == 0 or (batch_idx + 1) == len(train_dl):
                print(f"    Batch {batch_idx + 1}/{len(train_dl)}: "
                      f"Loss: {loss.item():.4f}")

        # record epoch progress
        loss_hist_train[epoch] /= len(train_dl.dataset)
        accuracy_hist_train[epoch] /= len(train_dl.dataset)
        f1_hist_train[epoch] = f1_score(all_train_labels, all_train_preds)

        # evaluate model
        model.eval()  # set evaluation mode
        all_test_preds = []
        all_test_labels = []
        print(f"Epoch {epoch + 1}/{num_epochs} Evaluating...")
        with torch.no_grad():
            for batch_idx, (x_batch, y_batch) in enumerate(test_dl):
                # Move data to device (GPU or CPU)
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)

                # forward pass
                pred = model(x_batch)[:, 0]
                loss = loss_fn(pred, y_batch.float())

                # evaluate test
                loss_hist_test[epoch] += loss.item() * y_batch.size(0)
                is_correct = ((pred >= 0.5).float() == y_batch).float()
                accuracy_hist_test[epoch] += is_correct.sum()
                all_test_preds.extend((pred >= 0.5).cpu().numpy())
                all_test_labels.extend(y_batch.cpu().numpy())

                # Print batch progress
                if (batch_idx + 1) % 1000 == 0 or (batch_idx + 1) == len(test_dl):
                    print(f"    Batch {batch_idx + 1}/{len(test_dl)}: "
                          f"Loss: {loss.item():.4f}")

        # record epoch progress
        loss_hist_test[epoch] /= len(test_dl.dataset)
        accuracy_hist_test[epoch] /= len(test_dl.dataset)
        f1_hist_test[epoch] = f1_score(all_test_labels, all_test_preds)

        # Print epoch summary
        print(f"Epoch {epoch + 1}/{num_epochs} Summary:")
        print(f"    Train - Accuracy: {accuracy_hist_train[epoch]:.3f}, F1: {f1_hist_train[epoch]:.3f}")
        print(f"    Test  - Accuracy: {accuracy_hist_test[epoch]:.3f}, F1: {f1_hist_test[epoch]:.3f}")

    return [loss_hist_train, loss_hist_test, accuracy_hist_train,
            accuracy_hist_test, f1_hist_train, f1_hist_test]

NameError: name 'model' is not defined

In [None]:
# STEP 4: MODEL BUILDING ================================================================
# CNN-based text classification model

class TextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layers
        convolution_layer = nn.Conv1d(in_channels=embedding_tensor.size(1),
                                      out_channels=128,
                                      kernel_size=3,
                                      padding="same")
        activation_layer = nn.ReLU()
        pooling_layer = nn.AdaptiveAvgPool1d(1)
        h_layers = [convolution_layer, activation_layer, pooling_layer]
        self.hidden_layers = nn.ModuleList(h_layers)
        # classification layer
        self.classification_layer = nn.Linear(in_features=128, out_features=1)

    # define forward pass
    def forward(self, x):
        x = self.embedding_layer(x).permute(0, 2, 1)

        for layer in self.hidden_layers:
            x = layer(x)

        x = x.squeeze(2)

        x = self.classification_layer(x)
        return x

model = TextClassificationModel(embedding_tensor=embedding_tensor)
model

# Train the model
num_epochs = 10
hist_cnn = train(model, num_epochs, train_dl, test_dl)

In [None]:
# EXTENSION 1: RNN =====================================================================

class RNNTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.rnn_layer = nn.RNN(input_size=embedding_tensor.size(1),
                                hidden_size=32,
                                num_layers=1, # increase to stack RNNs
                                batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, h_t = self.rnn_layer(x) # o_t includes the outputs,
                                     # h_t the hidden state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model_rnn = RNNTextClassificationModel(embedding_tensor=embedding_tensor)
model

hist_rnn = train(model, num_epochs, train_dl, test_dl) # fluctuating f1 scores, exploding gradients

In [None]:
# EXTENSION 2: LSTM =====================================================================

class LSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                  hidden_size=32,
                                  num_layers=1,
                                  batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.lstm_layer(x) # c_t the cell state at the last time step
        x = h_t[-1, :, :] # extract from last layer (in case of num_layers > 1)
        x = self.classification_layer(x)
        return x

model = LSTMTextClassificationModel(embedding_tensor=embedding_tensor)
model
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
hist = train(model, num_epochs, train_dl, test_dl) # better but not great

In [None]:
# EXTENSION 2.5: STACKING LSTM LAYERS WITH DIFFERENT HIDDEN SIZES =========================

class StackedLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.lstm_layer_1 = nn.LSTM(input_size=embedding_tensor.size(1),
                                    hidden_size=64,
                                    num_layers=1,
                                    batch_first=True)
        self.lstm_layer_2 = nn.LSTM(input_size=64,
                                    hidden_size=32,
                                    num_layers=1,
                                    batch_first=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t_1, (h_t_1, c_t_1) = self.lstm_layer_1(x)
        o_t_2, (h_t_2, c_t_2) = self.lstm_layer_2(o_t_1)
        x = h_t_2[-1, :, :]
        x = self.classification_layer(x)
        return x

model = StackedLSTMTextClassificationModel(embedding_tensor=embedding_tensor)
model
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
hist = train(model, num_epochs, train_dl, test_dl)

In [None]:
# EXTENSION 4: BI-DIRECTIONAL LSTM ======================================================

class BidirectionalLSTMTextClassificationModel(nn.Module):
    # create layers
    def __init__(self, embedding_tensor):
        super().__init__()
        # input layer
        self.embedding_layer = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
        # hidden layer
        self.bid_lstm_layer = nn.LSTM(input_size=embedding_tensor.size(1),
                                      hidden_size=32,
                                      num_layers=1,
                                      batch_first=True,
                                      bidirectional=True)
        # classification layer
        self.classification_layer = nn.Linear(in_features=32*2, out_features=1)

    # define forward pass
    def forward(self, x, lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(x,
                                              lengths.cpu().numpy(),
                                              enforce_sorted=False,
                                              batch_first=True)
        o_t, (h_t, c_t) = self.bid_lstm_layer(x)
        x = torch.cat((h_t[-2, :, :],
                       h_t[-1, :, :]), dim=1)
        x = self.classification_layer(x)
        return x

model = BidirectionalLSTMTextClassificationModel(embedding_tensor=embedding_tensor)
model
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
hist = train(model, num_epochs, train_dl, test_dl)

# Transformer

In [None]:
# load checkpoint -----------------------------------------------------------------------
checkpoint = "google/bert_uncased_L-2_H-128_A-2" # aka BERT-Tiny
# model card: https://huggingface.co/google/bert_uncased_L-2_H-128_A-2

# load corresponding tokenizer ----------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# load corresponding model with binary classification head ------------------------------
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = 2)
# to see other heads, type transformers.AutoModel and see autocomplete


# APPLYING A MODEL WITHOUT FINE-TUNING (ignoring the warning) ===========================

# process texts and apply model ---------------------------------------------------------
tokenized_texts = tokenizer(df_misinfo_test["text"].to_list(), truncation=True,
                            padding="max_length", max_length=512, return_tensors="pt")

# predict and evaluate ------------------------------------------------------------------
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_texts)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predicted_labels = torch.argmax(predictions, dim=1)
true_labels = torch.tensor(df_misinfo_test["label"].to_list())
f1 = f1_score(true_labels.numpy(), predicted_labels.numpy())
acc = accuracy_score(true_labels.numpy(), predicted_labels.numpy())
# obviously bad and erratic performance as model is not tailored to the task at hand
f1
acc


## Transfer Learning

In [None]:
# convert train and test data to hugging face Dataset -----------------------------------
features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(num_classes=2, names=['not_housing', 'housing']),
})
dataset_train = Dataset.from_pandas(df_misinfo_train, features=features)
dataset_test = Dataset.from_pandas(df_misinfo_test, features=features)

# create a hugging face DatasetDict -----------------------------------------------------
dataset_dict = DatasetDict({
    'train': dataset_train,
    'test': dataset_test
})
print(dataset_dict)

In [None]:
# tokenize ------------------------------------------------------------------------------
def tokenize_function(dataset):
    return tokenizer(dataset["text"], truncation=True, padding="max_length", max_length=512)
    # truncates at 512 for the chosen checkpoint

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)
tokenized_datasets

tokenized_datasets['train'][0]['text']
tokenized_datasets['train'][0]['label']
tokenized_datasets['train'][0]['input_ids']
tokenized_datasets['train'][0]['attention_mask']

# fine-tune -----------------------------------------------------------------------------
training_args = TrainingArguments(output_dir="./results",
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=32,
                                  per_device_eval_batch_size=32,
                                  num_train_epochs=30,
                                  load_best_model_at_end=True,
                                  metric_for_best_model='f1',
                                  disable_tqdm=True,
                                  use_cpu=True)

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

# predict -------------------------------------------------------------------------------
model.eval()
with torch.no_grad():
    outputs = model(**tokenized_texts)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
predicted_labels = torch.argmax(predictions, dim=1)
true_labels = torch.tensor(df_misinfo_test["label"].to_list())
f1 = f1_score(true_labels.numpy(), predicted_labels.numpy())
acc = accuracy_score(true_labels.numpy(), predicted_labels.numpy())
f1
acc

---

In [None]:
# reading climate df
input_path_climate = "/Users/henrybaker/.cache/kagglehub/datasets/die9origephit/climate-change-tweets/versions/1/Climate change_2022-1-17_2022-7-19.csv"

output_path_climate = "/Users/henrybaker/Documents/repositories/NLP/nlp_project/data/climate-change-tweets.csv"

df_climate = pd.read_csv(input_path_climate)
print(f"Loading dataset from '{input_path_climate}'...")
df_climate.head()