# Power Identification with NN and RNN

This is a binary classification task: given the text of a parliamentary speech, determine whether its speaker belongs to the governing coalition or to the opposition.

- [Link to the task](https://touche.webis.de/clef24/touche24-web/ideology-and-power-identification-in-parliamentary-debates.html)
- [Full project](https://github.com/daschablume/power-identification?tab=readme-ov-file)



In [None]:
#%%
from pathlib import Path
import torch
from typing import Self
import csv
import numpy as np
import datetime
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from torch import nn
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix
from tqdm import tqdm

from model_samples.torch.nn import NNClassifier
from model_samples.torch.rnn import PositionalEncoder, RNNClassifier
from model_samples.utils import TrainConfig

In [None]:
class RawDataset():
    """Class to hold raw data load directly from the tsv files.
    """
    def __init__(self, ids: list[str], speakers: list[str], texts: list[str], labels: list[int]) -> None:
        assert len(ids) == len(speakers) == len(texts) == len(labels), "All arrays must have the same length"
        self.ids = ids
        self.speakers = speakers
        self.texts = texts
        self.labels = labels

    def subset(self, index_list: list[int]):

        data = RawDataset(
            [self.ids[idx] for idx in index_list],
            [self.speakers[idx] for idx in index_list],
            [self.texts[idx] for idx in index_list],
            [self.labels[idx] for idx in index_list],
        )

        return data

    def __getitem__(self, index: int):
        return (self.ids[index], self.speakers[index], self.texts[index], self.labels[index])

    def __add__(self, other: Self):
        return RawDataset(
            self.ids + other.ids,
            self.speakers + other.speakers,
            self.texts + other.texts,
            self.labels + other.labels
        )

    def __iter__(self):
        for data in zip(self.ids, self.speakers, self.texts, self.labels):
            yield data

    def __len__(self):
        return len(self.ids)

def load_data(file_path) -> RawDataset:
    """Load one tab-separated file and return its columns as a ``RawDataset``.

    Parameters
    ----------
    file_path
        Location of the file; passed unchanged to ``pandas.read_csv``. The
        file must provide the columns ``id``, ``speaker``, ``text`` and
        ``label``.

    Returns
    -------
    RawDataset
        The four columns as plain Python lists, in file order.
    """
    frame = pd.read_csv(file_path, sep="\t")
    # RawDataset is declared over lists; passing pandas Series would make
    # `dataset_a + dataset_b` add element-wise instead of concatenating.
    columns = [frame[name].tolist() for name in ("id", "speaker", "text", "label")]
    return RawDataset(*columns)


class EncodedDataset(Dataset):
    """Torch ``Dataset`` that pairs each encoded input with its label.

    Indexing or iterating yields ``(input, label)`` tuples; the pairs are
    built once at construction time and stored in ``data_``.
    """
    def __init__(
            self,
            inputs: torch.Tensor,
            labels: torch.Tensor,
        ) -> None:
        super().__init__()
        assert len(inputs) == len(labels), "Inputs and labels have different length"
        # Pair up row i of the inputs with label i.
        self.data_ = [(sample, target) for sample, target in zip(inputs, labels)]

    def __len__(self):
        """Number of stored (input, label) pairs."""
        return len(self.data_)

    def __getitem__(self, index):
        """Return the stored pair(s) at ``index``."""
        return self.data_[index]

    def __iter__(self):
        """Yield the stored pairs in order."""
        yield from self.data_

def encode_torch_data(data: RawDataset, encoder: PositionalEncoder | TfidfVectorizer):
    """Convenience function to create the encoded dataset compatible with torch models.

    Parameters
    ----------
    data : RawDataset
        Raw samples; only ``data.texts`` and ``data.labels`` are read.
    encoder : PositionalEncoder | TfidfVectorizer
        An already fitted encoder; its ``transform`` is applied to the texts.

    Returns
    -------
    EncodedDataset
        Dense float inputs paired with the labels as a tensor.
    """
    # Local import: the file's import cell only brings in `csr_matrix`.
    from scipy.sparse import issparse

    encoded = encoder.transform(data.texts)

    if issparse(encoded):
        # Any scipy sparse container (csr_matrix, csr_array, ...) is densified
        # to a plain ndarray before being handed to torch.
        inputs = torch.from_numpy(encoded.toarray()).float()
    else:
        # Presumably a sparse torch tensor produced by PositionalEncoder —
        # TODO confirm against its `transform` implementation.
        inputs = encoded.to_dense()

    # Convert labels to tensor
    labels = torch.tensor(data.labels)

    return EncodedDataset(inputs, labels)

# Helper functions
def get_average_metrics(result_list: list[dict]) -> dict:
    """Average every metric over a list of per-fold result dictionaries.

    Each dictionary must provide the keys ``accuracy``, ``precision``,
    ``recall``, ``f1`` and ``auc``; any other keys are ignored. The returned
    dictionary maps those five names to their mean across ``result_list``.
    """
    metric_names = ("accuracy", "precision", "recall", "f1", "auc")
    return {
        name: np.mean([fold_result[name] for fold_result in result_list])
        for name in metric_names
    }


def _safe_ratio(numerator: int, denominator: int) -> float:
    """Return ``numerator / denominator``, or ``0.0`` when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate(y_test: np.ndarray, y_pred: np.ndarray, y_prob: np.ndarray) -> dict:
    """Convenience function to evaluate the predictions of a model.

    The function returns a dictionary of metrics:
    - Accuracy
    - Precision
    - Recall
    - F1
    - AUC

    A ratio whose denominator is zero (for example precision when the model
    never predicts the positive class) is reported as ``0.0`` instead of
    raising ``ZeroDivisionError`` or producing NaN.

    Parameters
    ----------
    y_test : np.ndarray
        Labels of the test set
    y_pred : np.ndarray
        Prediction produced by the model
    y_prob : np.ndarray
        Probability array produced by the model

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1`` and ``auc``.

    Notes
    -----
    ``auc`` is computed by ``roc_auc_score(y_test, y_prob)``; any exception
    that call raises propagates unchanged.
    """

    # Single pass over the (prediction, label) pairs. Only exact 0/1 values
    # are counted, and the counters are plain ints regardless of whether the
    # inputs hold Python ints, numpy scalars or one-element tensors.
    true_pos = true_neg = false_pos = false_neg = 0
    for pred, y in zip(y_pred, y_test):
        true_pos += int(pred == 1 and y == 1)
        true_neg += int(pred == 0 and y == 0)
        false_pos += int(pred == 1 and y == 0)
        false_neg += int(pred == 0 and y == 1)
    total = len(y_test)

    accuracy = _safe_ratio(true_pos + true_neg, total)
    precision = _safe_ratio(true_pos, true_pos + false_pos)
    recall = _safe_ratio(true_pos, true_pos + false_neg)
    f1 = _safe_ratio(2 * true_pos, 2 * true_pos + false_pos + false_neg)
    auc = roc_auc_score(y_test, y_prob)

    result = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "auc": auc
    }

    return result


# Classification with Neural Networks

In [None]:
# Load ./data/power-gb-train.tsv once; both cross-validation cells below
# split and read this `data` object.
data = load_data("./data/power-gb-train.tsv")

In [6]:

# K-fold cross-validation of NNClassifier on character n-gram TF-IDF features.
# For every fold: fit the vectorizer on the training part, encode both parts,
# train a fresh model, then score it on the held-out part.
NFOLDS = 5
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# One metrics dictionary per fold is collected here.
result_list = []
# No shuffling is requested, so the split is the same on every run.
kfold = KFold(n_splits=NFOLDS, shuffle=False, random_state=None)

for fold_idx, (train_idx, test_idx) in enumerate(kfold.split(data), start=1):
    print(f"CV fold {fold_idx}")

    print("Train TfidfVectorizer")
    # Character 3- to 5-grams, at most 10000 features. The vectorizer is fitted
    # on the training fold only, so the held-out texts do not influence it.
    encoder = TfidfVectorizer(max_features=10000, analyzer="char", ngram_range=(3,5), use_idf=True, sublinear_tf=True)
    encoder.fit(data.subset(train_idx).texts)

    # Encode both parts of the fold with the encoder fitted above
    train_data = encode_torch_data(data.subset(train_idx), encoder)
    test_data = encode_torch_data(data.subset(test_idx), encoder)

    # Training configuration
    # NOTE(review): with early_stop=False, violation_limit presumably has no
    # effect — confirm in TrainConfig.
    train_config = TrainConfig(
        num_epochs      = 10,
        early_stop      = False,
        violation_limit = 5
    )

    # Only the training data goes through a (shuffling) DataLoader.
    dataloader = DataLoader(train_data, batch_size=128, shuffle=True)

    # A fresh model per fold; the input width equals the number of fitted
    # TF-IDF features.
    model = NNClassifier(
        input_size=len(encoder.vocabulary_),
        hidden_size=128,
        n_linear_layers=3,
        device=DEVICE
    )

    # Train, measuring wall-clock time for this fold
    t0 = datetime.datetime.now()
    model.fit(dataloader, train_config, disable_progress_bar=True)
    time_elapsed = (datetime.datetime.now() - t0).total_seconds()

    print(f"Fold {fold_idx} train time: {time_elapsed / 60:.4} minutes")


    # Evaluate on the held-out part without tracking gradients
    with torch.no_grad():
        # Rebuild whole-fold tensors from the (input, label) pairs of the dataset.
        X_test_nn = torch.stack([test[0] for test in test_data]).cpu()
        y_test_nn = torch.stack([test[1] for test in test_data]).cpu()
        y_pred_nn = model.predict(X_test_nn)
        # NOTE(review): the raw forward() output is what gets passed to
        # evaluate() as `y_prob`; presumably these are unnormalised scores
        # rather than probabilities — confirm in NNClassifier.
        logits_nn = model.forward(X_test_nn)

    result = evaluate(y_test_nn.cpu(), y_pred_nn.cpu(), logits_nn.cpu())
    # Keep the fold number next to its metrics.
    result_list.append({"fold": str(fold_idx), **result})


# Mean of every metric across the folds, printed with three decimals.
avg_nn_results = get_average_metrics(result_list)
print([f"{key}: {value:.3f}" for key, value in avg_nn_results.items()])



CV fold 1
Train TfidfVectorizer
Run model on GPU

Fold 1 train time: 0.2388 minutes
CV fold 2
Train TfidfVectorizer
Run model on GPU

Fold 2 train time: 0.1972 minutes
CV fold 3
Train TfidfVectorizer
Run model on GPU

Fold 3 train time: 0.1944 minutes
CV fold 4
Train TfidfVectorizer
Run model on GPU

Fold 4 train time: 0.1951 minutes
CV fold 5
Train TfidfVectorizer
Run model on GPU

Fold 5 train time: 0.1914 minutes
['accuracy: 0.757', 'precision: 0.785', 'recall: 0.785', 'f1: 0.785', 'auc: 0.834']


# Classification with RNN

In [None]:

# K-fold cross-validation of RNNClassifier (LSTM) on the raw texts.
# For every fold: fit a PositionalEncoder on the training texts, train a fresh
# model on batches of raw samples, then score it on the held-out part.
NFOLDS = 5
# Use the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# One metrics dictionary per fold is collected here.
result_list = []
kfold = KFold(n_splits=NFOLDS, shuffle=False)

for fold_idx, (train_idx, test_idx) in enumerate(kfold.split(data), start=1):
    print(f"CV fold {fold_idx}")

    # The vectorizer is never fitted; it only supplies a tokenizer function.
    # NOTE(review): build_tokenizer() presumably ignores analyzer / ngram_range /
    # max_features and splits on the default token pattern — confirm that a
    # word-level tokenizer is what is intended here.
    chars_encoder = TfidfVectorizer(max_features=50000, analyzer="char", ngram_range=(3,5), use_idf=True, sublinear_tf=True)
    encoder = PositionalEncoder(tokenizer=chars_encoder.build_tokenizer())
    encoder.fit(data.subset(train_idx).texts)

    # The loaders iterate over raw samples; each batch unpacks into
    # (ids, speakers, texts, labels). The test loader is not shuffled, so its
    # order matches data.subset(test_idx).labels used for scoring below.
    train_dataloader = DataLoader(data.subset(train_idx), batch_size=128, shuffle=True)
    test_dataloader = DataLoader(data.subset(test_idx), batch_size=128, shuffle=False)

    # Prepare baseline config
    train_config = TrainConfig(
        optimizer_params = {'lr': 0.01},
        num_epochs       = 10,
        early_stop       = False,
        violation_limit  = 5
    )

    # Train baseline model (a fresh one per fold)
    model = RNNClassifier(
        rnn_network         = nn.LSTM,
        word_embedding_dim  = 32,
        hidden_dim          = 64,
        bidirectional       = False,
        dropout             = 0,
        encoder             = encoder,
        device              = DEVICE
    )

    t0 = datetime.datetime.now()
    model.fit(train_dataloader, train_config, disable_progress_bar=True)

    time_elapsed = (datetime.datetime.now() - t0).total_seconds()
    print(f"Fold {fold_idx} train time: {time_elapsed / 60:.4} minutes")


    # Evaluate on the CPU, batch by batch, without tracking gradients
    with torch.no_grad():
        model.device = "cpu"
        model.cpu()

        pred_lst = []
        probs_lst = []

        for _, _, raw_inputs, _ in test_dataloader:
            # Encode the batch with the vocabulary learned on the training fold.
            batch_encoder = PositionalEncoder(vocabulary=encoder.vocabulary)
            test_inputs = batch_encoder.fit_transform(raw_inputs).cpu()

            pred_lst.append(model.predict(test_inputs))
            # reshape(-1) rather than squeeze(): a final batch holding a single
            # sample would otherwise collapse to a 0-dim tensor, which
            # torch.concat refuses to concatenate.
            probs_lst.append(model._sigmoid(model.forward(test_inputs)).reshape(-1))

    pred = torch.cat(pred_lst).long().numpy()
    probs = torch.concat(probs_lst).numpy()

    result = evaluate(data.subset(test_idx).labels, pred, probs)
    # Keep the fold number next to its metrics.
    result_list.append({"fold": str(fold_idx), **result})


# Mean of every metric across the folds, printed with three decimals.
avg_rnn_results = get_average_metrics(result_list)
print([f"{key}: {value:.3f}" for key, value in avg_rnn_results.items()])
