In [None]:
from datasets import load_dataset, DatasetDict
# Hugging Face Hub identifier of the dataset loaded below.
dataset_name = "ccdv/cnn_dailymail"
# Slice applied to every split; '100%' requests each split in full.
data_size = '100%'
splits = ('train', 'validation', 'test')
# One split expression per split name, e.g. "train[:100%]".
split_tuples = [f"{split}[:{data_size}]" for split in splits]
# A list of split expressions is passed, and the results are paired with `splits` by position below.
data_splits = load_dataset(dataset_name,
                           '3.0.0',
                            split=split_tuples,
                            )
# Re-attach the split names to the loaded datasets.
cnn = DatasetDict(dict(zip(splits, data_splits)))
cnn

In [None]:
from transformers import BartTokenizerFast
# BART tokenizer with every special token explicitly overridden to None.
# NOTE(review): presumably so that the tokenizer is used purely for sub-word splitting — confirm.
tokenizer = BartTokenizerFast.from_pretrained(
    "facebook/bart-base",
    bos_token=None,
    eos_token=None,
    sep_token=None,
    cls_token=None,
    unk_token=None,
    pad_token=None,
)

In [None]:
import nltk
from nltk.corpus import stopwords
# Download the stop-word corpus before reading it.
nltk.download('stopwords')
# English stop words as a set, used for membership tests in text_to_tokens.
stop_words = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
# WordNet data is required by the lemmatizer below.
nltk.download('wordnet')
# Shared lemmatizer instance used by clean_token.
wnl = WordNetLemmatizer()

def clean_token(token):
    """Normalise a single tokenizer token.

    Leading 'Ġ' marker characters are stripped first, the remainder is
    lower-cased, and the result is lemmatized with the shared `wnl`
    lemmatizer.

    Args:
        token: Token string as produced by the tokenizer.

    Returns:
        The lemmatized, lower-cased token.
    """
    # Order matters: strip the marker before lower-casing, since lower() changes 'Ġ'.
    without_marker = token.lstrip('Ġ')
    lowered = without_marker.lower()
    return wnl.lemmatize(lowered)

def text_to_tokens(text):
    """Tokenize `text` and keep only cleaned, alphabetic, non-stop-word tokens.

    Args:
        text: Raw text passed to the module-level `tokenizer`.

    Returns:
        List of cleaned tokens (see clean_token) in their original order.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        cleaned = clean_token(raw_token)
        # Drop anything containing non-letters and all stop words.
        if cleaned.isalpha() and cleaned not in stop_words:
            kept.append(cleaned)
    return kept

def tokenize_dataset(x):
    """Add cleaned token lists for the article and the highlights of one example.

    Args:
        x: Mapping with 'article' and 'highlights' text fields; it is modified
            in place.

    Returns:
        The same mapping with 'article_tokens' and 'highlights_tokens' added.
        The "ċ" token is removed from the highlights tokens only.
    """
    x['article_tokens'] = text_to_tokens(x['article'])
    highlight_tokens = text_to_tokens(x['highlights'])
    x['highlights_tokens'] = list(filter(lambda tok: tok != "ċ", highlight_tokens))
    return x

In [None]:
# Apply tokenize_dataset to every example of every split.
cnn_with_tokens = cnn.map(tokenize_dataset)

In [None]:
import gensim
# Build the gensim vocabulary from the training articles only.
dct = gensim.corpora.Dictionary(cnn_with_tokens['train']['article_tokens'])
# Prune the vocabulary in place: document-frequency bounds (no_below / no_above) and a cap of 10,000 tokens.
dct.filter_extremes(no_below=5, no_above=0.3, keep_n=10_000)

In [None]:
def add_gensim_bow(x):
    """Attach gensim bag-of-words vectors for the article and highlights tokens.

    Args:
        x: Mapping with 'article_tokens' and 'highlights_tokens'; it is
            modified in place.

    Returns:
        The same mapping with 'article_gensim_bow' and 'highlights_gensim_bow'
        added, each produced by `dct.doc2bow`.
    """
    column_pairs = (
        ('article_tokens', 'article_gensim_bow'),
        ('highlights_tokens', 'highlights_gensim_bow'),
    )
    for tokens_column, bow_column in column_pairs:
        x[bow_column] = dct.doc2bow(x[tokens_column])
    return x

# Add the bag-of-words columns to every split.
cnn_with_bow = cnn_with_tokens.map(add_gensim_bow)

In [None]:
# Train a 250-topic LDA model on the training articles' bag-of-words vectors using 4 workers.
# NOTE(review): no random_state is passed, so repeated runs are not expected to give identical topics.
lda = gensim.models.ldamulticore.LdaMulticore(cnn_with_bow['train']['article_gensim_bow'], id2word=dct, num_topics=250, workers=4)
# Topic-by-word matrix of the trained model; used later to map topic vectors to word weights.
topic_word_dist = lda.get_topics()

# After LDA training

In [None]:
def get_topic_dist(x):
    """Return the LDA topic distribution of one bag-of-words document.

    Both probability thresholds are passed as 0.

    Args:
        x: Document in gensim bag-of-words form.

    Returns:
        Whatever `lda.get_document_topics` returns for the document.
    """
    topic_distribution = lda.get_document_topics(
        x,
        minimum_probability=0,
        minimum_phi_value=0,
    )
    return topic_distribution

def add_topic_dist(x):
    """Attach LDA topic distributions for the article and the highlights.

    Args:
        x: Mapping with 'article_gensim_bow' and 'highlights_gensim_bow'; it is
            modified in place.

    Returns:
        The same mapping with 'article_topics_distribution' and
        'highlights_topics_distribution' added. Both thresholds of
        `lda.get_document_topics` are passed as 0.
    """
    column_pairs = (
        ('article_gensim_bow', 'article_topics_distribution'),
        ('highlights_gensim_bow', 'highlights_topics_distribution'),
    )
    for bow_column, topics_column in column_pairs:
        x[topics_column] = lda.get_document_topics(
            x[bow_column], minimum_probability=0, minimum_phi_value=0
        )
    return x

# Add the LDA topic-distribution columns to every split.
cnn_with_topic_distribution = cnn_with_bow.map(add_topic_dist)

In [None]:
# If using tf-idf as input data, set filter_tfidf to reduce input space
filter_tfidf = False

if filter_tfidf:
    # Let the vectorizer build its own vocabulary, bounded by document frequency.
    tfidf_vector_args = {'max_df': 0.25, 'min_df': 0.02}
else:
    # Use the tokens of the gensim dictionary as a fixed vocabulary.
    tfidf_vector_args = {'vocabulary': dct.token2id.keys()}

from sklearn.feature_extraction.text import TfidfVectorizer
# The documents are already token lists, so the tokenizer is the identity and lower-casing is disabled.
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             tokenizer = lambda x: x,
                             lowercase=False,
                             ngram_range=(1, 1),
                             **tfidf_vector_args
                            )

# Fit on the training articles only, then show the resulting number of features.
vectorizer.fit(cnn_with_topic_distribution['train']['article_tokens'])
vectorizer.get_feature_names_out().shape

In [None]:
import numpy as np
from transformers import BartTokenizerFast

# NOTE: this rebinds the module-level `tokenizer` that text_to_tokens reads; from here on
# the BART tokenizer is loaded without the special-token overrides used earlier.
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-base')

In [None]:
def clean_tokens(tokens):
    """Apply clean_token to every token.

    Args:
        tokens: Iterable of token strings.

    Returns:
        List with the cleaned tokens, in input order.
    """
    return list(map(clean_token, tokens))


def tokens_to_weights(tokens, weights, missing_weight=1e-9):
    """Look up one weight per token from a per-vocabulary weight vector.

    Tokens are cleaned with clean_tokens and mapped to ids through
    `dct.token2id`. Tokens that are not in the dictionary receive
    `missing_weight`.

    Args:
        tokens: Iterable of raw token strings.
        weights: 1-D array of weights indexed by dictionary token id.
        missing_weight: Weight used for tokens absent from the dictionary.

    Returns:
        List with one weight per input token, in input order.
    """
    cleaned_tokens = clean_tokens(tokens)
    # Unknown tokens map to -1, which indexes the sentinel appended below.
    token_ids = [dct.token2id.get(cleaned, -1) for cleaned in cleaned_tokens]
    # Append the fallback weight so that index -1 resolves to it.
    weights_with_missing = np.append(weights, [missing_weight])
    return [weights_with_missing[token_id] for token_id in token_ids]

# Quick manual check of tokens_to_weights with random weights for 10,000 ids.
tokens_to_weights(['syrian', 'official', 'gun', '<unk>'], np.random.rand(10000))

In [None]:
import numpy as np
from scipy.sparse import coo_matrix
from tqdm import tqdm
from sklearn.preprocessing import normalize


def get_whole_sparse_matrix(bow_iterable, norm=False, num_cols=None):
    """Assemble rows of (column, value) pairs into one sparse float32 matrix.

    Every item of `bow_iterable` becomes one matrix row, including items
    without any pairs, so row `i` of the result always corresponds to item `i`
    of the input.

    Args:
        bow_iterable: Iterable of rows, each an iterable of (column, value)
            pairs. Column indices are cast to int.
        norm: When True, each row is L1-normalised before being returned.
        num_cols: Number of columns of the result. When None, the width is
            inferred as the largest column index seen plus one, which can be
            smaller than the full vocabulary; pass the vocabulary or topic
            count explicitly when a fixed width is required.

    Returns:
        A COO matrix when `norm` is False, otherwise the result of
        `normalize(..., norm="l1")`.

    Raises:
        ValueError: Raised by scipy when a column index does not fit `num_cols`.
    """
    row_indices = []
    col_indices = []
    values = []
    num_rows = 0

    for row_index, row_pairs in enumerate(tqdm(bow_iterable)):
        # Count rows by iteration so that empty documents still get a row.
        num_rows = row_index + 1
        for col_index, value in row_pairs:
            row_indices.append(row_index)
            col_indices.append(int(col_index))
            values.append(value)

    if num_cols is None:
        # Fall back to the widest column seen (0 columns for input without pairs).
        num_cols = max(col_indices) + 1 if col_indices else 0

    coo_mat = coo_matrix(
        (values, (row_indices, col_indices)),
        shape=(num_rows, num_cols),
        dtype=np.float32,
    )
    if norm:
        return normalize(coo_mat, norm="l1")
    return coo_mat

In [None]:
# For Tf-idf input
# Dense float32 tf-idf matrices per split, produced by the fitted vectorizer.
# NOTE: the bag-of-words cell that follows assigns `input_data` as well; run only the variant you need.
input_data = {
    'train':vectorizer.transform(cnn_with_topic_distribution['train']['article_tokens']).astype(np.float32).toarray(),
    'validation':vectorizer.transform(cnn_with_topic_distribution['validation']['article_tokens']).astype(np.float32).toarray(),
    'test': vectorizer.transform(cnn_with_topic_distribution['test']['article_tokens']).astype(np.float32).toarray()
}

In [None]:
# For bow-input set norm=True to get Bow-Freq, else Bow-count
# Dense bag-of-words matrices per split.
# NOTE: this assigns the same `input_data` name as the tf-idf cell, so running both keeps only this one.
input_data = {
    'train':get_whole_sparse_matrix(cnn_with_topic_distribution['train']['article_gensim_bow'], norm=True).toarray(),
    'validation':get_whole_sparse_matrix(cnn_with_topic_distribution['validation']['article_gensim_bow'], norm=True).toarray(),
    'test': get_whole_sparse_matrix(cnn_with_topic_distribution['test']['article_gensim_bow'], norm=True).toarray()
}

In [None]:
# Dense (unnormalised) topic-distribution matrices for the test articles and highlights.
# NOTE(review): neither variable is read again in the visible cells — confirm they are still needed.
test_article_dist = get_whole_sparse_matrix(cnn_with_topic_distribution['test']['article_topics_distribution'], norm=False).toarray()
test_highglight_dist = get_whole_sparse_matrix(cnn_with_topic_distribution['test']['highlights_topics_distribution'], norm=False).toarray()

In [None]:
import numpy as np
from scipy.sparse import coo_matrix

def convert_topic_distribution(x):
    """Extract the second element of every pair into a float32 vector.

    Args:
        x: Iterable of indexable pairs (the element at position 1 is kept).

    Returns:
        1-D float32 NumPy array with one entry per pair, in input order.
    """
    second_elements = (pair[1] for pair in x)
    return np.fromiter(second_elements, dtype=np.float32)

def convert_to_numpy(x):
    """Replace the highlights topic distribution of one example by its value vector.

    Args:
        x: Mapping with a 'highlights_topics_distribution' entry; it is
            modified in place.

    Returns:
        The same mapping, with that entry converted by
        convert_topic_distribution.
    """
    column = 'highlights_topics_distribution'
    x[column] = convert_topic_distribution(x[column])
    return x

In [None]:
# Convert every highlights topic distribution to a value vector (4 processes, kept in memory),
# drop all other columns, and expose the remaining column in NumPy format.
highlights_topics_distributions = cnn_with_topic_distribution.map(
                                                    convert_to_numpy, num_proc=4,keep_in_memory=True,
                                                    remove_columns=['article',
                                                                    'highlights',
                                                                    'id', 
                                                                    'article_tokens', 
                                                                    'highlights_tokens',
                                                                    'highlights_gensim_bow',
                                                                    'article_gensim_bow',
                                                                    'article_topics_distribution']
                                                ).with_format(
                                                    type="numpy",
                                                    columns=[
                                                            "highlights_topics_distribution",
                                                            ]
                                                       )
highlights_topics_distributions

In [None]:
# Training targets per split: the highlights topic distributions.
target_data = {
    'train': highlights_topics_distributions['train']['highlights_topics_distribution'],
    'test': highlights_topics_distributions['test']['highlights_topics_distribution'],
    'validation': highlights_topics_distributions['validation']['highlights_topics_distribution']
}

In [None]:
from torch.utils.data import Dataset

# Due to memory constraints, the per-word weight targets have to be calculated on the fly with SpaceEfficientTopicDataset.

class SpaceEfficientTopicDataset(Dataset):
    """Dataset whose targets are projected through `topic_word_dist` on access.

    The stored labels are multiplied with the module-level `topic_word_dist`
    matrix each time an item is requested instead of being materialised up
    front.
    """

    def __init__(self, data, labels):
        """Store the inputs (`data`) and the per-sample labels (`labels`)."""
        self.data, self.labels = data, labels

    def __len__(self):
        """Return the number of samples, taken from the first dimension of `data`."""
        n_samples = self.data.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return (input, label @ topic_word_dist) for position `idx`."""
        features = self.data[idx]
        projected_target = self.labels[idx] @ topic_word_dist
        return features, projected_target

class TopicDistributionDataset(Dataset):
    """Dataset pairing each input row with its stored label, unchanged."""

    def __init__(self, data, labels):
        """Store the inputs (`data`) and the per-sample labels (`labels`)."""
        self.data, self.labels = data, labels

    def __len__(self):
        """Return the number of samples, taken from the first dimension of `data`."""
        n_samples = self.data.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the (input, label) pair at position `idx`."""
        features = self.data[idx]
        target = self.labels[idx]
        return features, target

In [None]:
from typing import Any, Callable
import numpy as np
import torch
import torchmetrics

def intersect(x, y):
    """Return the values that occur more than once in the concatenation of `x` and `y`.

    When neither tensor contains duplicates of its own, this is the set of
    values present in both.

    Args:
        x: 1-D tensor.
        y: 1-D tensor.

    Returns:
        1-D tensor of the repeated values, in the order given by `unique`.
    """
    merged = torch.cat([x, y])
    distinct_values, occurrences = merged.unique(return_counts=True)
    repeated = occurrences > 1
    return distinct_values[repeated]


class TopKMetric:
    """Compare the top-k entries of predictions and targets.

    Calling an instance returns a dict with the mean number of shared top-k
    indices per row, and the MSE and cross-entropy between the predicted
    values at the target's top-k positions and the target's top-k values.
    """

    def __init__(self,k, device='cuda', **kwargs: Any) -> None:
        """Create the metric.

        Args:
            k: Number of top entries compared per row.
            device: Device the MSE metric and the cross-entropy loss are moved
                to. A CUDA device is replaced by the CPU (with a warning) when
                CUDA is not available, instead of failing in `.to()`.
            **kwargs: Forwarded to the parent initialiser.
        """
        super().__init__(**kwargs)
        self.k = k
        if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
            import warnings

            warnings.warn("CUDA is not available; TopKMetric falls back to the CPU.")
            device = 'cpu'
        self.mse = torchmetrics.MeanSquaredError().to(device)
        self.cross_entropy = torch.nn.CrossEntropyLoss().to(device)

    def __call__(self, input: torch.FloatTensor, target: torch.FloatTensor) -> Any:
        """Compute the top-k comparison for one batch.

        Args:
            input: Predictions; indexed along dimension 1 via `gather`, so a
                2-D (row, entry) layout is assumed.
            target: Targets with the same layout as `input`.

        Returns:
            Dict with the keys "num_overlapping_indices", "mse" and
            "cross_entropy".
        """
        top_input = torch.topk(input, self.k)
        top_target = torch.topk(target, self.k)
        # Average, over the rows, of how many top-k indices both sides share.
        overlapping_indices = np.mean([len(intersect(x,y)) for x, y in zip(top_input.indices, top_target.indices)])
        # Predicted values at the positions of the target's top-k entries.
        top_input_vals = input.gather(1, top_target.indices)
        return {
            "num_overlapping_indices": overlapping_indices,
            "mse": self.mse(top_input_vals, top_target.values),
            "cross_entropy": self.cross_entropy(top_input_vals, top_target.values),
        }

In [None]:
from torch.utils.data import DataLoader

def make_loader(dataset_input, dataset_output, batch_size, shuffle=False):
    """Wrap inputs and targets in a dataset and return a DataLoader over it.

    Args:
        dataset_input: Model inputs, indexable by sample.
        dataset_output: Targets, indexable by sample.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader shuffles the samples.

    Returns:
        A DataLoader with 4 workers and pinned memory.
    """
    # Change the dataset here according to which target to use, Topics or Word Weights
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=4,
        pin_memory=True,
    )
    return DataLoader(
        TopicDistributionDataset(dataset_input, dataset_output),
        **loader_options,
    )

# Batch size shared by all three loaders.
data_config = {
    "batch_size": 500
}
# Only the training loader shuffles; test and validation keep the dataset order.
train_loader = make_loader(input_data['train'], target_data['train'], data_config["batch_size"], True)
test_loader = make_loader(input_data['test'], target_data['test'], data_config["batch_size"])
val_loader = make_loader(input_data['validation'], target_data['validation'], data_config["batch_size"])

In [None]:
from typing import Any
from torch import nn, Tensor
import torch
import logging


class TopicFeedForward(nn.Module):
    """Feed-forward network with a single hidden layer.

    The forward pass applies, in order: optional input dropout, `fc1`,
    optional hidden activation, optional hidden dropout, `fc2` and an optional
    output activation. Optional stages that are not configured are stored as
    None and skipped.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        hidden_activation_function = None,
        output_activation_function = None,
        input_dropout = None,
        hidden_dropout = None
    ):
        """Build the layers.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of outputs.
            hidden_activation_function: Zero-argument factory (e.g. a module
                class) for the hidden activation, or None.
            output_activation_function: Zero-argument factory for the output
                activation, or None.
            input_dropout: Dropout probability applied to the input; None or a
                non-positive value disables it.
            hidden_dropout: Dropout probability applied after the hidden
                activation; None or a non-positive value disables it.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

        # Activations are given as factories and instantiated here.
        self.hidden_activation_function = (
            None if hidden_activation_function is None else hidden_activation_function()
        )
        self.output_activation_function = (
            None if output_activation_function is None else output_activation_function()
        )

        self.hidden_dropout = self._build_dropout(hidden_dropout)
        self.input_dropout = self._build_dropout(input_dropout)

        #self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        #self.to(self.device)

        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.info(
            f"Initialized TopicFeedForward with input_size: {input_size}, hidden_size: {hidden_size}, output_size: {output_size}"
        )

    @staticmethod
    def _build_dropout(rate):
        """Return a Dropout layer for a positive `rate`, otherwise None."""
        if rate is None or not rate > 0:
            return None
        return torch.nn.Dropout(rate)

    def forward(self, x: Tensor):
        """Run `x` through every configured stage and return the result."""
        stages = (
            self.input_dropout,
            self.fc1,
            self.hidden_activation_function,
            self.hidden_dropout,
            self.fc2,
            self.output_activation_function,
        )
        out = x
        for stage in stages:
            # Stages that were not configured are None and leave `out` untouched.
            if stage is not None:
                out = stage(out)
        return out

In [None]:
from typing import Any, Callable
import torch
import wandb
import torchmetrics
from pytorch_lightning import LightningModule
from huggingface_hub import PyTorchModelHubMixin


class Metric:
    """Bundle a metric callable with the settings used when logging it.

    Attributes are stored exactly as passed: `name`, `function`, `on_epoch`
    and `for_steps`.
    """

    def __init__(
        self,
        name: str,
        function: Callable,
        on_epoch: bool,
        for_steps: set[str],
    ) -> None:
        """Store the metric description.

        Args:
            name: Label used to build the logged metric name.
            function: Callable invoked with (prediction, target).
            on_epoch: Forwarded as `on_epoch` when the value is logged.
            for_steps: Step names for which the metric is computed.
        """
        self.name, self.function = name, function
        self.on_epoch, self.for_steps = on_epoch, for_steps


class LitTopicFeedForward(LightningModule, PyTorchModelHubMixin):
    """Lightning module that trains a ``TopicFeedForward`` network.

    Besides the loss, every step logs the configured metrics: MSE and
    cross-entropy for all steps by default, plus any ``custom_metrics``.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        hidden_activation_function=None,
        output_activation_function=None,
        loss: Any = torch.nn.CrossEntropyLoss,
        optimizer: Any = torch.optim.Adam,
        learning_rate: float = 0.001,
        dropout=0.2,
        custom_metrics: list[Metric] = None,
    ):
        """Build the wrapped network, the loss and the metric list.

        Args:
            input_size: Number of input features of the network.
            hidden_size: Number of hidden units of the network.
            output_size: Number of outputs of the network.
            hidden_activation_function: Factory for the hidden activation, or None.
            output_activation_function: Factory for the output activation, or None.
            loss: Loss factory; it is called without arguments.
            optimizer: Optimizer factory; called with the network parameters
                and ``lr`` in ``configure_optimizers``.
            learning_rate: Learning rate handed to the optimizer.
            dropout: Probability used for both the input and the hidden dropout.
            custom_metrics: Extra metrics appended to the default ones, or None.
        """
        super(LitTopicFeedForward, self).__init__()
        self.model = TopicFeedForward(
            input_size,
            hidden_size,
            output_size,
            hidden_activation_function,
            output_activation_function,
            input_dropout=dropout,
            hidden_dropout=dropout,
        )
        self.loss = loss()
        self.optimizer = optimizer
        self.learning_rate = learning_rate

        self._mse = torchmetrics.MeanSquaredError()
        self._cross_entropy = torch.nn.CrossEntropyLoss()

        self._all_steps = {"train", "val", "test"}

        # Metrics that are always logged, for every step type.
        default_metrics = [
            Metric("mse", self._mse, True, self._all_steps),
            Metric("cross_entropy", self._cross_entropy, True, self._all_steps),
        ]
        self.metrics = default_metrics + (
            custom_metrics if custom_metrics is not None else []
        )

        self.save_hyperparameters()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the output of the wrapped network for ``x``."""
        return self.model(x)

    def _compute_metrics(self, batch, step_name: str) -> Any:
        """Run one batch, log the loss and the applicable metrics, and return the loss.

        Args:
            batch: Pair ``(x, y)`` of inputs and targets.
            step_name: Prefix of the logged names; metrics are only computed
                when it is contained in their ``for_steps``.

        Returns:
            The loss of the batch.
        """
        x, y = batch
        y_hat = self(x)
        loss = self.loss(y_hat, y)
        self.log(f"{step_name}_loss", loss, on_epoch=True)
        for metric in self.metrics:
            if step_name in metric.for_steps:
                metric_value = metric.function(y_hat, y)
                # A metric may return several named values at once; each one is logged separately.
                if isinstance(metric_value, dict):
                    for key, value in metric_value.items():
                        self.log(
                            f"{step_name}_{metric.name}_{key}",
                            value,
                            on_epoch=metric.on_epoch,
                        )
                else:
                    self.log(
                        f"{step_name}_{metric.name}",
                        metric_value,
                        on_epoch=metric.on_epoch,
                    )
        return loss

    def training_step(self, batch: Any, batch_idx: int) -> Any:
        """Compute and log the "train" loss and metrics for one batch."""
        return self._compute_metrics(batch, "train")

    def validation_step(self, batch: Any, batch_idx: int) -> Any:
        """Compute and log the "val" loss and metrics for one batch."""
        return self._compute_metrics(batch, "val")

    def test_step(self, batch: Any, batch_idx: int) -> Any:
        """Compute and log the "test" loss and metrics for one batch."""
        return self._compute_metrics(batch, "test")

    def on_test_epoch_end(self) -> None:
        """Export the model to ONNX and log the file as a W&B artifact."""
        # Random single-row input with the configured input size, used as the export sample.
        dummy_input = torch.randn(1, self.hparams.input_size)
        model_filename = f"topic_feedforward_final.onnx"
        self.to_onnx(model_filename, input_sample=dummy_input, export_params=True)

        artifact = wandb.Artifact(name="model_final.ckpt", type="model")
        artifact.add_file(model_filename)
        self.logger.experiment.log_artifact(artifact)

    def configure_optimizers(self) -> Any:
        """Instantiate the optimizer over the wrapped network's parameters."""
        return self.optimizer(self.model.parameters(), lr=self.learning_rate)


In [None]:
import wandb
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torch
import pytorch_lightning as pl
import torch
import numpy as np
# Seed the random number generators for this run.
pl.seed_everything(42)


# Start an offline Weights & Biases run that records the experiment setup.
wandb.init(
    mode='offline',
    name='bow_normalized_in_topics_out',
    project="ff_attempts",

    # track hyperparameters and run metadata
    config={
        "input": "article_bow",
        "target": "topic_dist",
        "input_shape": 10_000,
        # NOTE(review): the key below is spelled "outpt_shape"; it is left as-is because it is a logged config key.
        "outpt_shape": 250
    }
)

wandb_logger = pl.loggers.WandbLogger(log_model="all")

model_config = {
    # Change based on input size
    "input_size": 10_000,
    # The same number of hidden nodes as in the CONFORMER
    "hidden_size": 300,
    # Change based on output size
    "output_size":  250,
    "hidden_activation_function": None,
    "output_activation_function": None,
    "loss": torch.nn.CrossEntropyLoss,
    # None disables both dropout layers of the network.
    "dropout": None,
    "optimizer": torch.optim.Adam,
    # Top-k comparisons are only computed for validation and test steps.
    "custom_metrics": [
        Metric("topk3", TopKMetric(3), True, {'val', 'test'}),
        Metric("topk5", TopKMetric(5), True, {'val', 'test'}),
        Metric("topk10", TopKMetric(10), True, {'val', 'test'}),
    ]
}

# "val_cross_entropy" is the name under which the default cross-entropy metric is logged for validation steps.
stopping_callback = EarlyStopping(monitor="val_cross_entropy", patience=4, verbose=False, mode="min")

training_config= {
    "max_epochs": 15,
    "callbacks": [stopping_callback],
    "log_every_n_steps":25,
    "deterministic":True,
    "detect_anomaly":False,
    "val_check_interval":0.5,
}
trainer = pl.Trainer(
    logger=wandb_logger,
    **training_config
)

model = LitTopicFeedForward(**model_config)

wandb_logger.watch(model, log="all")

# Train with validation, then evaluate on the test loader.
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
trainer.test(model, dataloaders=test_loader)

# Generate weights for CONFORMER

In [None]:
import pandas as pd
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary

def read_test(path="nortsformer/topic_model/lda/NEWTS_test_600.csv"):
    """Load the NEWTS testing set as a pandas DataFrame.

    Args:
        path: Location of the CSV file; its first column is used as the index.

    Returns:
        DataFrame with exactly 600 rows.

    Raises:
        AssertionError: If the file does not contain 600 rows.
    """
    newts_test_frame = pd.read_csv(path, index_col=[0], encoding='utf-8')
    assert len(newts_test_frame) == 600
    return newts_test_frame

def read_train(path="nortsformer/topic_model/lda/NEWTS_train_2400.csv"):
    """Load the NEWTS training set as a pandas DataFrame.

    Args:
        path: Location of the CSV file; its first column is used as the index.

    Returns:
        DataFrame with exactly 2400 rows.

    Raises:
        AssertionError: If the file does not contain 2400 rows.
    """
    newts_train_frame = pd.read_csv(path, index_col=[0], encoding='utf-8')
    assert len(newts_train_frame) == 2400
    return newts_train_frame

In [None]:
# Load both NEWTS splits from their default CSV locations.
newts_test = read_test()
newts_train = read_train()

In [None]:
import datasets
# Convert the DataFrames to Hugging Face datasets without keeping the pandas index as a column.
hf_newts_test = datasets.Dataset.from_pandas(newts_test, preserve_index=False)
hf_newts_train = datasets.Dataset.from_pandas(newts_train, preserve_index=False)
hf_newts_train

In [None]:
def separate_samples(newts_dataset):
    """Split every NEWTS row into two samples, one per sentence/summary pair.

    Args:
        newts_dataset: Iterable of rows with the fields 'AssignmentId',
            'article', 'sentences1', 'summary1', 'sentences2' and 'summary2'.

    Returns:
        Dataset with the columns 'id', 'article', 'sentence' and 'summary';
        the ids are the assignment id followed by "_1" or "_2".
    """
    variants = (
        ("_1", 'sentences1', 'summary1'),
        ("_2", 'sentences2', 'summary2'),
    )
    records = [
        {
            'id': row['AssignmentId'] + id_suffix,
            'article': row['article'],
            'sentence': row[sentence_field],
            'summary': row[summary_field],
        }
        for row in newts_dataset
        for id_suffix, sentence_field, summary_field in variants
    ]
    records_frame = pd.DataFrame(data=records)
    return datasets.Dataset.from_pandas(records_frame)
# One sample per sentence/summary pair, for both NEWTS splits.
newts_cleaned = datasets.DatasetDict(
    {'test':separate_samples(hf_newts_test),
    'train':separate_samples(hf_newts_train)
    }
)
newts_cleaned

In [None]:
def tokenize_newts(x):
    """Tokenize the sample's sentence followed by its article.

    Args:
        x: Mapping with the text fields 'sentence' and 'article'; it is
            modified in place.

    Returns:
        The same mapping with 'sentence_plus_article_tokens' added.
    """
    combined_text = " ".join((x['sentence'], x['article']))
    x['sentence_plus_article_tokens'] = text_to_tokens(combined_text)
    return x

In [None]:
# Tokenize every NEWTS sample using 4 processes.
newts_with_tokens = newts_cleaned.map(tokenize_newts, num_proc=4)
newts_with_tokens

In [None]:
def add_gensim_bow(x):
    """Attach the gensim bag-of-words vector of the combined sentence/article tokens.

    NOTE: this reuses the name of the helper defined earlier for the
    CNN/DailyMail columns, so running this cell rebinds `add_gensim_bow`.

    Args:
        x: Mapping with 'sentence_plus_article_tokens'; it is modified in place.

    Returns:
        The same mapping with 'gensim_bow' added.
    """
    combined_tokens = x['sentence_plus_article_tokens']
    x['gensim_bow'] = dct.doc2bow(combined_tokens)
    return x
# Add the bag-of-words column to both NEWTS splits.
newts_with_bow = newts_with_tokens.map(add_gensim_bow)
newts_with_bow

In [None]:
# Dense, L1-normalised bag-of-words matrices for the NEWTS samples.
# NOTE(review): confirm that the resulting number of columns matches the model's input_size.
train_bow = get_whole_sparse_matrix(newts_with_bow['train']['gensim_bow'], norm=True).toarray()
test_bow = get_whole_sparse_matrix(newts_with_bow['test']['gensim_bow'], norm=True).toarray()

In [None]:
# Dense float32 tf-idf matrices for the NEWTS samples, from the vectorizer fitted on the CNN/DailyMail training articles.
train_tfid = vectorizer.transform(newts_with_bow['train']['sentence_plus_article_tokens']).astype(np.float32).toarray()
test_tfid = vectorizer.transform(newts_with_bow['test']['sentence_plus_article_tokens']).astype(np.float32).toarray()

In [None]:
def get_document_topics_from_lda(x):
    """Attach the LDA topic distribution of the sample's bag-of-words vector.

    Args:
        x: Mapping with 'gensim_bow'; it is modified in place.

    Returns:
        The same mapping with 'document_topics' added. Both thresholds of
        `lda.get_document_topics` are passed as 0.
    """
    bow_vector = x['gensim_bow']
    x['document_topics'] = lda.get_document_topics(
        bow_vector, minimum_probability=0, minimum_phi_value=0
    )
    return x
# Add the LDA topic distribution of every NEWTS sample.
newts_with_lda_topics = newts_with_bow.map(get_document_topics_from_lda)

In [None]:
import torch.nn.functional as F
# If model output is topics
# Switch to evaluation mode so that any dropout layers are inactive during inference.
model.eval()
# No gradients are needed for inference; this also avoids building autograd graphs.
with torch.no_grad():
    # dim=1 normalises over the topic axis of the (sample, topic) output; it is stated
    # explicitly because calling softmax without `dim` is deprecated.
    test_topic_probs = F.softmax(model(torch.from_numpy(test_bow)), dim=1)
    train_topic_probs = F.softmax(model(torch.from_numpy(train_bow)), dim=1)
# Project the topic probabilities onto the vocabulary to obtain per-word weights.
test_result = test_topic_probs.numpy() @ topic_word_dist
train_result = train_topic_probs.numpy() @ topic_word_dist

In [None]:
# If model output is word weights
# NOTE: this assigns the same names as the topic-output cell above, so run only the variant
# that matches the trained model.
test_result = model(torch.from_numpy(test_bow)).detach().numpy()
train_result = model(torch.from_numpy(train_bow)).detach().numpy()

In [None]:
# Wrap the computed weights in a single-column dataset ("tau") and append it column-wise to
# the NEWTS samples, dropping the intermediate token and bag-of-words columns.
test_tau_data = datasets.Dataset.from_dict({"tau": test_result})
test_final = datasets.concatenate_datasets([newts_with_bow['test'], test_tau_data], axis=1).remove_columns(['sentence_plus_article_tokens', 'gensim_bow'])
train_tau_data = datasets.Dataset.from_dict({"tau": train_result})
train_final = datasets.concatenate_datasets([newts_with_bow['train'], train_tau_data], axis=1).remove_columns(['sentence_plus_article_tokens', 'gensim_bow'])

In [None]:
# Bundle both splits and write them to disk.
final_dataset = datasets.DatasetDict(
    {
        'train': train_final,
        'test': test_final
    }
)
final_dataset.save_to_disk("../dataset/NEWTS_with_tau")