In [21]:
import pickle

import torch
from datasets import ClassLabel, Dataset, concatenate_datasets, load_dataset
from transformers import pipeline
from setfit import AbsaModel

from divide_and_conquer_sentiment import PolaritySentimentModel
from divide_and_conquer_sentiment.aggregation import MLP, MLPAggregator, SawonAggregator
from divide_and_conquer_sentiment.dataloaders import load_kaggle_dataset
from divide_and_conquer_sentiment.subprediction import ABSASubpredictor, ChunkSubpredictor
from divide_and_conquer_sentiment.subprediction.sentence import Chunker
from divide_and_conquer_sentiment.model import DACSModel

from divide_and_conquer_sentiment.evaluation import (
    model_dataset_comparison,
    plot_metrics_per_token_count_bins,
    simple_classification_report,
)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
TRAIN_BATCH_SIZE = 256

# Read & prepare datasets


In [5]:
# Twitter airline-sentiment data fetched from Kaggle.
# NOTE(review): the dict presumably maps source column -> target column
# (`airline_sentiment` becomes `label`), and `val_test_perc` presumably holds the
# validation/test fractions — confirm both against `load_kaggle_dataset`.
twitter_airlines_dataset = load_kaggle_dataset(
    "crowdflower/twitter-airline-sentiment",
    {
        "text": "text",
        "airline_sentiment": "label",
    },
    seed=42,
    val_test_perc=(0.1, 0.2),
)

Dataset URL: https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment


  response_data.getheaders())


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/14640 [00:00<?, ? examples/s]

Map:   0%|          | 0/14640 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/14640 [00:00<?, ? examples/s]

In [6]:
# Headphone review data fetched from Kaggle.
# NOTE(review): the dict presumably maps source column -> target column
# (`COMMENTS` becomes `text`, `RATINGS` becomes `label`), and `val_test_perc`
# presumably holds the validation/test fractions — confirm against `load_kaggle_dataset`.
amazon_headphones_dataset = load_kaggle_dataset(
    "mdwaquarazam/headphone-dataset-review-analysis",
    {
        "COMMENTS": "text",
        "RATINGS": "label",
    },
    seed=42,
    val_test_perc=(0.1, 0.2),
)

Dataset URL: https://www.kaggle.com/datasets/mdwaquarazam/headphone-dataset-review-analysis


  response_data.getheaders())


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1604 [00:00<?, ? examples/s]

Map:   0%|          | 0/1604 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/1604 [00:00<?, ? examples/s]

In [7]:
# Collapses the five SST labels (0-4) into three classes (0-2):
# {0, 1} -> 0, {2} -> 1, {3, 4} -> 2.
SST_LABEL_MAP = {
    0: 0,
    1: 0,
    2: 1,
    3: 2,
    4: 2,
}


def map_sst_label(x):
    """Rewrite ``x["label"]`` in place through ``SST_LABEL_MAP`` and return ``x``.

    Raises ``KeyError`` when the current label is not a key of the map.
    """
    coarse_label = SST_LABEL_MAP[x["label"]]
    x["label"] = coarse_label
    return x


# Load the "SetFit/sst5" dataset, drop its `label_text` column, remap the integer
# labels with `map_sst_label`, and cast `label` to a ClassLabel whose names are
# negative / neutral / positive.
sst_dataset = (
    load_dataset("SetFit/sst5")
    .remove_columns(["label_text"])
    .map(map_sst_label)
    .cast_column("label", ClassLabel(names=["negative", "neutral", "positive"]))
)

Repo card metadata block was not found. Setting CardData to empty.


In [8]:
# Pool the three corpora split by split. The Kaggle-loaded datasets expose their
# validation split as "val", whereas the SST dataset names it "validation".
train_dataset, val_dataset, test_dataset = (
    concatenate_datasets(
        [
            twitter_airlines_dataset[kaggle_split],
            amazon_headphones_dataset[kaggle_split],
            sst_dataset[sst_split],
        ]
    )
    for kaggle_split, sst_split in (("train", "train"), ("val", "validation"), ("test", "test"))
)

In [9]:
# Sanity check: print every validation example whose text is the empty string.
# The text column is read once up front; the previous form evaluated
# `val_dataset["text"]` again on every iteration, re-reading the column each time.
val_texts = val_dataset["text"]
for row_idx, text in enumerate(val_texts):
    if text == "":
        print(val_dataset[row_idx])

{'text': '', 'label': 2}
{'text': '', 'label': 0}
{'text': '', 'label': 2}
{'text': '', 'label': 0}
{'text': '', 'label': 2}
{'text': '', 'label': 2}



# Train MLP on ABSA model

In [None]:
!spacy download en_core_web_lg

In [None]:
# ABSA-based sub-predictor built from two pretrained checkpoints (an "-aspect"
# and a "-polarity" one) together with the `en_core_web_lg` spaCy model.
subpredictor = ABSASubpredictor.from_pretrained(
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-aspect",
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-polarity",
    spacy_model="en_core_web_lg",
)
# NOTE(review): input_size=4 here, while the sentence-level MLP further down uses
# input_size=3 — presumably the ABSA sub-predictions carry one extra feature; confirm.
mlp = MLP(input_size=4, output_size=3, hidden_layer_sizes=(128, 64), lr=0.01)
# The aggregator wraps the MLP and is trained in a later cell.
aggregator = MLPAggregator(mlp)

In [None]:
train_subpreds = subpredictor.predict(train_dataset["text"])
val_subpreds = subpredictor.predict(val_dataset["text"])

In [None]:
# Attach the ABSA sub-predictions as a "subpreds" column.
# This cell rebinds `train_dataset` / `val_dataset`, so on a second execution the
# column would already exist and the axis=1 concatenation would reject the
# duplicate name. Dropping a stale column first keeps the cell re-runnable.
if "subpreds" in train_dataset.column_names:
    train_dataset = train_dataset.remove_columns("subpreds")
if "subpreds" in val_dataset.column_names:
    val_dataset = val_dataset.remove_columns("subpreds")

train_dataset = concatenate_datasets([train_dataset, Dataset.from_dict({"subpreds": train_subpreds})], axis=1)
val_dataset = concatenate_datasets([val_dataset, Dataset.from_dict({"subpreds": val_subpreds})], axis=1)

In [None]:
# Persist both datasets as pickle files; the next cell reads them back from the
# same paths.
for pickle_path, dataset_to_save in (
    ("train_dataset_subpreds.pkl", train_dataset),
    ("val_dataset_subpreds.pkl", val_dataset),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(dataset_to_save, handle)

In [None]:
# Reload the pickled datasets and switch them to torch-formatted output.
# Only unpickle files produced by this notebook: `pickle.load` can execute
# arbitrary code when given an untrusted file.
with open("train_dataset_subpreds.pkl", "rb") as handle:
    train_dataset = pickle.load(handle)
train_dataset = train_dataset.with_format("torch")

with open("val_dataset_subpreds.pkl", "rb") as handle:
    val_dataset = pickle.load(handle)
val_dataset = val_dataset.with_format("torch")

In [None]:
aggregator.train(train_dataset, val_dataset)

# Calculate SAWON predictions

## Train

In [None]:
!spacy download en_core_web_lg

In [10]:
# Only the `.polarity_model` attribute of the loaded ABSA model is kept.
# NOTE(review): the "-polarity" checkpoint id is passed as the first positional
# argument of `AbsaModel.from_pretrained`, and the recorded log shows that same
# checkpoint being loaded twice — presumably once in the aspect-model role.
# Confirm whether the polarity model can be loaded on its own to avoid the extra load.
polarity_model = AbsaModel.from_pretrained(
    "tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-polarity",
    spacy_model="en_core_web_lg",
).polarity_model

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-polarity
Overriding labels in model configuration from None to ['no aspect', 'aspect'].
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: tomaarsen/setfit-absa-bge-small-en-v1.5-restaurants-polarity


In [11]:
polarity_sentiment_model = PolaritySentimentModel(polarity_model)

In [12]:
chunker = Chunker()
sentence_subpredictor = ChunkSubpredictor(chunker, polarity_sentiment_model)

In [13]:
sentences_train_subpreds = sentence_subpredictor.predict(train_dataset["text"])



In [14]:
sentences_val_subpreds = sentence_subpredictor.predict(val_dataset["text"])



In [15]:
# Persist the sentence-level sub-predictions for the train and validation sets
# as pickle files.
for pickle_path, subpreds_to_save in (
    ("sentences_train_subpreds.pkl", sentences_train_subpreds),
    ("sentences_val_subpreds.pkl", sentences_val_subpreds),
):
    with open(pickle_path, "wb") as handle:
        pickle.dump(subpreds_to_save, handle)

In [16]:
# NOTE(review): 0.9 is an unnamed positional argument — presumably a threshold
# or weight of the aggregator; confirm against `SawonAggregator` and consider
# promoting it to a named constant.
sawon = SawonAggregator(polarity_sentiment_model, 0.9)

In [17]:
# Aggregate the sentence-level train sub-predictions; the original texts are
# supplied through the `passages` keyword.
sawon_train_preds = sawon.aggregate(
    sentences_train_subpreds,
    passages=train_dataset["text"],
)

In [18]:
# Aggregate the sentence-level validation sub-predictions; the original texts
# are supplied through the `passages` keyword.
sawon_val_preds = sawon.aggregate(
    sentences_val_subpreds,
    passages=val_dataset["text"],
)

## Evaluate

In [22]:
dacs_sawon = DACSModel(sentence_subpredictor, sawon)

In [23]:
# Run the DACS + SAWON model on each corpus's test split; the raw texts are
# passed both as the input and as the `passages` keyword argument.
(
    twitter_test_dacs_sawon_preds,
    amazon_test_dacs_sawon_preds,
    sst_test_dacs_sawon_preds,
) = (
    dacs_sawon.predict(dataset["test"]["text"], passages=dataset["test"]["text"])
    for dataset in (twitter_airlines_dataset, amazon_headphones_dataset, sst_dataset)
)



In [27]:
type(twitter_airlines_dataset["test"]["label"][0])

int

In [24]:
# Print one classification report per corpus (Twitter, Amazon, SST), in that order.
# NOTE(review): the saved output of this cell is `ValueError: Unknown label values`.
# Presumably the predictions are not in the form `simple_classification_report`
# expects — confirm what `DACSModel.predict` returns before relying on this cell.
for test_split, test_preds in (
    (twitter_airlines_dataset["test"], twitter_test_dacs_sawon_preds),
    (amazon_headphones_dataset["test"], amazon_test_dacs_sawon_preds),
    (sst_dataset["test"], sst_test_dacs_sawon_preds),
):
    print(simple_classification_report(test_split["label"], test_preds))

ValueError: Unknown label values

# Train MLP on sentences

In [28]:
mlp_sentences = MLP(input_size=3, output_size=3, hidden_layer_sizes=(128, 64), lr=0.01)
mlp_aggregator_sentences = MLPAggregator(mlp_sentences)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/pawel.marcinkowski/.pyenv/versions/venv-spacy/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [29]:
def _without_subpreds(dataset):
    """Return `dataset` without a "subpreds" column; returned unchanged if it has none."""
    if "subpreds" in dataset.column_names:
        return dataset.remove_columns("subpreds")
    return dataset


# Attach the sentence-level sub-predictions as a "subpreds" column.
# If the ABSA section above has been executed, `train_dataset` / `val_dataset`
# already carry an ABSA "subpreds" column and the axis=1 concatenation would
# reject the duplicate name. Dropping it here makes this section independent of
# whether that section ran.
train_sentences_dataset = concatenate_datasets(
    [_without_subpreds(train_dataset), Dataset.from_dict({"subpreds": sentences_train_subpreds})],
    axis=1,
)
val_sentences_dataset = concatenate_datasets(
    [_without_subpreds(val_dataset), Dataset.from_dict({"subpreds": sentences_val_subpreds})],
    axis=1,
)

Flattening the indices:   0%|          | 0/19914 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/19914 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/2726 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/2726 [00:00<?, ? examples/s]

In [30]:
mlp_aggregator_sentences.train(train_sentences_dataset, val_sentences_dataset)

/Users/pawel.marcinkowski/.pyenv/versions/venv-spacy/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.

  | Name   | Type       | Params | Mode 
----------------------------------------------
0 | layers | ModuleList | 10.6 K | train
----------------------------------------------
10.6 K    Trainable params
0         Non-trainable params
10.6 K    Total params
0.043     Total estimated model params size (MB)
4         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/pawel.marcinkowski/.pyenv/versions/venv-spacy/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/Users/pawel.marcinkowski/.pyenv/versions/venv-spacy/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [37]:
# Scratch check: confirm that an argmax over a prediction tensor, converted with
# `.item()`, yields a built-in `int`. `torch` is already imported in the first
# cell, so it is not re-imported here.
pred = torch.tensor([[0.1, 0.2, 0.3]])
type(torch.argmax(pred.squeeze()).item())

int

In [39]:
type(sst_dataset["test"]["label"][0])

int