In [None]:
pip install pytorch-lightning==2.1.0

Collecting pytorch-lightning==2.1.0
  Downloading pytorch_lightning-2.1.0-py3-none-any.whl (774 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning==2.1.0)
  Downloading torchmetrics-1.4.0.post0-py3-none-any.whl (868 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch-lightning==2.1.0)
  Downloading lightning_utilities-0.11.2-py3-none-any.whl (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.12.0->pytorch-lightning==2.1.0)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.12.0->pytorch-lightning==2.1.0)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#import pandas as pd
#from sklearn.model_selection import train_test_split
#
#def load_json_dataset(path="drive/MyDrive/data/twitter/ts_dataset/training.jsonl"):
#    data = pd.read_json(path, lines=True)
#
#    # Convert labels to numeric
#    label_map = {"NOT_SARCASM": 0, "SARCASM": 1}
#    data["label"] = data["label"].map(label_map)
#
#    # Extract context and response
#    data["text"] ="CLS " + data["context"].apply(lambda x: " ".join(x)) + " SEP " + data["response"]
#
#    data = data[["text", "label"]]
#
#    train_val, test_data = train_test_split(data, test_size=0.1, random_state=42)
#    train_data, val_data = train_test_split(train_val, test_size=0.11, random_state=42)
#
#    return train_data, val_data, test_data
#
#train, val, test = load_json_dataset()
#
#print(train.head())
#print(val.head())
#print(test.head())

                                                   text  label
738   CLS A live look at my emotions watching this g...      1
4050  CLS When I smashed my shoulder 3yrs ago my #re...      0
3758  CLS He was exposed as a fraud and ran away .. ...      0
4187  CLS life always gives you a second chance tomo...      0
2348  CLS Breaking news : Airstrike at Baghdad airpo...      1
                                                   text  label
3536  CLS @USER @USER @USER The ‘ once in a generati...      0
1863  CLS " Military-style " firearms aren't protect...      1
4710  CLS @USER very disappointed at not being able ...      0
3796  CLS @USER @USER I guess it's true when they sa...      0
2011  CLS Constant reminders of the tremendous loss ...      1
                                                   text  label
1501  CLS Blah blah blah . Just listen to this guy ....      1
2586  CLS There ’ s a beautiful #WolfMoon rising ove...      0
2653  CLS #TheBachelor I have no idea why such beaut...

In [None]:

import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import (
    BertModel,
    AdamW,
    get_linear_schedule_with_warmup,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from pytorch_lightning import Trainer


class HeadlinesSarcasmClassifier(pl.LightningModule):
    """BERT encoder followed by one linear layer over the pooled output.

    Args:
        n_classes: Number of output classes of the linear head.
        steps_per_epoch: Optimizer steps in one epoch. Together with
            ``n_epochs`` it sizes the linear learning-rate decay. When either
            is ``None`` the step count is taken from the attached trainer.
        n_epochs: Number of training epochs.
        lr: Peak learning rate handed to the optimizer.
    """

    def __init__(self, n_classes=2, steps_per_epoch=None, n_epochs=None, lr=2e-5):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        # Per-batch predictions/labels collected during testing so that
        # precision/recall/F1 are computed once over the whole test set.
        self._test_preds = []
        self._test_labels = []

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return class logits for a batch.

        ``token_type_ids`` is optional; when ``None`` it is simply forwarded
        as ``None`` to the encoder, which matches the previous call.
        """
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.classifier(output.pooler_output)

    def _shared_step(self, batch):
        """Run the forward pass and cross-entropy loss shared by all step hooks."""
        labels = batch["labels"]
        logits = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch.get("token_type_ids"),  # absent key -> None -> same as before
        )
        loss = F.cross_entropy(logits, labels)
        return loss, logits, labels

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss, _, labels = self._shared_step(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True, batch_size=labels.size(0))
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and accuracy for one batch."""
        loss, logits, labels = self._shared_step(batch)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == labels).float().mean()
        n = labels.size(0)
        self.log("val_loss", loss, prog_bar=True, logger=True, batch_size=n)
        self.log("val_acc", acc, prog_bar=True, logger=True, batch_size=n)
        return {"val_loss": loss, "val_acc": acc}

    def on_test_epoch_start(self):
        """Drop anything left over from a previous test run."""
        self._test_preds.clear()
        self._test_labels.clear()

    def test_step(self, batch, batch_idx):
        """Log test loss/accuracy and stash predictions for epoch-level metrics."""
        loss, logits, labels = self._shared_step(batch)
        preds = torch.argmax(logits, dim=1)
        acc = (preds == labels).float().mean()
        n = labels.size(0)
        self._test_preds.append(preds.detach().cpu())
        self._test_labels.append(labels.detach().cpu())
        self.log("test_loss", loss, prog_bar=True, logger=True, batch_size=n)
        self.log("test_acc", acc, prog_bar=True, logger=True, batch_size=n)
        return {"test_loss": loss, "test_acc": acc}

    def on_test_epoch_end(self):
        """Compute weighted precision/recall/F1 over the full test set.

        Averaging per-batch weighted scores (the previous behaviour) does not
        equal the dataset-level score, so the metrics are computed here from
        all collected predictions instead.
        """
        if not self._test_preds:
            return
        preds = torch.cat(self._test_preds).numpy()
        labels = torch.cat(self._test_labels).numpy()
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="weighted"
        )
        # NOTE(review): with multiple devices each rank only sees its own shard;
        # gather predictions across ranks before trusting these numbers.
        self.log("test_precision", float(precision), prog_bar=True, logger=True)
        self.log("test_recall", float(recall), prog_bar=True, logger=True)
        self.log("test_f1", float(f1), prog_bar=True, logger=True)
        self._test_preds.clear()
        self._test_labels.clear()

    def configure_optimizers(self):
        """Create AdamW plus a linear decay schedule that advances every step."""
        # NOTE(review): newer transformers releases may deprecate this AdamW in
        # favour of torch.optim.AdamW - confirm before upgrading transformers.
        optimizer = AdamW(self.parameters(), lr=self.lr)
        if self.steps_per_epoch is not None and self.n_epochs is not None:
            total_steps = self.steps_per_epoch * self.n_epochs
        else:
            # Fall back to the trainer's own estimate instead of failing on None.
            total_steps = self.trainer.estimated_stepping_batches
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps
        )
        # `total_steps` counts optimizer steps, so the scheduler must be stepped
        # per batch; Lightning's default interval is once per epoch.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
        }


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

def load_json_dataset(
    path="drive/MyDrive/data/twitter/ts_dataset/training.jsonl",
    test_size=0.1,
    val_size=0.11,
    random_state=42,
):
    """Load the JSON-lines dataset and split it into train/validation/test frames.

    Each record must provide ``label`` (``"SARCASM"`` or ``"NOT_SARCASM"``),
    ``context`` (an iterable of strings) and ``response``.

    Args:
        path: Location of the ``.jsonl`` file.
        test_size: Fraction of all rows held out as the test split.
        val_size: Fraction of the remaining rows held out for validation.
        random_state: Seed used for both splits.

    Returns:
        ``(train_data, val_data, test_data)`` DataFrames with the columns
        ``context_text``, ``response_text`` and ``label`` (0 or 1).

    Raises:
        ValueError: If a label outside the known mapping is present.
    """
    data = pd.read_json(path, lines=True)

    # Convert labels to numeric and fail fast on anything unexpected instead
    # of letting NaN labels propagate into training.
    label_map = {"NOT_SARCASM": 0, "SARCASM": 1}
    labels = data["label"].map(label_map)
    unknown = labels.isna()
    if unknown.any():
        bad = sorted(map(str, data.loc[unknown, "label"].unique()))
        raise ValueError(f"Unexpected label value(s) in {path}: {bad}")

    # Keep context and response as separate columns (no concatenation).
    frame = pd.DataFrame(
        {
            "context_text": data["context"].map(" ".join),
            "response_text": data["response"],
            "label": labels.astype("int64"),
        }
    )

    train_val, test_data = train_test_split(
        frame, test_size=test_size, random_state=random_state
    )
    train_data, val_data = train_test_split(
        train_val, test_size=val_size, random_state=random_state
    )

    return train_data, val_data, test_data

# Build the train/validation/test frames from the default dataset path.
train, val, test = load_json_dataset()


In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pytorch_lightning as pl


class HeadlinesSarcasmDataset(Dataset):
    """Tokenized (response, context) pairs with their labels.

    Args:
        data: DataFrame with ``context_text``, ``response_text`` and ``label``
            columns.
        max_token_len: Length every encoded pair is padded/truncated to.
        tokenizer: Optional ready-made tokenizer. When omitted the
            ``bert-base-uncased`` tokenizer is loaded, as before. Passing one
            lets several datasets share a single instance.
    """

    def __init__(self, data, max_token_len=512, tokenizer=None):
        self.data = data
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded tensors and label for row ``index``."""
        # Fetch the row once rather than once per column.
        row = self.data.iloc[index]
        encoding = self.tokenizer(
            row["response_text"],  # first segment
            row["context_text"],   # second segment
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            # The segment ids were requested above but previously discarded;
            # expose them so consumers can tell the two segments apart.
            "token_type_ids": encoding["token_type_ids"].flatten(),
            "labels": torch.tensor(int(row["label"]), dtype=torch.long),
        }


class HeadlinesSarcasmDataModule(pl.LightningDataModule):
    """Wraps the three DataFrame splits and serves DataLoaders over them.

    Args:
        train_data: DataFrame used for training.
        val_data: DataFrame used for validation.
        test_data: DataFrame used for testing.
        batch_size: Batch size of every DataLoader.
        max_token_len: Sequence length forwarded to each dataset.
        num_workers: Worker processes per DataLoader (0 loads in the main
            process, which is the DataLoader default).
    """

    def __init__(
        self,
        train_data,
        val_data,
        test_data,
        batch_size=16,
        max_token_len=512,
        num_workers=0,
    ):
        super().__init__()
        self.train_data = train_data
        self.val_data = val_data
        self.test_data = test_data
        self.batch_size = batch_size
        self.max_token_len = max_token_len
        self.num_workers = num_workers

    def setup(self, stage=None):
        """Build the dataset objects for all three splits."""
        self.train_dataset = HeadlinesSarcasmDataset(
            self.train_data, max_token_len=self.max_token_len
        )
        self.val_dataset = HeadlinesSarcasmDataset(
            self.val_data, max_token_len=self.max_token_len
        )
        self.test_dataset = HeadlinesSarcasmDataset(
            self.test_data, max_token_len=self.max_token_len
        )

    def _make_loader(self, dataset, shuffle=False):
        """Create a DataLoader with this module's batch size and worker count."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        # Only the training split is shuffled.
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val_dataset)

    def test_dataloader(self):
        return self._make_loader(self.test_dataset)



In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
import pytorch_lightning as pl


#class HeadlinesSarcasmDataset(Dataset):
#    def __init__(self, data, max_token_len=256):
#        self.data = data
#        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
#        self.max_token_len = max_token_len
#
#    def __len__(self):
#        return len(self.data)
#
#    def __getitem__(self, index):
#        text = self.data.iloc[index]["text"]
#        label = self.data.iloc[index]["label"]
#        encoding = self.tokenizer.encode_plus(
#            text,
#            add_special_tokens=True,
#            max_length=self.max_token_len,
#            return_token_type_ids=False,
#            padding="max_length",
#            truncation=True,
#            return_attention_mask=True,
#            return_tensors="pt",
#        )
#        return {
#            "input_ids": encoding["input_ids"].flatten(),
#            "attention_mask": encoding["attention_mask"].flatten(),
#            "labels": torch.tensor(label, dtype=torch.long),
#        }
#
#
#class HeadlinesSarcasmDataModule(pl.LightningDataModule):
#    def __init__(self, train_data, val_data, test_data, batch_size=16):
#        super().__init__()
#        self.train_data = train_data
#        self.val_data = val_data
#        self.test_data = test_data
#        self.batch_size = batch_size
#
#    def setup(self, stage=None):
#        self.train_dataset = HeadlinesSarcasmDataset(self.train_data)
#        self.val_dataset = HeadlinesSarcasmDataset(self.val_data)
#        self.test_dataset = HeadlinesSarcasmDataset(self.test_data)
#
#    def train_dataloader(self):
#        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)
#
#    def val_dataloader(self):
#        return DataLoader(self.val_dataset, batch_size=self.batch_size)
#
#    def test_dataloader(self):
#        return DataLoader(self.test_dataset, batch_size=self.batch_size)



In [None]:
import torch
import torch.nn.functional as F
import pytorch_lightning as pl
from transformers import (
    BertModel,
    AdamW,
    logging,
    get_linear_schedule_with_warmup,
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import sys
# Append the directory to your python path using sys
sys.path.append('/content/drive/MyDrive/context_based/bert/')
from pytorch_lightning import Trainer

In [None]:
# Fine-tune for one epoch, then evaluate on the held-out test split.
BATCH_SIZE = 16  # single source of truth for the loaders and the step count
SEED = 42
n_epochs = 1

pl.seed_everything(SEED, workers=True)  # reproducible head init and shuffling
logging.set_verbosity_error()

train, val, test = load_json_dataset("drive/MyDrive/data/twitter/ts_dataset/training.jsonl")
# The training DataLoader keeps the final partial batch, so round up.
steps_per_epoch = (len(train) + BATCH_SIZE - 1) // BATCH_SIZE

classifier = HeadlinesSarcasmClassifier(
    steps_per_epoch=steps_per_epoch, n_epochs=n_epochs
)
data_module = HeadlinesSarcasmDataModule(train, val, test, batch_size=BATCH_SIZE)
trainer = Trainer(max_epochs=n_epochs, accelerator="auto")
trainer.fit(model=classifier, datamodule=data_module)
trainer.test(model=classifier, datamodule=data_module)



INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 109 M 
1 | classifier | Linear    | 1.5 K 
-----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.4271641969680786,
  'test_acc': 0.794,
  'test_precision': 0.8319032467532468,
  'test_recall': 0.794,
  'test_f1': 0.7928153427583767}]

In [None]:
# Fine-tune for three epochs, then evaluate on the held-out test split.
BATCH_SIZE = 16  # single source of truth for the loaders and the step count
SEED = 42
n_epochs = 3

pl.seed_everything(SEED, workers=True)  # reproducible head init and shuffling
logging.set_verbosity_error()

train, val, test = load_json_dataset("drive/MyDrive/data/twitter/ts_dataset/training.jsonl")
# The training DataLoader keeps the final partial batch, so round up.
steps_per_epoch = (len(train) + BATCH_SIZE - 1) // BATCH_SIZE

classifier = HeadlinesSarcasmClassifier(
    steps_per_epoch=steps_per_epoch, n_epochs=n_epochs
)
data_module = HeadlinesSarcasmDataModule(train, val, test, batch_size=BATCH_SIZE)
trainer = Trainer(max_epochs=n_epochs, accelerator="auto", logger=True)
trainer.fit(model=classifier, datamodule=data_module)
trainer.test(model=classifier, datamodule=data_module)

In [None]:
# Fine-tune for five epochs, then evaluate on the held-out test split.
BATCH_SIZE = 16  # single source of truth for the loaders and the step count
SEED = 42
n_epochs = 5

pl.seed_everything(SEED, workers=True)  # reproducible head init and shuffling
logging.set_verbosity_error()

train, val, test = load_json_dataset("drive/MyDrive/data/twitter/ts_dataset/training.jsonl")
# The training DataLoader keeps the final partial batch, so round up.
steps_per_epoch = (len(train) + BATCH_SIZE - 1) // BATCH_SIZE

classifier = HeadlinesSarcasmClassifier(
    steps_per_epoch=steps_per_epoch, n_epochs=n_epochs
)
data_module = HeadlinesSarcasmDataModule(train, val, test, batch_size=BATCH_SIZE)
trainer = Trainer(max_epochs=n_epochs, accelerator="auto", logger=True)
trainer.fit(model=classifier, datamodule=data_module)
trainer.test(model=classifier, datamodule=data_module)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type      | Params
-----------------------------------------
0 | bert       | BertModel | 109 M 
1 | classifier | Linear    | 1.5 K 
-----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/trainer/connectors/data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=1` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.764754056930542,
  'test_acc': 0.818,
  'test_precision': 0.8489499278499281,
  'test_recall': 0.818,
  'test_f1': 0.8205677752946566}]