<a href="https://colab.research.google.com/github/ikram2500/pytorch/blob/main/text_classification_sarcasm_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers --quiet
!pip install opendatasets --quiet

import opendatasets as od
od.download("https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection")

Skipping, found downloaded files in "./news-headlines-dataset-for-sarcasm-detection" (use force=True to force download)


In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print("availabel device: ", device)

availabel device:  cuda


In [None]:
# Load the JSON-lines dataset from the folder od.download() wrote into the
# working directory (a relative path, so the notebook is not tied to /content).
# Cleaning order is deliberate: duplicates are detected across all columns,
# including article_link, before that column is dropped.
data_df = (
    pd.read_json(
        "./news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json",
        lines=True,
    )
    .dropna()
    .drop_duplicates()
    .drop(columns=["article_link"])
)
print(data_df.shape)
data_df.head()

(26708, 2)


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [None]:
# Fixed seed so the same rows end up in train / validation / test on every run.
SPLIT_SEED = 42

# Hold out 30% of the rows, then cut that hold-out in half: 70% train, 15% val, 15% test.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(data_df['headline']),
    np.array(data_df['is_sarcastic']),
    test_size=0.3,
    random_state=SPLIT_SEED,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=SPLIT_SEED
)

# Report the size of each split and its share of the cleaned dataset.
for split_name, features, targets in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
  print(f"{split_name}: ", features.shape, targets.shape, "rows which is : ", round(features.shape[0]/data_df.shape[0], 4) * 100, "%")

train:  (18695,) (18695,) rows which is :  70.0 %
val:  (4006,) (4006,) rows which is :  15.0 %
test:  (4007,) (4007,) rows which is :  15.0 %


In [None]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
class dataset(Dataset):
  """Headlines tokenized once at construction time, paired with float labels.

  Uses the notebook-level `tokenizer` and `device`. Every headline is encoded
  on its own (truncated / padded to 100 tokens) and moved to `device`; the
  labels are stored as a single float32 tensor on the same device.

  Each headline is tokenized individually with return_tensors="pt"; the
  training loop squeezes dim 1 after batching, which suggests every encoding
  keeps a leading size-1 dimension -- TODO confirm against the tokenizer.
  """

  def __init__(self, X,Y):
    def encode(headline):
      # One tokenizer call per headline, result moved to the target device.
      return tokenizer(
          headline,
          max_length=100,
          truncation=True,
          padding="max_length",
          return_tensors="pt",
      ).to(device)

    self.X = [encode(headline) for headline in X]
    self.Y = torch.tensor(Y, dtype=torch.float32).to(device)

  def __len__(self):
    """Number of encoded headlines."""
    return len(self.X)

  def __getitem__(self, idx):
    """Return the (encoding, label) pair stored at position `idx`."""
    encoding = self.X[idx]
    label = self.Y[idx]
    return encoding, label

# Tokenize each split once, up front, producing one dataset object per split.
train_dataset, val_dataset, test_dataset = (
    dataset(headlines, targets)
    for headlines, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)


In [None]:
# Training hyperparameters used by the data loaders, the training loop and the optimizer.
BATCH_SIZE, EPOCHS = 32, 10  # samples per mini-batch; passes over the training loader
LR = 1e-4                    # learning rate handed to Adam

In [None]:
# Only the training loader shuffles. Validation and test batches are served in
# dataset order so evaluation is deterministic and predictions stay aligned
# with the underlying samples.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
class MyModel(nn.Module):
  """Binary classifier: an encoder followed by a two-layer dense head.

  Args:
    bert: encoder module, called as bert(input_ids, attention_mask,
      return_dict=False). The first element of its tuple output must be
      indexable as [:, 0] and yield 768 features per sample.

  forward() returns one sigmoid probability per sample, shape (batch, 1).

  NOTE(review): there is no activation between linear1 and linear2, so with
  dropout disabled the head reduces to a single affine map -- confirm this is
  intended.
  """

  def __init__(self, bert):
    super().__init__()

    self.bert = bert
    self.dropout = nn.Dropout(0.25)
    self.linear1 = nn.Linear(768,384)
    self.linear2 = nn.Linear(384, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    encoder_outputs = self.bert(input_ids, attention_mask, return_dict=False)
    # First tuple element, first position along dim 1. For a Hugging Face
    # BertModel this is presumably the [CLS] hidden state (not the pooler
    # output) -- verify against the encoder in use.
    first_token_state = encoder_outputs[0][:, 0]
    hidden = self.dropout(self.linear1(first_token_state))
    return self.sigmoid(self.linear2(hidden))

In [None]:
# Turn off gradients for every encoder parameter; the layers MyModel adds on
# top keep their default requires_grad=True.
bert_model.requires_grad_(False)
model = MyModel(bert_model).to(device)

In [None]:
# Bare last expression: the notebook displays the repr (module tree) of the assembled classifier.
model

MyModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
# Adam over the model's parameters with the configured learning rate, and
# binary cross-entropy computed on the probabilities MyModel's sigmoid emits.
optimizer = Adam(params=model.parameters(), lr=LR)
criterion = nn.BCELoss()

In [None]:
# Per-epoch history for plotting: one value is appended to each list at the
# end of every epoch.
total_loss_train_plot = []
total_loss_validation_plot = []
total_acc_train_plot = []
total_acc_validation_plot = []

for epoch in range(EPOCHS):
  total_acc_train = 0
  total_loss_train = 0
  total_acc_val = 0
  total_loss_val = 0

  # ---- training pass ----
  # Enable dropout in the classification head, but keep the frozen encoder in
  # eval mode so its own dropout layers stay disabled (Hugging Face
  # from_pretrained returns the model in eval mode).
  model.train()
  model.bert.eval()
  for inputs, labels in train_loader:
    # Tensor.to() returns the moved object, so the result has to be kept.
    inputs = inputs.to(device)
    labels = labels.to(device)

    # Each sample carries a size-1 dim at position 1 after batching; squeeze it
    # before the encoder, and squeeze the (batch, 1) output to (batch,).
    prediction = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
    batch_loss = criterion(prediction, labels)
    total_loss_train += batch_loss.item()
    # Number of correct predictions at a 0.5 threshold.
    total_acc_train += (prediction.round() == labels).sum().item()

    # Clear gradients first so nothing left over from an interrupted run leaks into this step.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

  # ---- validation pass ----
  # eval() disables dropout so validation metrics are deterministic.
  model.eval()
  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs = inputs.to(device)
      labels = labels.to(device)

      prediction = model(inputs["input_ids"].squeeze(1), inputs["attention_mask"].squeeze(1)).squeeze(1)
      batch_loss = criterion(prediction, labels)
      total_loss_val += batch_loss.item()
      total_acc_val += (prediction.round() == labels).sum().item()

  # ---- epoch summary ----
  # Mean loss per batch, so train and validation losses share a scale even
  # though the two loaders have different numbers of batches.
  train_loss = round(total_loss_train / max(len(train_loader), 1), 4)
  val_loss = round(total_loss_val / max(len(val_loader), 1), 4)
  # Accuracy as a percentage of the samples in each split.
  train_acc = round((total_acc_train / len(train_dataset)) * 100, 4)
  val_acc = round((total_acc_val / len(val_dataset)) * 100, 4)

  total_loss_train_plot.append(train_loss)
  total_loss_validation_plot.append(val_loss)
  total_acc_train_plot.append(train_acc)
  total_acc_validation_plot.append(val_acc)

  print(f"""
Epoch No. {epoch+1} Train Loss: {train_loss} Train Accuracy: {train_acc}
      Validation Loss: {val_loss} Validation Accuracy: {val_acc}
""")


Epoch No. 10 Train Loss: 0.1785 Train Accuracy: 87.034
      Validation Loss: 0.042 Validation Accuracy: 86.2706

