In [None]:
# !pip install numpy requests nlpaug
# !pip install tensformers
# !pip install evaluate
# !pip install tensorboard
# !pip install accelerate -U
# !pip uninstall pillow
# !pip install pillow==9.4.0
import pandas as pd
import numpy as np
from tqdm import tqdm
from transformers import BertForSequenceClassification, AdamW, BertConfig
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

from transformers import AutoTokenizer
import evaluate
from sklearn.model_selection import train_test_split
import os
import shutil

from transformers import  DataCollatorWithPadding
from datasets import  load_metric
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import BertTokenizer,BertForSequenceClassification
import time
import datetime
import random
from transformers import get_linear_schedule_with_warmup


In [None]:
torch.__version__
device = "cuda" if torch.cuda.is_available() else "cpu"
device

In [None]:



df=pd.read_csv('../train-balanced-sarcasm.csv')
labels=df["label"].to_list()
comments=df["comment"].astype(str).to_list()
df2=pd.DataFrame({"comments":comments,"labels":labels})
df2_short={"text":comments,"labels":labels}

In [None]:

# x_train, x_test_valid, y_train, y_test_valid = train_test_split(comments, labels, test_size=0.33, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(comments, labels, test_size=0.3, random_state=42)

# x_test, x_valid, y_test, y_valid = train_test_split(x_test_valid, y_test_valid, test_size=0.5, random_state=42)

In [None]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [None]:



tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2, 
)
# model = AutoModelForSequenceClassification.from_pretrained(

#     "distilbert-base-uncased", num_labels=2

# )
# model = BertForSequenceClassification.from_pretrained(
#     "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
#     num_labels = 2, # The number of output labels--2 for binary classification.
#                     # You can increase this for multi-class tasks.   
#     output_attentions = False, # Whether the model returns attentions weights.
#     output_hidden_states = False, # Whether the model returns all hidden-states.
# )
model.to(device)
     
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [64]:
max_len = 0

# For every sentence...
for sent in x_train:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (962 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  9827


In [None]:


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


In [None]:

def format_time(elapsed):
   
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))



In [62]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self,encodings,labels):
      self.encodings=encodings
      self.labels=labels   
    def __len__(self):
        return len(self.encodings)    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

 
max_length = 256                     
train_encodings = tokenizer(x_train, truncation=True, padding=True,   add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_length, 
                        return_attention_mask = True,   
                        return_tensors = 'pt')
val_encodings = tokenizer(x_valid, truncation=True, padding=True,   add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_length,           
                        return_attention_mask = True,   
                        return_tensors = 'pt',)


train_dataset=CustomDataset(train_encodings,labels=y_train)
eval_dataset=CustomDataset(val_encodings,labels=y_valid)

train_dataloader=torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
eval_dataloader=torch.utils.data.DataLoader(eval_dataset)


In [None]:
# next(iter(train_dataloader))
#

In [None]:

accuracy = evaluate.load("accuracy")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
metric = load_metric("accuracy")

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    print(eval_pred)
    predictions = np.argmax(logits, axis=-1)
    return clf_metrics.compute(predictions=predictions, references=labels)

In [None]:


epochs = 6

total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


In [63]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

total_t0 = time.time()

for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # print(vars(batch))

        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

      
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)

        
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        
       
        output = model(b_input_ids,
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # print(vars(output))
        loss=output.loss
        total_train_loss += loss.item()

    
        loss.backward()
 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)


        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in eval_dataloader:
        b_input_ids = batch['input_ids'].to(device)
        b_input_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        
   
        with torch.no_grad():        
            # (loss, logits) = model(b_input_ids,
            #                        attention_mask=b_input_mask,
            #                        labels=b_labels)
            output = model(b_input_ids,
                             attention_mask=b_input_mask, 
                             labels=b_labels)

        # print(vars(output))
        loss=output.loss    
        logits=output.logits
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(eval_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(eval_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.34
  Training epcoh took: 0:00:00

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.75
  Validation took: 0:00:00
Training...


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}



  Average training loss: 0.32
  Training epcoh took: 0:00:00

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.75
  Validation took: 0:00:00
Training...

  Average training loss: 0.35
  Training epcoh took: 0:00:00

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.75
  Validation took: 0:00:00
Training...

  Average training loss: 0.36
  Training epcoh took: 0:00:00

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.75
  Validation took: 0:00:00
Training...

  Average training loss: 0.31
  Training epcoh took: 0:00:00

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.75
  Validation took: 0:00:00
Training...

  Average training loss: 0.28
  Training epcoh took: 0:00:00

Running Validation...
  Accuracy: 0.00
  Validation Loss: 0.75
  Validation took: 0:00:00

Training complete!
Total training took 0:00:01 (h:mm:ss)


In [None]:
# training_args = TrainingArguments(
#     output_dir="my_model",
#     learning_rate=2e-5,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1,
#     num_train_epochs=10,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     push_to_hub=False,
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     tokenizer=tokenizer,
#     # data_collator=data_collator,
#     compute_metrics=compute_metrics,
    
# )

# trainer.train()

# trainer.evaluate()