In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from collections import defaultdict
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, KFold
from transformers import AutoTokenizer, AutoModel, AdamW

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip install --upgrade wandb
!pip install transformers

In [None]:
import wandb
wandb.login(key="3b35d1d20a61e201f1d0a77448c48771b233d70c")
os.environ["WANDB_API_KEY"] = "3b35d1d20a61e201f1d0a77448c48771b233d70c"
anony = None

In [None]:
NAME = "Siamese_roberta"

CONFIG = {"seed": 2021,
          "epochs": 3,
          "model_name": "roberta-base",
          "train_batch_size": 32,
          "valid_batch_size": 64,
          "max_length": 128,
          "learning_rate": 1e-4,
          "scheduler": 'CosineAnnealingLR',
          "min_lr": 1e-6,
          "T_max": 500,
          "weight_decay": 1e-6,
          "n_fold": 5,
          "n_accumulate": 1,
          "num_classes": 1,
          "margin": 0.5,
          "device": "cpu",
          "NAME": NAME
          }

CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG['model_name'])
CONFIG['group'] = f'{NAME}-Baseline'

np.random.seed(CONFIG['seed'])

In [None]:
df = pd.read_csv("train.En.csv")
df=df.loc[df['sarcastic'] == 1]
df=df[['tweet','rephrase','sarcastic']]
df.head()

Unnamed: 0,tweet,rephrase,sarcastic
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...",1
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,1
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...",1
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...",1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,1


In [None]:
def gen_folds(train):
    skf = StratifiedKFold(n_splits=CONFIG['n_fold'], shuffle=True, random_state=CONFIG['seed'])

    for fold, ( _, val_) in enumerate(skf.split(X=train, y=train.sarcastic)):
        train.loc[val_ , "kfold"] = int(fold)
        
    train["kfold"] = train["kfold"].astype(int)
    return train

In [None]:
train, validate, test = np.split(df.sample(frac=1, random_state=42), [int(.6*len(df)), int(.8*len(df))])
train=pd.concat([train, validate], ignore_index=True)

In [None]:
train = gen_folds(train)
train.head()

Unnamed: 0,tweet,rephrase,sarcastic,kfold
0,All the shade i have been hearing about Ben Pl...,Dear Evan Hanson cast the same type of older a...,1,4
1,you would think the odds of me sitting next to...,I can't believe the same man ended up sitting ...,1,2
2,Billy Gunn sorta relevant for the first time i...,Billy Gunn is actually in the spotlight now.,1,4
3,"I see Dettol, Toilet Duck and Zoflora are all ...",Donald Trump asked the Chief Medical Officer i...,1,2
4,@catboychika Aren't you happy I gave you less ...,Sorry for putting off the event for so long TT,1,2


In [None]:
class SarcasmDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.more_toxic = df['tweet'].values
        self.less_toxic = df['rephrase'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        more_toxic = self.more_toxic[index]
        less_toxic = self.less_toxic[index]
        inputs_less_toxic = self.tokenizer.encode_plus(
                                less_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        inputs_more_toxic = self.tokenizer.encode_plus(
                                more_toxic,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
        
        target = 1

        less_toxic_ids = inputs_less_toxic['input_ids']
        less_toxic_mask = inputs_less_toxic['attention_mask']
        
        more_toxic_ids = inputs_more_toxic['input_ids']
        more_toxic_mask = inputs_more_toxic['attention_mask']
        
        return {
            'more_toxic_ids': torch.tensor(more_toxic_ids, dtype=torch.long),
            'more_toxic_mask': torch.tensor(more_toxic_mask, dtype=torch.long),
            'less_toxic_ids': torch.tensor(less_toxic_ids, dtype=torch.long),
            'less_toxic_mask': torch.tensor(less_toxic_mask, dtype=torch.long),
            'target': torch.tensor(target, dtype=torch.long)
        }

In [None]:
class SarcasmClassifer(nn.Module):
    def __init__(self, model_name):
        super(SarcasmClassifer, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.fc = nn.Linear(768, CONFIG['num_classes'])
        
    def forward(self, ids, mask):        
        out = self.model(input_ids=ids,attention_mask=mask,
                         output_hidden_states=False)
        out = self.drop(out[1])
        outputs = self.fc(out)
        return outputs

In [None]:
def criterion(outputs1, outputs2, targets):
    outputs1 = outputs1.squeeze(dim=1)
    outputs2 = outputs2.squeeze(dim=1)
    return nn.MarginRankingLoss(margin=CONFIG['margin'])(outputs1, outputs2, targets)

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        loss = loss / CONFIG['n_accumulate']
        loss.backward()
    
        if (step + 1) % CONFIG['n_accumulate'] == 0:
            optimizer.step()
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()
                
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    
    return epoch_loss

In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    
    dataset_size = 0
    running_loss = 0.0
    
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:        
        more_toxic_ids = data['more_toxic_ids'].to(device, dtype = torch.long)
        more_toxic_mask = data['more_toxic_mask'].to(device, dtype = torch.long)
        less_toxic_ids = data['less_toxic_ids'].to(device, dtype = torch.long)
        less_toxic_mask = data['less_toxic_mask'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        
        batch_size = more_toxic_ids.size(0)

        more_toxic_outputs = model(more_toxic_ids, more_toxic_mask)
        less_toxic_outputs = model(less_toxic_ids, less_toxic_mask)
        
        loss = criterion(more_toxic_outputs, less_toxic_outputs, targets)
        
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        
        epoch_loss = running_loss / dataset_size
        
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss,
                        LR=optimizer.param_groups[0]['lr'])   
    
    gc.collect()
    
    return epoch_loss

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    wandb.watch(model, log_freq=100)
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)
    for epoch in range(1, num_epochs + 1): 
        gc.collect()
        val_epoch_loss = valid_one_epoch(model, valid_loader, device=CONFIG['device'], 
                                         epoch=epoch)
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, 
                                           dataloader=train_loader, 
                                           device=CONFIG['device'], epoch=epoch)
    
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        
        # Log the metrics
        wandb.log({"Train Loss": train_epoch_loss})
        wandb.log({"Valid Loss": val_epoch_loss})
        
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), "roberta.pth")
            print(f"Model Saved")
            
        print()
    
    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))
    
    # load best model weights
    model.load_state_dict(best_model_wts)
    
    return model, history

In [None]:
def prepare_loaders(fold):
    df_train = train[train.kfold != fold].reset_index(drop=True)
    df_valid = train[train.kfold == fold].reset_index(drop=True)
    valid_dataset = SarcasmDataset(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    train_dataset = SarcasmDataset(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], 
                              num_workers=2, shuffle=False, pin_memory=True)
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], 
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    return train_loader, valid_loader

In [None]:
for fold in range(0, CONFIG['n_fold']):
    print(f"====== Fold: {fold} ======")
    run = wandb.init(project='Jigsaw', 
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=['roberta-base', f'{NAME}', 'margin-loss'],
                     name=f'{NAME}-fold-{fold}',
                     anonymous='must')
    train_loader, valid_loader = prepare_loaders(fold=fold)
    model = SarcasmClassifer(CONFIG['model_name'])
    model.to(CONFIG['device'])

    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])    
    model, history = run_training(model, optimizer, scheduler,
                                  device=CONFIG['device'],
                                  num_epochs=CONFIG['epochs'],
                                  fold=fold)
    
    run.finish()
    del model, history, train_loader, valid_loader
    gc.collect()
    print()

In [62]:
test = pd.read_csv("task_C_En_test.csv")
print(test['sarcastic_id'].value_counts())

sarcastic_id
0    107
1     93
Name: count, dtype: int64


In [None]:
test = pd.read_csv("task_C_En_test.csv")
test=test.loc[test['sarcastic_id'] == 0]
test=test[['text_0','text_1','sarcastic_id']]

new_column_names = {'text_0': 'tweet', 'text_1': 'rephrase', 'sarcastic_id': 'sarcastic'}
test = test.rename(columns=new_column_names)

In [None]:
print(test.head())

                                               tweet  \
0           I see that your team played well today!    
1  Anthony Taylor is such a fair referee, I wish ...   
4  Really great weather we're having, love a bit ...   
6  So happy my boyfriend had such a good night la...   
8  do love third year uni winter break, three thi...   

                                            rephrase  sarcastic  
0     I'm sorry that your team didn't win yesterday.          0  
1  I hope Anthony Taylor is never put in charge o...          0  
4  Really cold January so far - looking forward t...          0  
6    Boyfriend went out last night and had some fun!          0  
8  why do we have to have such a work filled wint...          0  


In [None]:
test.dropna(inplace=True)

In [None]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    model.eval()
    dataset_size = 0
    running_loss = 0.0
    PREDS = []
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['text_ids'].to(device, dtype = torch.long)
        mask = data['text_mask'].to(device, dtype = torch.long)
        outputs = model(ids, mask)
        sig=nn.Sigmoid()
        outputs=sig(outputs)
        PREDS.append(outputs.detach().cpu().numpy()) 
    PREDS = np.concatenate(PREDS)
    gc.collect()
    return PREDS

In [None]:
def inference(dataloader, device):
    final_preds = []
    model = SarcasmClassifer('roberta-base')
    model.to(CONFIG['device'])
    model.load_state_dict(torch.load("roberta.pth"))
    preds = valid_fn(model, dataloader, device)
    final_preds.append(preds)
    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    final_preds[final_preds>=0.5] = 1
    final_preds[final_preds<0.5] = 0
    return final_preds

In [None]:
class SarcasmDatasetTest(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.text = df['text'].values
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        text = self.text[index]
        inputs = self.tokenizer.encode_plus(
                                text,
                                truncation=True,
                                add_special_tokens=True,
                                max_length=self.max_len,
                                padding='max_length'
                            )
       
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
                
        return {
            'text_ids': torch.tensor(ids, dtype=torch.long),
            'text_mask': torch.tensor(mask, dtype=torch.long),
           
        }

In [None]:
test_final

Unnamed: 0,text,label
0,I'm sorry that your team didn't win yesterday.,0
1,I hope Anthony Taylor is never put in charge o...,0
4,Really cold January so far - looking forward t...,0
6,Boyfriend went out last night and had some fun!,0
8,why do we have to have such a work filled wint...,0
...,...,...
193,"Ah, we really needed that vaccine booster addr...",1
195,"the tories betrayed the nation, what a surprise!",1
197,Isn't it just amazing how competent the govern...,1
198,Thanks Boris Johnson for restricting travel ab...,1


In [None]:
test_dataset = SarcasmDatasetTest(test_final, tokenizer=CONFIG["tokenizer"], max_length=CONFIG['max_length'])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'], 
                              num_workers=2, shuffle=False, pin_memory=True)

In [None]:
preds = inference(test_loader, CONFIG['device'])

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 4/4 [00:06<00:00,  1.55s/it]


In [None]:
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,classification_report

def print_statistics(y, y_pred):
    accuracy = accuracy_score(y, y_pred)
    precision =precision_score(y, y_pred, average='weighted')
    recall = recall_score(y, y_pred, average='weighted')
    f_score = f1_score(y, y_pred, average='weighted')
    print('Accuracy: %.3f\nPrecision: %.3f\nRecall: %.3f\nF_score: %.3f\n'
          % (accuracy, precision, recall, f_score))
    print(classification_report(y, y_pred))
    return accuracy, precision, recall, f_score

In [None]:
print(print_statistics(test_final['label'],preds))

Accuracy: 0.741
Precision: 0.819
Recall: 0.741
F_score: 0.725

              precision    recall  f1-score   support

           0       0.98      0.49      0.66       174
           1       0.66      0.99      0.79       174

    accuracy                           0.74       348
   macro avg       0.82      0.74      0.72       348
weighted avg       0.82      0.74      0.72       348

(0.7413793103448276, 0.8194055944055944, 0.7413793103448276, 0.7245576388644597)


In [None]:
print(print_statistics(test_final['label'],preds))

Accuracy: 0.478
Precision: 0.444
Recall: 0.478
F_score: 0.384

              precision    recall  f1-score   support

           0       0.40      0.09      0.14        93
           1       0.49      0.87      0.63        93

    accuracy                           0.48       186
   macro avg       0.44      0.48      0.38       186
weighted avg       0.44      0.48      0.38       186

(0.478494623655914, 0.4439759036144579, 0.478494623655914, 0.38353777291830393)


In [None]:
print(print_statistics(test_final['label'],preds))

Accuracy: 0.593
Precision: 0.735
Recall: 0.593
F_score: 0.521

              precision    recall  f1-score   support

           0       0.92      0.21      0.34       107
           1       0.55      0.98      0.71       107

    accuracy                           0.59       214
   macro avg       0.73      0.59      0.52       214
weighted avg       0.73      0.59      0.52       214

(0.5934579439252337, 0.7346491228070176, 0.5934579439252337, 0.5214742848330636)
