In [None]:
# Mount Google Drive at /gdrive (Colab-only); the working directory used later
# in this notebook lives under this mount point.
from google.colab import drive
drive.mount('/gdrive')

In [None]:
# Third-party dependencies installed quietly (-q) into the runtime.
# NOTE(review): versions are not pinned, so a re-run may pull a different
# `transformers` release than the one this notebook was written against —
# consider pinning once the working versions are confirmed.
! pip install transformers -q
! pip install emoji --upgrade -q
! pip install Unidecode -q

In [None]:
import re
import pandas as pd
from pathlib import Path
import matplotlib.cm as cm
import numpy as np
import pandas as pd
from typing import *
from tqdm.notebook import tqdm
from sklearn.utils.extmath import softmax
from sklearn import model_selection
from sklearn.metrics import classification_report, f1_score

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import transformers
from transformers import AdamW

In [None]:
import emoji
import regex

In [None]:
def seed_all(seed = 42):
  """
  Fix seed for reproducibility

  Seeds the Python, NumPy and PyTorch (CPU and, when available, CUDA) random
  number generators and puts cuDNN into deterministic mode.

  Args:
    seed: integer seed applied to every generator.
  """
  # python RNG (`random` is not imported at module level in this notebook)
  import random
  random.seed(seed)

  # numpy RNG
  np.random.seed(seed)

  # pytorch RNGs
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

  # cuDNN: force deterministic kernels and disable the autotuner, which may
  # otherwise pick different algorithms from one run to the next.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

In [None]:
# Switch to the project folder on the mounted Drive so that the relative paths
# used below (data files, SAVE_DIR) resolve inside it.
import os
os.chdir('/gdrive/My Drive/COVID_Tweet')

In [None]:
class config:
  """Run-wide settings, read as class attributes by the cells below."""
  SEED = 42    # passed to StratifiedKFold in run_fold
  KFOLD = 5    # number of cross-validation folds
  SAVE_DIR = 'run_roberta_large_oof_fix_adv'    # checkpoints, OOF file and submission are written here
  TRAIN_FILE = 'train_fix.csv'
  VAL_FILE =  'valid_fix.csv'
  TEST_FILE = 'test_fix.csv'
  OOF_FILE = os.path.join(SAVE_DIR, 'oof.csv')    # out-of-fold predictions, updated by each run_fold call
  MAX_LEN = 96    # every example is padded / truncated to this many token ids
  MODEL = 'roberta-large'
  # Loaded once, when the class body executes.
  # NOTE(review): presumably downloads the tokenizer files on first use — confirm cache/network setup.
  TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL)
  EPOCHS = 5    # upper bound; early stopping in run() may end training sooner
  TRAIN_BATCH_SIZE = 16
  VALID_BATCH_SIZE = 16
  DEVICE = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")

In [None]:
class AverageMeter:
    """
    Running weighted average of a scalar (e.g. a per-batch loss).
    Source : https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch/

    Attributes:
        val:   the most recent value passed to ``update``.
        sum:   weighted sum of every value seen since the last reset.
        count: total weight accumulated since the last reset.
        avg:   ``sum / count``.
    """
    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum, self.count = self.sum + val * n, self.count + n
        self.avg = self.sum / self.count

In [None]:
class EarlyStopping:
    """
    Early stopping utility
    Source : https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch/

    Call the instance once per epoch with the validation score. The model
    weights are saved whenever the score improves by at least ``delta``;
    ``early_stop`` becomes True after ``patience`` consecutive calls without
    such an improvement.

    Args:
        patience: number of non-improving calls tolerated before stopping.
        mode: "max" if a larger score is better, "min" if a smaller one is.
        delta: minimum change that counts as an improvement.
    """

    def __init__(self, patience=7, mode="max", delta=0.001):
        self.patience = patience
        self.counter = 0
        self.mode = mode
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        # `np.inf` rather than the `np.Inf` alias, which NumPy 2.0 removed.
        self.val_score = np.inf if self.mode == "min" else -np.inf

    def __call__(self, epoch_score, model, model_path):
        """Register one epoch's score, checkpointing to ``model_path`` on improvement."""
        # Internally "higher is better": negate the score in "min" mode.
        if self.mode == "min":
            score = -1.0 * epoch_score
        else:
            score = np.copy(epoch_score)
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print('EarlyStopping counter: {} out of {}'.format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(epoch_score, model, model_path)
            self.counter = 0

    def save_checkpoint(self, epoch_score, model, model_path):
        """Save ``model.state_dict()`` to ``model_path`` unless the score is inf/NaN."""
        # A membership test against a list containing np.nan cannot detect a
        # freshly computed NaN (NaN != NaN), so test finiteness explicitly.
        if np.isfinite(epoch_score):
            print('Validation score improved ({} --> {}). Saving model!'.format(self.val_score, epoch_score))
            torch.save(model.state_dict(), model_path)
        self.val_score = epoch_score

In [None]:
import html
import unicodedata
import unidecode

# One or more consecutive carriage returns, newlines or tabs.
control_char_regex = re.compile(r'[\r\n\t]+')
# str.translate table mapping typographic quotes / dashes to their ASCII forms.
transl_table = str.maketrans("‘’´“”–-", "'''\"\"--")

In [None]:
def preprocess(text):
  """
  Normalise a raw tweet before tokenisation.

  Steps, in order: unescape HTML entities, map typographic quotes/dashes to
  ASCII, expand the ellipsis character, replace CR/LF/TAB runs with a space,
  drop Unicode category-C (control/format/...) characters, collapse
  whitespace, rename the 'HTTPURL' placeholder to 'URL', turn emoji into their
  textual aliases, transliterate to ASCII and drop any 'So' (other symbol)
  characters that remain.

  Args:
    text: the raw text (str).

  Returns:
    The cleaned text (str).
  """
  text = html.unescape(text)
  text = text.translate(transl_table)
  text = text.replace('…', '...')
  text = re.sub(control_char_regex, ' ', text)
  text = ''.join(ch for ch in text if unicodedata.category(ch)[0] != 'C')
  # split()/join collapses whitespace runs and also trims both ends
  text = ' '.join(text.split())

  text = text.replace('HTTPURL', 'URL')
  text = emoji.demojize(text)

  text = unidecode.unidecode(text)
  # Compare the full two-letter category: the previous `category(ch)[0] != 'So'`
  # compared a single character with 'So', which is always true, so nothing was
  # ever filtered.
  text = ''.join(ch for ch in text if unicodedata.category(ch) != 'So')

  return text

In [None]:
def process_data(text, tokenizer, max_len, label):
  """
  Turn one (text, label) pair into fixed-length model inputs.

  Args:
    text: raw text; it is cleaned with `preprocess` first.
    tokenizer: object exposing `encode(text, add_special_tokens=True)` that
      returns a list of token ids; its `pad_token_id` is used for padding when
      it is available.
    max_len: exact length of the returned `ids` / `mask` lists.
    label: 'INFORMATIVE' maps to 1, anything else to 0.

  Returns:
    dict with keys 'text' (cleaned text), 'ids' (token ids), 'mask'
    (1 for real tokens, 0 for padding) and 'label' (0 or 1).
  """
  text = preprocess(text)

  token_ids = tokenizer.encode(text, add_special_tokens=True)

  if len(token_ids) > max_len:
    if max_len >= 2:
      # Keep the closing special token instead of slicing it off together
      # with the overflow.
      token_ids = token_ids[:max_len - 1] + token_ids[-1:]
    else:
      token_ids = token_ids[:max_len]

  mask = [1] * len(token_ids)

  # Pad with the tokenizer's own padding id rather than a hard-coded 0, which
  # is not the padding id for every vocabulary; fall back to 0 when the
  # tokenizer does not define one.
  pad_id = getattr(tokenizer, 'pad_token_id', None)
  if pad_id is None:
    pad_id = 0

  padding = max_len - len(token_ids)
  token_ids = token_ids + ([pad_id] * padding)
  mask = mask + ([0] * padding)

  label = 1 if label=='INFORMATIVE' else 0

  assert len(token_ids)==max_len
  assert len(mask)==max_len

  return {'text':text,
          'ids':token_ids,
          'mask':mask,
          'label':label
          }

In [None]:
class Dataset:
    """
    Indexable collection of (text, label) pairs that yields model-ready samples.

    Tokenizer and sequence length are taken from `config`; each item is
    encoded on access via `process_data`.
    """

    def __init__(self, text, label):
        self.text, self.label = text, label
        self.tokenizer = config.TOKENIZER
        self.max_len = config.MAX_LEN

    def __len__(self):
        return len(self.text)

    def __getitem__(self, item):
        """Return the encoded sample at position `item` as a dict."""
        sample = process_data(self.text[item], self.tokenizer, self.max_len, self.label[item])

        ids = torch.tensor(sample["ids"], dtype=torch.long)
        mask = torch.tensor(sample["mask"], dtype=torch.long)

        return dict(ids=ids, mask=mask, text=sample['text'], label=sample['label'])

In [None]:
class FGM():
    """
        Utility for Adversarial Training
        Source: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/143764

        Usage per batch: run backward on the clean loss, call `attack()` to
        perturb the embedding weights along their gradient, run forward and
        backward again to accumulate the adversarial gradient, then call
        `restore()` before the optimizer step.
    """

    def __init__(self, model):
        self.model = model
        # parameter name -> copy of its weights taken before the perturbation
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        """Add `epsilon * grad / ||grad||` to every trainable parameter whose name contains `emb_name`."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                # No gradient yet (backward not run, or parameter unused):
                # nothing to perturb, but keep the backup so restore() works.
                if param.grad is None:
                    continue
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        """Put back the weights saved by `attack()` and clear the backup."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        # Clear only after every parameter has been restored; clearing inside
        # the loop discarded the backup after the first parameter visited.
        self.backup = {}

In [None]:
def train_fn(data_loader, model, optimizer, device):
  """
  Train `model` for one pass over `data_loader` with FGM adversarial training.

  For every batch the gradient of the clean loss is computed first; the
  embeddings are then perturbed, the adversarial loss is back-propagated on
  top of the clean gradient, the embeddings are restored and a single
  optimizer step is taken.

  Args:
    data_loader: yields dicts with 'ids', 'mask' and 'label' tensors.
    model: called as model(input_ids=..., attention_mask=..., labels=...);
      the loss must be the first element of its output.
    optimizer: optimizer over the model parameters.
    device: device the batch tensors are moved to.
  """
  model.train()
  fgm = FGM(model)

  losses = AverageMeter()
  tk0 = tqdm(data_loader, total=len(data_loader))

  for d in tk0:
    ids = d['ids'].to(device, dtype=torch.long)
    mask = d['mask'].to(device, dtype=torch.long)
    label = d['label'].to(device, dtype=torch.long)

    model.zero_grad()
    # Index the output instead of tuple-unpacking it: positional indexing
    # works for a plain tuple and for a dict-like model output, whereas
    # unpacking the latter yields its keys.
    loss = model(input_ids=ids, attention_mask=mask, labels=label)[0]
    loss.backward()

    # adversarial pass: gradients accumulate onto the clean ones
    fgm.attack()
    loss_adv = model(input_ids=ids, attention_mask=mask, labels=label)[0]
    loss_adv.backward()
    fgm.restore()

    optimizer.step()

    # only the clean loss is reported
    losses.update(loss.item(), ids.size(0))
    tk0.set_postfix(loss=losses.avg)


In [None]:
def eval_fn(data_loader, model, device):
  """
  Evaluate `model` on `data_loader` and return the F1 score.

  Args:
    data_loader: yields dicts with 'ids', 'mask' and 'label' tensors.
    model: called as model(input_ids=..., attention_mask=..., labels=...);
      its output must hold the loss first and the logits second.
    device: device the batch tensors are moved to.

  Returns:
    `f1_score` of the arg-max predictions against the gold labels.
  """
  model.eval()
  losses = AverageMeter()
  tk0 = tqdm(data_loader, total=len(data_loader))
  yt, yp = [], []

  for d in tk0:
    ids = d['ids'].to(device, dtype=torch.long)
    mask = d['mask'].to(device, dtype=torch.long)
    label = d['label'].to(device, dtype=torch.long)

    with torch.no_grad():
      # Positional indexing works for both tuple and dict-like model outputs.
      outputs = model(input_ids=ids, attention_mask=mask, labels=label)
    loss, logits = outputs[0], outputs[1]

    logits = logits.detach().cpu().numpy()

    # softmax is monotonic, so the arg-max of the logits is the predicted class
    pred_labels = np.argmax(logits, axis=1).flatten()
    ground_labels = label.to('cpu').numpy()

    # extend in place instead of rebuilding the lists on every batch
    yt.extend(ground_labels.tolist())
    yp.extend(pred_labels.tolist())

    losses.update(loss.item(), ids.size(0))
    tk0.set_postfix(loss=losses.avg)

  return f1_score(yt, yp)


In [None]:
def test_fn(data_loader, model, device):
  """
  Predict the probability of class 1 for every example in `data_loader`.

  Args:
    data_loader: yields dicts with 'ids' and 'mask' tensors (other keys are
      ignored).
    model: called as model(input_ids=..., attention_mask=...); the logits
      must be the first element of its output when no labels are given.
    device: device the batch tensors are moved to.

  Returns:
    list of floats, one per example, in loader order.
  """
  model.eval()
  tk0 = tqdm(data_loader, total=len(data_loader))
  test_preds = []

  for d in tk0:
    ids = d['ids'].to(device, dtype=torch.long)
    mask = d['mask'].to(device, dtype=torch.long)

    with torch.no_grad():
      # Labels are not passed: no loss is needed for prediction, and without
      # them the logits are the first output element.
      logits = model(input_ids=ids, attention_mask=mask)[0]

    logits = logits.detach().cpu().numpy()
    preds = softmax(logits)[:, 1]
    test_preds.extend(preds.tolist())

  return test_preds

In [None]:
def run(df_train, df_val, fold=None):
  """
  Fine-tune a fresh model on `df_train`, early-stop on `df_val` and return the
  best checkpoint's predictions for `df_val`.

  Args:
    df_train: DataFrame with `Text` and `Label` columns used for training.
    df_val: DataFrame with `Text` and `Label` columns used for validation.
    fold: fold number used in the checkpoint file name ('model_{fold}.bin');
      when None the checkpoint is saved as 'model.bin'.

  Returns:
    list of class-1 probabilities for the rows of `df_val`, in order.
  """
  train_dataset = Dataset(
        text = df_train.Text.values,
        label = df_train.Label.values,
    )

  valid_dataset = Dataset(
      text = df_val.Text.values,
      label = df_val.Label.values,
    )

  # Shuffle the training examples every epoch; the loader default keeps the
  # file order, so every epoch would otherwise see identical batches.
  train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config.TRAIN_BATCH_SIZE,
        shuffle=True,
        num_workers=4
    )

  # Validation order must stay fixed: predictions are written back by position.
  valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config.VALID_BATCH_SIZE,
        num_workers=2
    )

  model = transformers.RobertaForSequenceClassification.from_pretrained(config.MODEL, num_labels=2)
  device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
  model.to(device)

  lr = 2e-5
  param_optimizer = list(model.named_parameters())
  # Parameters excluded from weight decay. 'LayerNorm.weight' is added because
  # the 'gamma'/'beta' patterns do not match this model's layer-norm names.
  no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
  # The per-group key must be 'weight_decay': an unknown key such as
  # 'weight_decay_rate' is ignored and the optimizer default is used instead.
  optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay': 0.0}
  ]
  optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

  es = EarlyStopping(patience=3, mode="max")

  # One checkpoint path, shared by saving and reloading.
  model_name = "model.bin" if fold is None else f"model_{fold}.bin"
  model_path = os.path.join(config.SAVE_DIR, model_name)
  os.makedirs(config.SAVE_DIR, exist_ok=True)

  print('Starting training....')
  for epoch in range(config.EPOCHS):
    train_fn(train_data_loader, model, optimizer, device)
    # eval_fn returns the validation F1 score (higher is better)
    valid_score = eval_fn(valid_data_loader, model, device)
    print(f'Epoch :{epoch + 1} | Validation Score :{valid_score}')
    es(valid_score, model, model_path=model_path)
    if es.early_stop:
      print('Early stopping')
      break

  print('Predicting for OOF')
  # Reload the best checkpoint rather than using the last-epoch weights.
  model.load_state_dict(torch.load(model_path, map_location=device))
  model.to(device)

  test_predictions = test_fn(valid_data_loader, model, device)
  return test_predictions

In [None]:
def run_fold(fold_idx):
  """
    Train and evaluate a single fold of the k-fold cross-validation.

    The train and validation files are concatenated and split into
    `config.KFOLD` stratified folds; fold `fold_idx` is held out, a model is
    trained on the rest, and the held-out predictions are stored in the 'oof'
    column of `config.OOF_FILE` (created on first use).

    Args:
      fold_idx: index of the fold to hold out, in [0, config.KFOLD).

    Raises:
      ValueError: if `fold_idx` is outside that range.
  """

  if not 0 <= fold_idx < config.KFOLD:
    raise ValueError(f'fold_idx must be in [0, {config.KFOLD}), got {fold_idx}')

  seed_all()

  df_train = pd.read_csv(config.TRAIN_FILE)
  df_val = pd.read_csv(config.VAL_FILE)

  # concatenating train and validation set
  # (reset_index() without drop=True keeps the old row labels as an 'index' column)
  train = pd.concat([df_train, df_val]).reset_index()

  # dividing folds
  # random_state is only valid together with shuffle=True; combining it with
  # shuffle=False is rejected by current scikit-learn. The split stays
  # unshuffled, so every call reproduces the same folds.
  kf = model_selection.StratifiedKFold(n_splits=config.KFOLD, shuffle=False)
  idx = None

  for fold, (_, val_idx) in enumerate(kf.split(X=train, y=train.Label.values)):
      train.loc[val_idx, 'kfold'] = fold
      if fold==fold_idx:
        idx = val_idx

  os.makedirs(config.SAVE_DIR, exist_ok=True)
  if os.path.isfile(config.OOF_FILE):
    scores = pd.read_csv(config.OOF_FILE)
    print('Found oof file')
  else:
    scores = train.copy()
    scores['oof'] = 0.0
    scores.to_csv(config.OOF_FILE, index=False)
    print('Created oof file')

  # A file written with all-zero integers is read back as an integer column;
  # make it float before probabilities are assigned into it.
  scores['oof'] = scores['oof'].astype(float)

  df_train = train[train.kfold!=fold_idx]
  df_val = train[train.kfold==fold_idx]

  y = run(df_train, df_val, fold_idx)
  # `idx` holds the positions of the held-out rows, in the order they were scored
  scores.loc[idx, 'oof'] = y

  scores.to_csv(config.OOF_FILE, index=False)


In [None]:
# WARNING: deletes SAVE_DIR and recreates it empty — any saved fold checkpoints
# and the OOF file stored there are lost. Skip this cell when resuming a run.
! rm -rf {config.SAVE_DIR} && mkdir {config.SAVE_DIR}

In [None]:
# Fold 0: train, checkpoint to model_0.bin and record its out-of-fold predictions.
run_fold(0)

In [None]:
# Fold 1: train, checkpoint to model_1.bin and record its out-of-fold predictions.
run_fold(1)

In [None]:
# Fold 2: train, checkpoint to model_2.bin and record its out-of-fold predictions.
run_fold(2)

In [None]:
# Fold 3: train, checkpoint to model_3.bin and record its out-of-fold predictions.
run_fold(3)

In [None]:
# Fold 4: train, checkpoint to model_4.bin and record its out-of-fold predictions.
run_fold(4)

# Run Results

In [None]:
# Load the out-of-fold predictions and encode the gold labels as 1/0
# (labels other than the two mapped strings become NaN).
df = pd.read_csv(config.OOF_FILE)
df['gold'] = df['Label'].map({'INFORMATIVE':1, 'UNINFORMATIVE':0})
df.head(3)

In [None]:
# Binarise the OOF probabilities at 0.5 and print per-class precision/recall/F1.
df['pred'] = (df['oof']>=0.5)*1
print(classification_report(df['gold'].values, df['pred'].values))

In [None]:
# Threshold-independent check: ROC AUC of the OOF probabilities.
from sklearn.metrics import roc_auc_score
roc_auc_score(df['gold'].values, df['oof'].values)

In [None]:
# Sweep decision thresholds over [0, 1) in steps of 0.001 and report the one
# with the highest OOF F1, together with that F1.
thresholds = np.arange(0, 1, 0.001)
fscores = [f1_score(df['gold'].values, (df['oof']>=t)*1) for t in thresholds]
idx = np.argmax(fscores)
print(thresholds[idx], fscores[idx])

# Test Predictions

In [None]:
# Score the test file with every fold checkpoint, average the class-1
# probabilities across folds and write the scores and the label submission.
df = pd.read_csv(config.TEST_FILE)

test_dataset = Dataset(
      text = df.Text.values,
      label = df.Label.values,
    )

test_data_loader = torch.utils.data.DataLoader(
      test_dataset,
      batch_size=config.VALID_BATCH_SIZE,
      num_workers=4
  )

scores = pd.DataFrame()

model = transformers.RobertaForSequenceClassification.from_pretrained(config.MODEL, num_labels=2)
device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
model.to(device)

# One probability column per fold; the same model object is reused and only
# its weights are swapped.
prob_columns = []
for i in range(config.KFOLD):
  checkpoint = os.path.join(config.SAVE_DIR, f'model_{i}.bin')
  model.load_state_dict(torch.load(checkpoint, map_location=device))
  y_preds = test_fn(test_data_loader, model, device)
  scores[f'prob_{i}'] = y_preds
  prob_columns.append(f'prob_{i}')

# Average over however many folds were configured instead of hard-coding five columns.
scores['avg'] = scores[prob_columns].mean(axis=1)
scores['preds'] = (scores['avg']>=0.5)*1
scores['Labels'] = scores['preds'].map({1:'INFORMATIVE', 0:'UNINFORMATIVE'})
scores.to_csv(os.path.join(config.SAVE_DIR, 'scores.csv'), index=False)

# Submission format: one predicted label per line, in test-file order.
with open(os.path.join(config.SAVE_DIR, 'submission.txt'), 'w') as f:
  for predicted_label in scores['Labels'].values:
    f.write(predicted_label+'\n')