In [1]:
import os
import gc
import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from ast import literal_eval
from transformers import *
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [2]:
# 환경설정
## 기타설정
os.makedirs('tokens', exist_ok=True)
os.makedirs('model', exist_ok=True)
os.makedirs('data', exist_ok=True)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
warnings.filterwarnings('ignore')

## 환경변수
VER = 0.1
MODEL_NAME = 'google/bigbird-roberta-base'
CONFIG = {
    'model_name': MODEL_NAME,
    'max_length': 1024,
    'train_batch_size': 2,
    'valid_batch_size': 2,
    'epochs': 5,
    'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    'max_grad_norm': 10,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
COMPUTE_VAL_SCORE = True

OUTPUT_LABELS = [
    'O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position',
    'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
    'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement'
]
LABELS_TO_IDS = {v: k for k, v in enumerate(OUTPUT_LABELS)}
IDS_TO_LABELS = {k: v for k, v in enumerate(OUTPUT_LABELS)}

## colab 파일 업로드
# from google.colab import files
# files.upload()
# !mkdir -p ~/.kaggle && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

In [3]:
## 모델 설정
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True, cache_dir='cache')
tokenizer.save_pretrained('model')

config_model = AutoConfig.from_pretrained(MODEL_NAME)
config_model.num_labels = 15
config_model.save_pretrained('model')

backbone = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config_model)
backbone.save_pretrained('model')

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

In [4]:
# 데이터 로딩
## 다운로드 (using Kaggle API)
if os.listdir('data') == []:
  !kaggle competitions download -q -c feedback-prize-2021
  !unzip -qq feedback-prize-2021.zip -d data

## train.csv
train_df = pd.read_csv('data/train.csv')

## train
train_ids, train_texts = [], []
for f in tqdm(os.listdir('data/train')):
    train_ids.append(f.replace('.txt', ''))
    train_texts.append(open(f'data/train/{f}', 'r', encoding='utf8').read())
train_text_df = pd.DataFrame({'id': train_ids, 'text': train_texts})

## test
test_ids, test_texts = [], []
for f in tqdm(os.listdir('data/test')):
    test_ids.append(f.replace('.txt', ''))
    test_texts.append(open(f'data/test/{f}', 'r', encoding='utf8').read())
test_text_df = pd.DataFrame({'id': test_ids, 'text': test_texts})

 11%|█         | 1747/15594 [00:03<00:34, 400.14it/s]

In [None]:
# 데이터 가공
if 'train_NER.csv' not in os.listdir('tokens'):
  all_entities = []
  for i, row in train_text_df.iterrows():
      if i%100 == 0: print(i, '', end='')
      total = len(row['text'].split())
      id = row['id']
      entities = ['O']*total
      for _, discource in train_df.query("id == @id").iterrows():
          disc_type = discource['discourse_type']
          list_ix = [int(x) for x in discource['predictionstring'].split(' ')]
          entities[list_ix[0]] = f'B-{disc_type}'
          for j in list_ix[1:]:
              entities[j] = f'I-{disc_type}'
      all_entities.append(entities)
  train_text_df['entities'] = all_entities
  train_text_df.to_csv('tokens/train_NER.csv', index=False)

train_text_df = pd.read_csv('tokens/train_NER.csv')

In [None]:
# 데이터셋 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, get_wids):
        self.len = len(data)
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.get_wids = get_wids

    def __getitem__(self, index):
        text = self.data.text[index]
        word_labels = eval(self.data.entities[index]) if not self.get_wids else None

        encoding = self.tokenizer(
            text.split(),
            is_split_into_words = True,
            padding = 'max_length',
            truncation = True,
            max_length = self.max_length
        )
        word_ids = encoding.word_ids()
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}

        if not self.get_wids:
            label_ids = []
            for word_idx in word_ids:
                label_ids.append(LABELS_TO_IDS[word_labels[word_idx]] if word_idx is not None else -100)
            item['labels'] = torch.as_tensor(label_ids)
        else:
            item['wids'] = torch.as_tensor([w if w is not None else -1 for w in word_ids])

        return item
                
    def __len__(self):
        return self.len

In [None]:
# 데이터 로더 생성
all_idx = train_df.id.unique()
np.random.seed(42)
train_idx = np.random.choice(np.arange(len(all_idx)), int(0.9*len(all_idx)), replace=False)
valid_idx = np.setdiff1d(np.arange(len(all_idx)), train_idx)
np.random.seed(None)

tokenizer = AutoTokenizer.from_pretrained('model')

train_dataset = train_text_df.query('id in @all_idx[@train_idx]')[['text', 'entities']].reset_index(drop=True)
valid_dataset = train_text_df.query('id in @all_idx[@valid_idx]').reset_index(drop=True)
test_dataset = test_text_df.copy()

training_set = CustomDataset(train_dataset, tokenizer, CONFIG['max_length'], False)
validating_set = CustomDataset(valid_dataset, tokenizer, CONFIG['max_length'], True)
testing_set = CustomDataset(test_dataset, tokenizer, CONFIG['max_length'], True)

train_params = {
    'batch_size': CONFIG['train_batch_size'],
    'shuffle': True,
    # 'num_workers': 4,
    'pin_memory': True
}

valid_params = {
    'batch_size': CONFIG['valid_batch_size'],
    'shuffle': False,
    # 'num_workers': 4,
    'pin_memory': True
}

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **valid_params)
testing_loader = DataLoader(testing_set, **valid_params)

In [None]:
# 모델 설정
config_model = AutoConfig.from_pretrained('model/config.json')
model = AutoModelForTokenClassification.from_pretrained('model/pytorch_model.bin', config=config_model)
model.to(CONFIG['device'])
optimizer = torch.optim.Adam(params=model.parameters(), lr=CONFIG['learning_rates'][0])

In [None]:
# 모델 학습
for epoch in range(500):

    for g in optimizer.param_groups:
        g['lr'] = CONFIG['learning_rates'][0]

    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    model.train()
    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(CONFIG['device'], dtype=torch.long)
        mask = batch['attention_mask'].to(CONFIG['device'], dtype=torch.long)
        labels = batch['labels'].to(CONFIG['device'], dtype=torch.long)
        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 200==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss after {idx:04d} training steps: {loss_step}")

        flattened_targets = labels.view(-1)
        active_logits = tr_logits.view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)
        active_accuracy = labels.view(-1) != -100
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        tr_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())

        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=CONFIG['max_grad_norm'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

    torch.cuda.empty_cache()
    gc.collect()

torch.save(model.state_dict(), f'bigbird_v{VER}.pt')

model.load_state_dict(torch.load(f'bigbird_v{VER}.pt'))

Training loss after 0000 training steps: 0.3237726390361786
Training loss after 0200 training steps: 0.11143694938199863
Training loss after 0400 training steps: 0.10014219068285737
Training loss after 0600 training steps: 0.09455650073484659
Training loss after 0800 training steps: 0.09200195428571163
Training loss after 1000 training steps: 0.09061759118687131
Training loss after 1200 training steps: 0.09262294805070523
Training loss after 1400 training steps: 0.09222206074696601
Training loss after 1600 training steps: 0.09315148285860027
Training loss after 1800 training steps: 0.09471612558961287
Training loss after 2000 training steps: 0.09507517241969424
Training loss after 2200 training steps: 0.09546102405474333
Training loss after 2400 training steps: 0.09452099738103795
Training loss after 2600 training steps: 0.09505892524579687
Training loss after 2800 training steps: 0.09686047445250537
Training loss after 3000 training steps: 0.09791244794104108
Training loss after 3200 

KeyboardInterrupt: 

In [None]:
def inference(batch):
    ids = batch['input_ids'].to(CONFIG['device'])
    mask = batch['attention_mask'].to(CONFIG['device'])
    outputs = model(input_ids=ids, attention_mask=mask, return_dict=False)
    all_preds = torch.argmax(outputs[0], axis=-1).cpu().numpy()

    predictions = []
    for k, text_preds in enumerate(all_preds):
        token_preds = [IDS_TO_LABELS[i] for i in text_preds]

        prediction = []
        word_ids = batch['wids'][k].numpy()
        previous_word_idx = -1
        for idx, word_idx in enumerate(word_ids):
          if word_idx == -1:
            pass
          elif word_idx != previous_word_idx:
            prediction.append(token_preds[idx])
            previous_word_idx = word_idx
        predictions.append(prediction)

    return predictions  

In [None]:
def get_predictions(df, loader):
    model.eval()

    y_pred2 = []
    for batch in loader:
        labels = inference(batch)
        y_pred2.extend(labels)

    final_preds2 = []
    for i in range(len(df)):
        idx = df.id.values[i]
        pred = y_pred2[i]
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == 'O':
              j += 1
            else:
              cls = cls.replace('B', 'I')
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            if cls != 'O' and cls != '' and end - j > 7:  
                final_preds2.append((idx, cls.replace('I-', ''), ' '.join(map(str, list(range(j, end))))))
            j = end
    oof = pd.DataFrame(final_preds2)
    oof.columns = ['id', 'class', 'predictionstring']

    torch.cuda.empty_cache()
    gc.collect()
    
    return oof

In [None]:
def calc_overlap(row):
    set_pred = set(row['predictionstring_pred'].split(' '))
    set_gt = set(row['predictionstring_gt'].split(' '))
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    len_inter = len(set_pred.intersection(set_gt))
    overlap_1 = len_inter/len_gt
    overlap_2 = len_inter/len_pred
    return [overlap_1, overlap_2]

def score_feedback_comp(pred_df, gt_df):
    gt_df = gt_df[['id', 'discourse_type', 'predictionstring']].rename(columns={'discourse_type': 'class'}).reset_index(drop=True).rename_axis('pred_id').reset_index()
    pred_df = pred_df.reset_index(drop=True).rename_axis('gt_id').reset_index()
    joined_df = pred_df.merge(gt_df, on =['id', 'class'], how='outer', suffixes=('_pred', '_gt'))

    joined_df = joined_df.assign(
        predictionstring_pred = lambda x: x['predictionstring_pred'].fillna(' '),
        predictionstring_gt = lambda x: x['predictionstring_gt'].fillna(' '),
        overlaps = lambda x: x.apply(calc_overlap, axis=1),
        overlap1 = lambda x: x['overlaps'].apply(lambda x: eval(str(x))[0]),
        overlap2 = lambda x: x['overlaps'].apply(lambda x: eval(str(x))[1]),
        potential_TP = lambda x: (x['overlap1'] >= 0.5) & (x['overlap2'] >= 0.5),
        max_overlap = lambda x: x[['overlap1', 'overlap2']].max(axis=1),
    )

    tp_pred_ids = (joined_df
      .query('potential_TP')
      .sort_values(by='max_overlap', ascending=False)
      .groupby(['id', 'predictionstring_gt']).first()['pred_id'].values
    )
    fp_pred_ids = [p for p in joined_df['pred_id'].unique() if p not in tp_pred_ids]
    matched_gt_ids = joined_df.query('potential_TP')['gt_id'].unique()
    unmatched_gt_ids = [c for c in joined_df['gt_id'].unique() if c not in matched_gt_ids]

    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)

    my_f1_score = TP / (TP + 0.5*(FP+FN))
    return my_f1_score

In [None]:
valid = train_df.query('id in @all_idx[@valid_idx]')
oof = get_predictions(valid_dataset, validating_loader)

In [None]:
f1s = []
CLASSES = oof['class'].unique()
c = CLASSES[0]
for c in CLASSES:
    gt_df = valid.loc[lambda x: x['discourse_type'] == c].copy()
    pred_df = oof.loc[lambda x: x['class'] == c].copy()
    f1 = score_feedback_comp(pred_df, gt_df)
    print(c, f1)
    f1s.append(f1)
print('='*30)
print('Overall', np.mean(f1s))
print()

Position 0.6364238410596027
Evidence 0.6248108925869894
Counterclaim 0.44612068965517243
Rebuttal 0.32238805970149254
Claim 0.4830917874396135
Concluding Statement 0.7395315826827538
Lead 0.7908946532556909
Overall 0.5776087866259021

