In [1]:
# !pip --trusted-host pypi.org --trusted-host files.pythonhosted.org install -r requirements.txt -qq

import os
import gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime

import torch
from transformers import *
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from ast import literal_eval

In [2]:
# 환경설정
## 기타설정
os.makedirs('tokens', exist_ok=True)
os.makedirs('model', exist_ok=True)
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## 환경변수
VER = 0.1
MODEL_NAME = 'google/bigbird-roberta-base'
CONFIG = {
    'model_name': MODEL_NAME,
    'max_length': 1024,
    'train_batch_size': 4,
    'valid_batch_size': 4,
    'epochs': 5,
    'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    'max_grad_norm': 10,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
}
COMPUTE_VAL_SCORE = True if len(os.listdir('data/test')) <= 5 else False

OUTPUT_LABELS = [
    'O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position',
    'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
    'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement'
]
LABELS_TO_IDS = {v: k for k, v in enumerate(OUTPUT_LABELS)}
IDS_TO_LEBELS = {k: v for k, v in enumerate(OUTPUT_LABELS)}

In [3]:
## 모델 설정
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
tokenizer.save_pretrained('model')

config_model = AutoConfig.from_pretrained(MODEL_NAME)
config_model.num_labels = 15
config_model.save_pretrained('model')

backbone = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config_model)
backbone.save_pretrained('model')

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

In [4]:
# 데이터 로딩
## 다운로드 (using Kaggle API)
# !kaggle competitions download -q -c feedback-prize-2021
# !unzip feedback-prize-2021.zip -d data

## train.csv
train_df = pd.read_csv('data/train.csv')

## train
train_ids, train_texts = [], []
for f in tqdm(os.listdir('data/train')):
    train_ids.append(f.replace('.txt', ''))
    train_texts.append(open(f'data/train/{f}', 'r', encoding='utf8').read())
train_text_df = pd.DataFrame({'id': train_ids, 'text': train_texts})

## test
test_ids, test_texts = [], []
for f in tqdm(os.listdir('data/test')):
    test_ids.append(f.replace('.txt', ''))
    test_texts.append(open(f'data/test/{f}', 'r', encoding='utf8').read())
test_text_df = pd.DataFrame({'id': test_ids, 'text': test_texts})

100%|██████████| 15594/15594 [00:04<00:00, 3653.43it/s]
100%|██████████| 5/5 [00:00<00:00, 1663.35it/s]


In [5]:
# 데이터 가공
# all_entities = []
# for i, row in train_text_df.iterrows():
#     if i%100 == 0: print(i, '', end='')
#     total = len(row['text'].split())
#     id = row['id']
#     entities = ['O']*total
#     for _, discource in train_df.query("id == @id").iterrows():
#         disc_type = discource['discourse_type']
#         list_ix = [int(x) for x in discource['predictionstring'].split(' ')]
#         entities[list_ix[0]] = f'B-{disc_type}'
#         for j in list_ix[1:]:
#             entities[j] = f'I-{disc_type}'
#     all_entities.append(entities)
# train_text_df['entities'] = all_entities
# train_text_df.to_csv('tokens/train_NER.csv', index=False)

train_text_df = pd.read_csv('tokens/train_NER.csv')

In [6]:
# 데이터셋 클래스 정의
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length, get_wids):
        self.len = len(data)
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.get_wids = get_wids

    def __getitem__(self, index):
        text = self.data.text[index]
        word_labels = eval(self.data.entities[index]) if not self.get_wids else None

        encoding = self.tokenizer(
            text.split(),
            is_split_into_words = True,
            padding = 'max_length',
            truncation = True,
            max_length = self.max_length
        )
        word_ids = encoding.word_ids()
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}

        if not self.get_wids:
            label_ids = []
            for word_idx in word_ids:
                label_ids.append(LABELS_TO_IDS[word_labels[word_idx]] if word_idx is not None else -100)
            item['labels'] = torch.as_tensor(label_ids)
        else:
            item['wids'] = torch.as_tensor([w if w is not None else -1 for w in word_ids])

        return item
                
    def __len__(self):
        return self.len

In [7]:
# 데이터 로더 생성
IDS = train_df.id.unique()
np.random.seed(42)
train_idx = np.random.choice(np.arange(len(IDS)), int(0.9*len(IDS)), replace=False)
valid_idx = np.setdiff1d(np.arange(len(IDS)), train_idx)
np.random.seed(None)

tokenizer = AutoTokenizer.from_pretrained('model')

train_dataset = train_text_df.query('id in @IDS[@train_idx]')[['text', 'entities']].reset_index(drop=True)
valid_dataset = train_text_df.query('id in @IDS[@valid_idx]')[['text', 'entities']].reset_index(drop=True)
test_dataset = test_text_df.copy()

training_set = CustomDataset(train_dataset, tokenizer, CONFIG['max_length'], False)
validating_set = CustomDataset(valid_dataset, tokenizer, CONFIG['max_length'], True)
testing_set = CustomDataset(test_dataset, tokenizer, CONFIG['max_length'], True)

train_params = {
    'batch_size': CONFIG['train_batch_size'],
    'shuffle': True,
    'num_workers': 2,
    'pin_memory': True
}

valid_params = {
    'batch_size': CONFIG['valid_batch_size'],
    'shuffle': False,
    'num_workers': 2,
    'pin_memory': True
}

training_loader = DataLoader(training_set, **train_params)
validating_loader = DataLoader(validating_set, **valid_params)
testing_loader = DataLoader(testing_set, **valid_params)

In [24]:
# 모델 설정
config_model = AutoConfig.from_pretrained('model/config.json')
model = AutoModelForTokenClassification.from_pretrained('model/pytorch_model.bin', config=config_model)
model.to(CONFIG['device'])
optimizer = torch.optim.Adam(params=model.parameters(), lr=CONFIG['learning_rates'][0])

In [81]:
# 모델 학습
for epoch in range(CONFIG['epochs']):

    for g in optimizer.param_groups:
        g['lr'] = CONFIG['learning_rates'][epoch]
    lr = optimizer.param_groups[0]['lr']

    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    model.train()
    for _, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(CONFIG['device'], dtype=torch.long).unsqueeze(0)
        mask = batch['attention_mask'].to(CONFIG['device'], dtype=torch.long).unsqueeze(0)
        labels = batch['labels'].to(CONFIG['device'], dtype=torch.long).unsqueeze(0)
        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        flattened_targets = labels.view(-1)
        active_logits = tr_logits.view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)
        active_accuracy = labels.view(-1) != -100
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        tr_accuracy += accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())

        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=CONFIG['max_grad_norm'])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    torch.cuda.empty_cache()
    gc.collect()

# torch.save(model.state_dict(), f'bigbird_v{VER}.pt')
# model.load_state_dict(torch.load(f'bigbird_v{VER}.pt'))

{'input_ids': tensor([  65, 2874,  762,  ...,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'labels': tensor([-100,    3,    4,  ..., -100, -100, -100])}

In [None]:
def inference(batch):
                
    # MOVE BATCH TO GPU AND INFER
    ids = batch["input_ids"].to(config['device'])
    mask = batch["attention_mask"].to(config['device'])
    outputs = model(ids, attention_mask=mask, return_dict=False)
    all_preds = torch.argmax(outputs[0], axis=-1).cpu().numpy() 

    # INTERATE THROUGH EACH TEXT AND GET PRED
    predictions = []
    for k,text_preds in enumerate(all_preds):
        token_preds = [ids_to_labels[i] for i in text_preds]

        prediction = []
        word_ids = batch['wids'][k].numpy()  
        previous_word_idx = -1
        for idx,word_idx in enumerate(word_ids):                            
            if word_idx == -1:
                pass
            elif word_idx != previous_word_idx:              
                prediction.append(token_preds[idx])
                previous_word_idx = word_idx
        predictions.append(prediction)
    
    return predictions

In [None]:
def get_predictions(df=test_dataset, loader=testing_loader):
    
    # put model in training mode
    model.eval()
    
    # GET WORD LABEL PREDICTIONS
    y_pred2 = []
    for batch in loader:
        labels = inference(batch)
        y_pred2.extend(labels)

    final_preds2 = []
    for i in range(len(df)):

        idx = df.id.values[i]
        #pred = [x.replace('B-','').replace('I-','') for x in y_pred2[i]]
        pred = y_pred2[i] # Leave "B" and "I"
        preds = []
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == 'O': j += 1
            else: cls = cls.replace('B','I') # spans start with B
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            
            if cls != 'O' and cls != '' and end - j > 7:
                final_preds2.append((idx, cls.replace('I-',''),
                                     ' '.join(map(str, list(range(j, end))))))
        
            j = end
        
    oof = pd.DataFrame(final_preds2)
    oof.columns = ['id','class','predictionstring']

    return oof

In [None]:
def calc_overlap(row):
    """
    Calculates the overlap between prediction and
    ground truth and overlap percentages used for determining
    true positives.
    """
    set_pred = set(row.predictionstring_pred.split(' '))
    set_gt = set(row.predictionstring_gt.split(' '))
    # Length of each and intersection
    len_gt = len(set_gt)
    len_pred = len(set_pred)
    inter = len(set_gt.intersection(set_pred))
    overlap_1 = inter / len_gt
    overlap_2 = inter/ len_pred
    return [overlap_1, overlap_2]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition
        
    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation
    """
    gt_df = gt_df[['id','discourse_type','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id','class','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id','class'],
                           right_on=['id','discourse_type'],
                           how='outer',
                           suffixes=('_pred','_gt')
                          )
    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    joined['overlaps'] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5, 
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    joined['overlap1'] = joined['overlaps'].apply(lambda x: eval(str(x))[0])
    joined['overlap2'] = joined['overlaps'].apply(lambda x: eval(str(x))[1])


    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)
    tp_pred_ids = joined.query('potential_TP') \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','predictionstring_gt']).first()['pred_id'].values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    fp_pred_ids = [p for p in joined['pred_id'].unique() if p not in tp_pred_ids]

    matched_gt_ids = joined.query('potential_TP')['gt_id'].unique()
    unmatched_gt_ids = [c for c in joined['gt_id'].unique() if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    #calc microf1
    my_f1_score = TP / (TP + 0.5*(FP+FN))
    return my_f1_score

In [None]:
if COMPUTE_VAL_SCORE: # note this doesn't run during submit
    # VALID TARGETS
    valid = train_df.loc[train_df['id'].isin(IDS[valid_idx])]

    # OOF PREDICTIONS
    oof = get_predictions(test_dataset, testing_loader)

    # COMPUTE F1 SCORE
    f1s = []
    CLASSES = oof['class'].unique()
    print()
    for c in CLASSES:
        pred_df = oof.loc[oof['class']==c].copy()
        gt_df = valid.loc[valid['discourse_type']==c].copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        print(c,f1)
        f1s.append(f1)
    print()
    print('Overall',np.mean(f1s))
    print()

In [None]:
sub = get_predictions(test_texts, test_texts_loader)
sub.head()

In [None]:
sub.to_csv("submission.csv", index=False)