
# Libraries

In [None]:
import collections
import copy
import os
import time
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer, logging

# QOL Settings

In [None]:
# Suppress all Python warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')
# `logging` here is transformers' logging module: only report errors.
logging.set_verbosity_error()
# Column display width of 0 — presumably meant to stop pandas truncating long
# cell values; verify against the installed pandas version.
pd.set_option('display.max_colwidth', 0)

# Config

In [None]:
# Token-level tag set: 'O' followed by a 'B-'/'I-' pair for every discourse
# type. The position of a tag in this list is used as its integer class id
# (see settings.TARGET_ID_MAP), so the order matters.
output_labels = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
          'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']

class settings:
    """Notebook-wide configuration: paths, hyper-parameters and label maps."""

    # Input data location and number of available CPU cores.
    DATA_PATH = '../input/feedback-prize-2021/'
    WORKERS = os.cpu_count()

    # Passed to the tokenizer as `max_length` and `stride` respectively.
    MAX_TOK_LEN = 512
    STRIDE = 256

    # Optimisation settings; LR is indexed by epoch number in the run loop.
    BATCH = 4
    LR = [2.5e-05, 2.5e-05, 2.5e-06, 2.5e-07, 2.5e-07]
    GRAD_NORM = 10
    EPOCH = 5
    FOLD = 3

    # Tag name <-> integer class id, derived from the order of `output_labels`.
    TARGET_ID_MAP = dict(zip(output_labels, range(len(output_labels))))
    ID_TARGET_MAP = {class_id: tag for tag, class_id in TARGET_ID_MAP.items()}

    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Pretrained backbone location and per-fold weight file names.
    MODEL_BASE_PATH = '../input/model-bin-fbp/'
    MODEL_NAME = 'roberta-base'
    MODEL_PATH = 'model'
    WEIGHTS_PATH = [f'fbp_model_{fold}.pt' for fold in range(5)]


# Discourse types without their B-/I- prefix; the list index is the group id.
label_group = [
    'O',
    'Lead',
    'Position',
    'Claim',
    'Counterclaim',
    'Rebuttal',
    'Evidence',
    'Concluding Statement',
]

# Class id -> group id. 'O' (class 0) maps to group 0; every other tag is
# looked up by its name with the two-character 'B-'/'I-' prefix removed.
label_to_lg_map = {0: 0}
for target_id, label in settings.ID_TARGET_MAP.items():
    if label != 'O':
        label_to_lg_map[target_id] = label_group.index(label[2:])

# Minimum mean prediction score a span needs in order to be kept, per class.
proba_thresh = {
    'Lead': 0.7,
    'Position': 0.55,
    'Evidence': 0.65,
    'Claim': 0.55,
    'Concluding Statement': 0.7,
    'Counterclaim': 0.5,
    'Rebuttal': 0.55,
}

# Minimum number of words a span needs in order to be kept, per class.
min_thresh = {
    'Lead': 9,
    'Position': 5,
    'Evidence': 14,
    'Claim': 3,
    'Concluding Statement': 11,
    'Counterclaim': 6,
    'Rebuttal': 4,
}

# Utility Functions

In [None]:
def read_file(id: str, mode: str, split: bool=True):
    """Load the essay `<DATA_PATH>/<mode>/<id>.txt` as UTF-8 text.

    Args:
        id: File name without the `.txt` extension.
        mode: Sub-folder of `settings.DATA_PATH` to read from.
        split: When true, return the whitespace-separated words instead of
            the raw string.

    Returns:
        A list of words if `split` is true, otherwise the full file contents.
    """
    path = os.path.join(settings.DATA_PATH, mode, f'{id}.txt')

    with open(path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    return text.split() if split else text

def seed_everything(seed: int):
    """Seed every random number generator used by the notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices),
    exports `PYTHONHASHSEED`, and configures cuDNN for reproducible runs.

    Args:
        seed: The seed value applied to all generators.
    """
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

# Preprocessing Functions

In [None]:
def tokenize(preprocessed_df, tokenizer, with_labels=True, to_tensor=True):
    """Tokenize pre-split texts into fixed-length, overlapping chunks.

    Args:
        preprocessed_df: Frame with a `split_text` column (list of words per
            row) and, when `with_labels` is true, a `labels` column holding
            one label per word.
        tokenizer: Tokenizer called with `is_split_into_words=True`; its
            result must provide `overflow_to_sample_mapping` and `word_ids`.
        with_labels: Add a `labels` entry aligned to the tokens of each chunk.
        to_tensor: Convert every entry to a tensor via `torch.as_tensor`.

    Returns:
        The tokenizer output extended with `wids` (word index per token, -1
        where the tokenizer reports no word) and optionally `labels` (-100
        where the tokenizer reports no word). A plain dict of tensors when
        `to_tensor` is true, otherwise the tokenizer's own output object.
    """
    encoded = tokenizer(
        preprocessed_df['split_text'].tolist(),
        is_split_into_words=True,
        return_overflowing_tokens=True,
        stride=settings.STRIDE,
        max_length=settings.MAX_TOK_LEN,
        padding='max_length',
        truncation=True,
    )

    chunk_labels = []
    chunk_wids = []
    # Each chunk remembers which input row it was cut from.
    for chunk_idx, text_idx in enumerate(encoded['overflow_to_sample_mapping']):
        # Word index per token; presumably relative to the full, un-chunked
        # text (the labels below are indexed that way) — verify if changed.
        word_ids = encoded.word_ids(chunk_idx)

        if with_labels:
            # Labels of the whole text, looked up per token via its word index.
            word_labels = preprocessed_df['labels'].iloc[text_idx]
            chunk_labels.append(
                [word_labels[wid] if wid is not None else -100 for wid in word_ids]
            )

        chunk_wids.append([-1 if wid is None else wid for wid in word_ids])

    if with_labels:
        encoded['labels'] = chunk_labels
    encoded['wids'] = chunk_wids

    if not to_tensor:
        return encoded
    return {key: torch.as_tensor(val) for key, val in encoded.items()}

# Dataset Definition

In [None]:
class FeedbackPrizeDataset(Dataset):
    """Dataset view over a mapping of column name -> indexable sequence."""

    def __init__(self, tokenized_ds):
        # Stored as-is; it must contain an 'input_ids' entry (used for len).
        self.data = tokenized_ds

    def __len__(self):
        """Number of samples, taken from the 'input_ids' column."""
        return len(self.data['input_ids'])

    def __getitem__(self, index):
        """Return one sample as a dict holding the `index`-th item of every column."""
        sample = {}
        for field in self.data.keys():
            sample[field] = self.data[field][index]
        return sample

# Model Definition

In [None]:
class FeedbackModel(torch.nn.Module):
    """Pretrained transformer followed by dropout and a per-token linear head."""

    def __init__(self, num_labels):
        """Load the backbone from the configured model directory.

        Args:
            num_labels: Size of the output layer (number of token classes).
        """
        super().__init__()
        self.num_labels = num_labels

        model_dir = os.path.join(settings.MODEL_BASE_PATH, settings.MODEL_PATH)

        # Start from the stored config and override two of its fields.
        config = AutoConfig.from_pretrained(f'{model_dir}/config.json')
        overrides = {
            'hidden_dropout_prob': 0.1,
            'layer_norm_eps': 1e-7,
        }
        config.update(overrides)

        self.transformer = AutoModel.from_pretrained(
            f'{model_dir}/pytorch_model.bin', config=config
        )
        self.output = nn.Linear(config.hidden_size, self.num_labels)
        self.dropout = nn.Dropout(0.1)

    def forward(self, ids, mask):
        """Return per-token logits for the given input ids and attention mask."""
        hidden = self.transformer(ids, mask).last_hidden_state
        return self.output(self.dropout(hidden))

# Training Function

In [None]:
def train(model, training_loader, optimizer, loss_func):
    """Run one training epoch over `training_loader`.

    Args:
        model: Module called as `model(ids, mask)`; its output is transposed
            on dims 1 and 2 before being passed to `loss_func`.
        training_loader: Iterable of batches with `input_ids`,
            `attention_mask` and `labels` tensors.
        optimizer: Optimizer that updates `model`'s parameters.
        loss_func: Callable `(prediction, labels) -> scalar loss tensor`.

    Returns:
        The mean batch loss over the epoch (0.0 if the loader was empty).
    """
    model.train()

    tr_loss = 0.0
    n_seen = 0

    # Wrapping the loader itself (not `iter(loader)`) lets tqdm read its length.
    training_iter = tqdm(training_loader)
    for encoded_input in training_iter:
        # load data and move to the configured device
        ids    = encoded_input['input_ids'].to(settings.DEVICE)
        mask   = encoded_input['attention_mask'].to(settings.DEVICE)
        labels = encoded_input['labels'].to(settings.DEVICE)

        with torch.cuda.amp.autocast():
            # forward pass
            pred = model(ids, mask)

            # calculate loss
            loss = loss_func(pred.transpose(1, 2), labels)

        tr_loss += loss.item()
        n_seen += 1

        # propagate backward
        optimizer.zero_grad()
        loss.backward()

        # Clip AFTER backward so the limit applies to the gradients that are
        # about to be used; clipping before backward only touches stale ones.
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=settings.GRAD_NORM
        )
        optimizer.step()

        if n_seen % 200 == 0:
            # Average over the number of batches actually processed.
            training_iter.set_description(f'loss: {tr_loss/n_seen}')

    return tr_loss / n_seen if n_seen else 0.0

# Inference function

In [None]:
def get_predictions(model, dataloader):
    """Aggregate chunked predictions into one class and score per word.

    Class probabilities of every token are summed per (document, word)
    across all chunks, so words appearing in several chunks are averaged.

    Args:
        model: Module called as `model(ids, mask)` returning per-token logits.
        dataloader: Iterable of batches with `input_ids`, `attention_mask`,
            `overflow_to_sample_mapping` and `wids` tensors.

    Returns:
        A tuple `(predictions, pred_score)` of nested defaultdicts keyed by
        document index and then word index. `predictions` holds the argmax
        class id, `pred_score` the averaged probability of that class.
    """
    model.eval()

    predictions = collections.defaultdict(lambda : collections.defaultdict(lambda: np.zeros(len(settings.ID_TARGET_MAP))))
    pred_score = collections.defaultdict(lambda : collections.defaultdict(float))
    seen_word_idx = collections.defaultdict(lambda : collections.defaultdict(int))

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for encoded_input in dataloader:
            ids  = encoded_input['input_ids'].to(settings.DEVICE)
            mask = encoded_input['attention_mask'].to(settings.DEVICE)

            output = model(ids, mask)
            output = torch.softmax(output, dim=-1).cpu().numpy()

            osm = encoded_input['overflow_to_sample_mapping'].numpy() # doc_id index
            word_ids = encoded_input['wids'].numpy()

            for i, wids, pred in zip(osm, word_ids, output):
                for j, wid in enumerate(wids):
                    # -1 marks tokens that belong to no word.
                    if wid != -1:
                        predictions[i][wid] += pred[j]
                        seen_word_idx[i][wid] += 1

    for i in predictions:
        for wid in predictions[i]:
            summed = predictions[i][wid]
            argmax = np.argmax(summed)
            # Dividing by the hit count does not change the argmax, so only
            # the winning probability needs to be averaged.
            pred_score[i][wid] = float(summed[argmax])/seen_word_idx[i][wid]
            predictions[i][wid] = argmax

    return predictions, pred_score

def inference(model, predictions, pred_score):
    """Turn per-word class predictions into thresholded span predictions.

    Consecutive words with the same class form a span. A span is kept only
    if it has at least `min_thresh[class]` words and its mean score exceeds
    `proba_thresh[class]`.

    Args:
        model: Only switched to eval mode; kept for interface compatibility.
        predictions: Mapping document key -> {word id -> class id}.
        pred_score: Mapping document key -> {word id -> score}, using the
            same keys as `predictions`.

    Returns:
        A dict of three parallel lists: 'id' (document key), 'class'
        (discourse type name) and 'predictionstring' (space-joined word ids).
    """
    model.eval()
    result = {'id': [], 'class': [], 'predictionstring': []}

    def _flush_span(doc_key, span, cls_id):
        # Append `span` to the result if it is a real class and passes both thresholds.
        if not span or cls_id is None or cls_id == 0:
            return
        lg = label_group[label_to_lg_map[cls_id]]
        if len(span) < min_thresh[lg]:
            return
        # Scores are keyed by the document key itself, not by loop position.
        doc_scores = pred_score[doc_key]
        if np.mean([doc_scores[w] for w in span]) > proba_thresh[lg]:
            result['id'].append(doc_key)
            result['class'].append(lg)
            result['predictionstring'].append(' '.join(map(str, span)))

    for i in predictions:
        ps = []
        last_cls_id = None

        for wid, cls_id in predictions[i].items():
            # Fold each 'B-' tag (odd id) onto its 'I-' tag (next even id) so
            # that both count as the same class.
            cls_id = cls_id if cls_id % 2 == 0 else cls_id + 1

            if last_cls_id is None or cls_id != last_cls_id:
                # Class changed: close the running span and start a new one.
                _flush_span(i, ps, last_cls_id)
                ps = []
                if cls_id == 0:
                    last_cls_id = None
                    continue

            ps.append(wid)
            last_cls_id = cls_id

        # Close the span that is still open at the end of the document. Its
        # class is resolved from this span, never from an earlier one.
        _flush_span(i, ps, last_cls_id)

    return result

# Validation function

In [None]:
def validation(model, validation_ids, validation_dataloader, df_all):
    """Score the model on the validation fold.

    Args:
        model: Model passed on to `get_predictions` and `inference`.
        validation_ids: Sequence of document ids; predicted document indexes
            are translated to ids by position in this sequence.
        validation_dataloader: Loader yielding the validation chunks.
        df_all: Ground-truth frame with `id`, `discourse_type` and
            `predictionstring` columns.

    Returns:
        Dict mapping every discourse type present in the validation ground
        truth to its F1 score. Classes that were never predicted score 0
        instead of being left out, so the mean over the values is not
        inflated by missing classes.
    """
    model.eval()

    predictions, pred_score = get_predictions(model, validation_dataloader)

    oof = pd.DataFrame(inference(model, predictions, pred_score))
    # Translate positional document indexes back to the real document ids.
    oof['id'] = [validation_ids[_id] for _id in oof.id.tolist()]

    gt_df = df_all[df_all.id.isin(validation_ids)].copy()

    # One pass over all ground-truth classes; the scorer filters per class.
    _, f1scores = score_feedback_comp(oof, gt_df, return_class_scores=True)

    return f1scores

# Eval Functions

In [None]:
def calc_overlap3(set_pred, set_gt):
    """Check whether a prediction and a ground truth overlap enough to match.

    Args:
        set_pred: Set of predicted word ids (or NaN for a missing row).
        set_gt: Set of ground-truth word ids (or NaN for a missing row).

    Returns:
        True if the intersection covers at least half of the ground truth
        AND at least half of the prediction; False otherwise, including when
        either input is not a set or is empty.
    """
    try:
        len_gt = len(set_gt)
        len_pred = len(set_pred)
        inter = len(set_gt & set_pred)
        overlap_1 = inter / len_gt
        overlap_2 = inter / len_pred
    except (TypeError, ZeroDivisionError):
        # TypeError: one side is NaN (no len / no set intersection).
        # ZeroDivisionError: one side is an empty set.
        return False
    return overlap_1 >= 0.5 and overlap_2 >= 0.5

def score_feedback_comp_micro3(pred_df, gt_df, discourse_type):
    """Compute the F1 score of one discourse type.

    Follows the steps on the competition's evaluation page:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Args:
        pred_df: Predictions with `id`, `class` and `predictionstring` columns.
        gt_df: Ground truth with `id`, `discourse_type` and
            `predictionstring` columns.
        discourse_type: The class to score; both frames are filtered to it.

    Returns:
        `2*TP / (#predictions + #ground truths)` for the class, or 0.0 when
        there are neither predictions nor ground truths.
    """
    gt_df = gt_df.loc[gt_df['discourse_type'] == discourse_type,
                      ['id', 'predictionstring']].reset_index(drop=True)
    pred_df = pred_df.loc[pred_df['class'] == discourse_type,
                          ['id', 'predictionstring']].reset_index(drop=True)
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    pred_df['predictionstring'] = [set(pred.split(' ')) for pred in pred_df['predictionstring']]
    gt_df['predictionstring'] = [set(pred.split(' ')) for pred in gt_df['predictionstring']]

    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on='id',
                           right_on='id',
                           how='outer',
                           suffixes=('_pred', '_gt')
                          )
    overlaps = [calc_overlap3(*args) for args in zip(joined.predictionstring_pred,
                                                     joined.predictionstring_gt)]

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    # We don't need to resolve the exact match to compute the score: counting
    # the distinct ground truths that have any match is enough.
    TP = joined.loc[overlaps]['gt_id'].nunique()

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    TPandFP = len(pred_df)
    TPandFN = len(gt_df)

    # Nothing predicted and nothing to find: avoid dividing by zero.
    denominator = TPandFP + TPandFN
    if denominator == 0:
        return 0.0

    # calc micro f1
    my_f1_score = 2*TP / denominator
    return my_f1_score

def score_feedback_comp(pred_df, gt_df, return_class_scores=False):
    """Average the per-class F1 over every discourse type in the ground truth.

    Args:
        pred_df: Predictions, forwarded to `score_feedback_comp_micro3`.
        gt_df: Ground truth; its `discourse_type` values define the classes.
        return_class_scores: Also return the per-class score dict.

    Returns:
        The mean F1, or `(mean F1, {discourse_type: F1})` when
        `return_class_scores` is true.
    """
    class_scores = {
        discourse_type: score_feedback_comp_micro3(pred_df, gt_df, discourse_type)
        for discourse_type in gt_df.discourse_type.unique()
    }
    f1 = np.mean(list(class_scores.values()))
    return (f1, class_scores) if return_class_scores else f1

# Main

## init

In [None]:
# Column dtypes for the fold-assignment CSV; only these columns are loaded.
dtype = {
    'id' : str,
    'discourse_type' : str,
    'predictionstring' : str,
    'kfold' : int
}


tokenizer = AutoTokenizer.from_pretrained(settings.MODEL_NAME, add_prefix_space=True)
seed_everything(74)


df_all = pd.read_csv('train_folds.csv', dtype=dtype, usecols=list(dtype))
# NOTE: unpickling executes arbitrary code; only load a pickle you produced yourself.
preprocessed_df = pd.read_pickle('./id_splittext_labels_df.pkl')

# get ids --> filter df --> tokenize --> build dataset --> init dataloader
train_ids = sorted(df_all[~(df_all.kfold==settings.FOLD)].id.unique().tolist())
train_df = preprocessed_df[preprocessed_df.id.isin(train_ids)].sort_values(by='id')
# Tokenize the sorted frame built above (it used to be computed and then ignored).
tokenized_train = tokenize(train_df, tokenizer)
train_dataset = FeedbackPrizeDataset(tokenized_train)
# Shuffle so each epoch sees the chunks in a different order instead of
# always sorted by document id; the order is reproducible via seed_everything.
train_dataloader = DataLoader(train_dataset, batch_size=settings.BATCH, shuffle=True, pin_memory=True)

# get ids --> filter df --> tokenize --> build dataset --> init dataloader
val_ids = sorted(df_all[df_all.kfold==settings.FOLD].id.unique().tolist())
# Must stay sorted by id and unshuffled: validation maps the i-th document
# back to val_ids[i].
val_df = preprocessed_df[preprocessed_df.id.isin(val_ids)].sort_values(by='id')
tokenized_val = tokenize(val_df, tokenizer)
val_dataset = FeedbackPrizeDataset(tokenized_val)
val_dataloader = DataLoader(val_dataset, batch_size=settings.BATCH, pin_memory=True)

model = FeedbackModel(len(settings.TARGET_ID_MAP))
model.to(settings.DEVICE)
loss_func = torch.nn.CrossEntropyLoss()
# The learning rate is overwritten at the start of every epoch from settings.LR.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=settings.LR[0])

## run

In [None]:
# One pass per epoch: set that epoch's learning rate, train, then validate.
for epoch in range(settings.EPOCH):
    # settings.LR holds one learning rate per epoch.
    for param_group in optimizer.param_groups:
        param_group['lr'] = settings.LR[epoch]

    train(model, train_dataloader, optimizer, loss_func)
    print('\n--------END OF EPOCH-------\n')

    # Per-class F1 on the held-out fold, followed by the mean over classes.
    v = validation(model, val_ids, val_dataloader, df_all)
    print(v)
    print('mean', np.mean(list(v.values())))

    # Short pause between epochs.
    time.sleep(2)

In [None]:
# Persist the trained weights, one file per fold.
weights_file = f'fbp_model_{settings.FOLD}.pt'
torch.save(model.state_dict(), weights_file)