In [13]:
import glob
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModel, AutoTokenizer, logging

### Settings used throughout the notebook

In [8]:
# BIO-style tag vocabulary: 'O' plus a B-/I- pair for each discourse type.
# The position of a tag in this list is its integer class id.
output_labels = [
    'O',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Claim', 'I-Claim',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
    'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement',
]


class settings:
    """Notebook-wide configuration, used as a plain namespace (never instantiated)."""

    # --- data / loader ---
    DATA_PATH = '../input/feedback-prize-2021/'
    WORKERS = os.cpu_count()

    # --- tokenisation ---
    MAX_TOK_LEN = 512
    STRIDE = 256

    # --- optimisation ---
    BATCH = 4
    # Five learning rates alongside EPOCH = 5 — presumably one per epoch; TODO confirm.
    LR = [2.5e-05, 2.5e-05, 2.5e-06, 2.5e-07, 2.5e-07]
    GRAD_NORM = 10
    EPOCH = 5
    # NOTE(review): unclear from this cell whether this is a fold index or a fold count.
    FOLD = 3

    # --- label <-> id lookups derived from `output_labels` ---
    TARGET_ID_MAP = dict(zip(output_labels, range(len(output_labels))))
    ID_TARGET_MAP = dict(enumerate(output_labels))

    # Use the GPU when torch can see one.
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

    # --- model artefacts ---
    MODEL_BASE_PATH = '../input/model-bin-fbp/'
    MODEL_NAME = 'roberta-base'
    MODEL_PATH = 'model'
    # One weights file name per fold index 0..4.
    WEIGHTS_PATH = [f'fbp_model_{fold}.pt' for fold in range(5)]

### Download pretrained models from huggingface

- Download pretrained models from huggingface. The competition doesn't allow notebooks with internet access to be submitted. You have to download the model first and upload it to Kaggle.

In [None]:
# Download pretrained model
#
# Fetches the tokenizer, config and backbone weights from the Hugging Face hub
# and saves them locally so they can be uploaded to Kaggle for offline use.
# Needs internet access.
#
# The options live in their own class so this cell does not overwrite the
# notebook-wide `settings` class (which the preprocessing cells rely on for
# DATA_PATH and TARGET_ID_MAP).
class download_settings:
    MODEL_PATH = './'
    MODEL_NAME = 'roberta-base'

# exist_ok=True: the target is the current directory by default, which already
# exists; a plain os.mkdir would raise FileExistsError.
os.makedirs(download_settings.MODEL_PATH, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(download_settings.MODEL_NAME, add_prefix_space=True)
tokenizer.save_pretrained(download_settings.MODEL_PATH)

config_model = AutoConfig.from_pretrained(download_settings.MODEL_NAME)
config_model.save_pretrained(download_settings.MODEL_PATH)

backbone = AutoModel.from_pretrained(download_settings.MODEL_NAME)
backbone.save_pretrained(download_settings.MODEL_PATH)

### Preprocessing the input data

- The tagged data doesn't contain all of the words in the corresponding file.
- I believe this information might be useful, so I preprocess the data to create a new labeled dataset that contains all of the words and tags per file in a single Pandas row.

In [5]:
def read_file(id: str, mode: str, split: bool=True):
    """Load the text of one essay.

    Args:
        id: File stem; the file read is ``<settings.DATA_PATH>/<mode>/<id>.txt``.
        mode: Sub-folder of ``settings.DATA_PATH`` that holds the file.
        split: If true, return the whitespace-separated tokens (``str.split()``);
            otherwise return the raw text.

    Returns:
        A list of tokens when ``split`` is true, else the file content as one string.
    """
    path = os.path.join(settings.DATA_PATH, mode, f'{id}.txt')

    with open(path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    return text.split() if split else text

In [6]:
# Essay text files and their annotations.
# glob returns paths in arbitrary order; sorting keeps the row order of the
# resulting dataset reproducible between runs.
files = sorted(glob.glob('../input/feedback-prize-2021/train/*'))
df = pd.read_csv('../input/feedback-prize-2021/train.csv')

# Essay id = file name without directory and extension. os.path handles the
# platform's separator, so this works on both POSIX and Windows (splitting on
# a literal backslash only worked on Windows).
ids = [os.path.splitext(os.path.basename(f))[0] for f in files]

In [9]:
# Build one record per essay: its whitespace tokens and one integer tag per token.
results = {
    'id': [],
    'split_text': [],
    'labels': []
}

# Index the annotation rows by essay once, instead of filtering the whole
# frame with a boolean mask for every essay. Row order within each group is
# the original frame order, so later annotations still overwrite earlier ones.
annotations_by_id = {
    essay_id: group[['predictionstring', 'discourse_type']].values
    for essay_id, group in df.groupby('id', sort=False)
}

# Tag id for tokens not covered by any annotation.
outside_id = settings.TARGET_ID_MAP['O']

for essay_id in ids:
    split_text = read_file(essay_id, 'train')

    # word index -> tag id; the first word of each annotated span gets the
    # B- tag of its discourse type, the remaining words get the I- tag.
    word_tags = {}
    for predictionstring, discourse_type in annotations_by_id.get(essay_id, ()):
        for position, word_idx in enumerate(map(int, predictionstring.split())):
            prefix = 'B' if position == 0 else 'I'
            word_tags[word_idx] = settings.TARGET_ID_MAP[f'{prefix}-{discourse_type}']

    # Annotated indices beyond the end of the token list are ignored.
    labels = [word_tags.get(i, outside_id) for i in range(len(split_text))]

    results['id'].append(essay_id)
    results['split_text'].append(split_text)
    results['labels'].append(labels)

In [66]:
# Persist the preprocessed table (columns: id, split_text, labels) as a pickle
# file in the current working directory.
pd.DataFrame(results).to_pickle('id_splittext_labels_df.pkl')

In [10]:
# Show the assembled table: one row per essay with its tokens and per-token tag ids.
pd.DataFrame(results)

Unnamed: 0,id,split_text,labels
0,0000D23A521A,"[Some, people, belive, that, the, so, called, ...","[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ..."
1,00066EA9880D,"[Driverless, cars, are, exaclty, what, you, wo...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,000E6DE9E817,"[Dear:, Principal, I, am, arguing, against, th...","[0, 0, 3, 4, 4, 4, 4, 4, 4, 0, 7, 8, 8, 8, 8, ..."
3,001552828BD0,"[Would, you, be, able, to, give, your, car, up...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,0016926B079C,"[I, think, that, students, would, benefit, fro...","[3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 6, 6, 6, 6, ..."
...,...,...,...
15589,FFF1442D6698,"[Every, student, looks, forward, to, summer, b...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
15590,FFF1ED4F8544,"[Many, citizens, argue, that, the, Electoral, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
15591,FFF868E06176,"[Every, summer, break,, students, are, given, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
15592,FFFD0AF13501,"[In, the, article, ""A, Cowboy, Who, Rode, the,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


### Folding the original data

- One of the users on Kaggle showed that creating good folds can improve the model's performance. This is that code, copied verbatim.

In [None]:
# Assign every essay to one of 5 folds, stratified on how many discourse
# elements of each type the essay contains, and write the annotation table
# with the extra `kfold` column to train_folds.csv.
df = pd.read_csv("../input/feedback-prize-2021/train.csv")

# One row per essay with a count column per discourse type. Only `id` and
# `discourse_type` are passed on, so the aggregation never has to sum the
# text columns (whose handling by groupby().sum() depends on the pandas version).
dfx = (
    pd.get_dummies(df[["id", "discourse_type"]], columns=["discourse_type"])
    .groupby(["id"], as_index=False)
    .sum()
)
# Parenthesised so the exclusion of "discourse_type_num" actually applies;
# without the parentheses `and` bound tighter than `or` and the test was dead.
cols = [
    c for c in dfx.columns
    if (c.startswith("discourse_type_") or c == "id") and c != "discourse_type_num"
]
dfx = dfx[cols]

mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)
labels = [c for c in dfx.columns if c != "id"]
dfx_labels = dfx[labels]
dfx["kfold"] = -1  # placeholder; every row is overwritten in the loop below

# `val_` holds positional row indices; dfx has a default RangeIndex after
# groupby(as_index=False), so label-based .loc addresses the same rows.
for fold, (trn_, val_) in enumerate(mskf.split(dfx, dfx_labels)):
    print(len(trn_), len(val_))
    dfx.loc[val_, "kfold"] = fold

# Propagate the essay-level fold back to every annotation row.
df = df.merge(dfx[["id", "kfold"]], on="id", how="left")
print(df.kfold.value_counts())
df.to_csv("train_folds.csv", index=False)

12477 3117
12474 3120
12475 3119
12475 3119
12475 3119
0    28997
2    28968
3    28904
1    28737
4    28687
Name: kfold, dtype: int64
