In [1]:
from datasets import load_dataset

# Directory (relative to the notebook) that holds train.csv / test.csv.
DATASET_PREFIX = '../input/feedback-prize-english-language-learning/'

# Fixed seed for the train/validation split. Without it the held-out rows
# change on every run, so validation scores are not comparable between runs.
SEED = 42

# split='train' makes load_dataset return a single Dataset instead of a DatasetDict.
train_dataset = load_dataset('csv', data_files=DATASET_PREFIX+'train.csv', split='train')
# Hold out 5% of the rows for validation. From here on `train_dataset` is
# indexed by split name: train_dataset['train'] / train_dataset['test'].
train_dataset = train_dataset.train_test_split(test_size=0.05, seed=SEED)

In [2]:
from transformers import AutoTokenizer

# Local copy of the pretrained checkpoint; reused later when the model is loaded.
checkpoint = '../input/transformers/distilbert-base-uncased'

# Score columns of train.csv, in the order the model outputs are interpreted.
label_names = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# Raw columns that are dropped once the text has been tokenized.
remove_columns = ['text_id', 'full_text']

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def preprocess_fn(example):
    """Tokenize one training row and attach its regression targets.

    Args:
        example: a single dataset row (mapping) containing 'full_text' and
            one entry per name in ``label_names``.

    Returns:
        A plain dict with the tokenizer outputs (padded/truncated to 512
        tokens) plus 'label': the target scores in ``label_names`` order.
    """
    encoded = tokenizer(example['full_text'], padding='max_length', max_length=512, truncation=True)
    # Return the targets explicitly instead of mutating `example` in place:
    # Dataset.map merges the *returned* mapping into the row, so the label
    # column no longer depends on whether in-place edits to the input survive.
    return {**encoded, 'label': [example[label] for label in label_names]}

# Tokenize every row of both splits, one example at a time, then drop the raw
# text/id columns and the individual score columns (now packed into 'label').
train_dataset = train_dataset.map(
    preprocess_fn,
    batched=False,
    remove_columns=remove_columns+label_names,
)

  0%|          | 0/3715 [00:00<?, ?ex/s]

  0%|          | 0/196 [00:00<?, ?ex/s]

In [3]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# Collates lists of tokenized rows into batched tensors using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training batches: 32 rows each, reshuffled on every pass.
train_dataloader = DataLoader(
    dataset=train_dataset['train'],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation: the whole held-out split is delivered as a single batch.
eval_dataloader = DataLoader(
    dataset=train_dataset['test'],
    batch_size=len(train_dataset['test']),
    collate_fn=data_collator,
)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
from transformers import AutoModelForSequenceClassification, AdamW
from sklearn.metrics import mean_squared_error

import numpy as np

# Pretrained encoder with a freshly initialised head: one output per entry in
# label_names (6), configured as a regression problem.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=6,
    problem_type='regression',
)

# transformers.AdamW is deprecated in favour of the PyTorch implementation, so
# bind AdamW to torch's optimizer here (this intentionally shadows the name
# imported from transformers at the top of the cell).
from torch.optim import AdamW

# eps=1e-6 matches the default of the deprecated transformers.AdamW so the
# optimisation hyper-parameters stay the same (torch's own default is 1e-8).
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-6)

def compute_metrics(eval_pred):
    """Compute the mean column-wise RMSE between predictions and labels.

    The RMSE is computed separately for each target column and the per-column
    values are averaged. This is the same quantity as
    ``mean_squared_error(labels, predictions, squared=False)``, written with
    NumPy because the ``squared`` argument is deprecated in recent
    scikit-learn releases.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of array-likes with
            matching shape, either ``(n_samples,)`` or ``(n_samples, n_targets)``.

    Returns:
        ``{'rmse': value}`` where ``value`` is a Python float.

    Raises:
        ValueError: if the two arrays differ in shape or contain no samples.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Treat 1-D input as a single target column so both layouts share one path.
    if predictions.ndim == 1:
        predictions = predictions.reshape(-1, 1)
    if labels.ndim == 1:
        labels = labels.reshape(-1, 1)

    # Fail loudly rather than let NumPy broadcast mismatched arrays.
    if predictions.shape != labels.shape:
        raise ValueError(
            f"predictions and labels must have the same shape, got {predictions.shape} and {labels.shape}"
        )
    if labels.size == 0:
        raise ValueError("cannot compute RMSE on empty input")

    # axis=0 averages over samples, leaving one RMSE per target column.
    per_target_rmse = np.sqrt(np.mean((labels - predictions) ** 2, axis=0))
    return {
        'rmse': float(np.mean(per_target_rmse))
    }

Some weights of the model checkpoint at ../input/transformers/distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at ../input/transformers/distilbert-base-uncased and are newly initialized: ['pre_classi

In [5]:
from transformers import get_scheduler

# Total number of optimizer steps = epochs x batches per epoch; the scheduler
# needs it up front to lay out the learning-rate schedule.
num_epochs = 5
num_steps_per_epoch = len(train_dataloader)
num_training_steps = num_steps_per_epoch * num_epochs

# Linear schedule with no warm-up phase.
lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [13]:
from tqdm.auto import tqdm
import torch

# Use the first GPU when present, otherwise fall back to CPU so the cell does
# not crash on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

# NOTE(review): `optimizer` and `lr_scheduler` are created in earlier cells and
# keep their state. Re-running only this cell continues from wherever the
# schedule stopped; re-run those cells first for a fresh training run.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()

    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        progress_bar.update(1)

    # ---- validation pass ----
    model.eval()
    epoch_predictions = []
    epoch_labels = []

    for batch in eval_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # Collect on CPU; the metric is computed once per epoch over the whole
        # validation split, whatever the eval batch size happens to be.
        epoch_predictions.append(outputs.logits.cpu())
        epoch_labels.append(batch["labels"].cpu())

    if epoch_predictions:
        print(
            f"epoch {epoch}:",
            compute_metrics((
                torch.cat(epoch_predictions).numpy(),
                torch.cat(epoch_labels).numpy(),
            ))
        )

progress_bar.close()

  0%|          | 0/585 [00:00<?, ?it/s]

epoch 0: {'rmse': {'rmse': 0.4640412}}
epoch 1: {'rmse': {'rmse': 0.48933372}}
epoch 2: {'rmse': {'rmse': 0.48933372}}
epoch 3: {'rmse': {'rmse': 0.48933372}}
epoch 4: {'rmse': {'rmse': 0.48933372}}


KeyboardInterrupt: 

In [16]:
def preprocess_test_fn(example):
    """Tokenize one test row.

    Reads ``example['full_text']`` and returns the tokenizer output, padded
    and truncated to exactly 512 tokens. No labels are attached.
    """
    essay = example['full_text']
    encoding = tokenizer(essay, truncation=True, max_length=512, padding='max_length')
    return encoding

# No `split=` argument here, so the rows are reached through the 'train' key below.
test_dataset = load_dataset('csv', data_files=f'{DATASET_PREFIX}test.csv')

# Save the ids before the column is dropped; they are needed for the submission file.
text_ids = test_dataset['train']['text_id']

# Tokenize row by row, drop the raw columns, and serve rows as torch tensors on the GPU.
test_dataset = test_dataset.map(
    preprocess_test_fn,
    batched=False,
    remove_columns=['text_id', 'full_text'],
).with_format('torch', device='cuda:0')

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?ex/s]

In [17]:
import torch
import pandas as pd

def label2dict(label):
    """Turn one vector of predicted scores into a ``{label name: score}`` dict.

    ``label`` is indexed positionally, one entry per name in ``label_names``.
    Each value is converted to float and rounded to the nearest multiple
    of 0.5.
    """
    def to_half_step(value):
        # Double, round to an integer, halve: snaps the value onto a 0.5 grid.
        return round(float(value) * 2.0) / 2.0

    return {name: to_half_step(label[idx]) for idx, name in enumerate(label_names)}

# Inference: score each test essay and write the submission file.
model.eval()
outputs = []
with torch.no_grad():
    for text_id, features in zip(text_ids, test_dataset['train']):
        # Each row was tokenized on its own, so its tensors have no batch
        # axis. Add one explicitly rather than relying on the model to cope
        # with unbatched input.
        batch = {key: value.unsqueeze(0) for key, value in features.items()}
        logits = model(**batch)['logits']
        # Drop the batch axis again so label2dict sees one score per label.
        outputs.append({'text_id': text_id, **label2dict(logits.squeeze(0))})

# One row per essay: text_id followed by one column per label name.
pd.DataFrame(outputs).to_csv('submission.csv', index=False)