In [1]:
!pip install transformers[torch] --quiet
!pip install datasets --quiet
!pip install accelerate -U --quiet
!pip install evaluate --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m101.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Mount Google Drive at /content/drive so its files are visible to this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Copy every file from the Drive folder into the current working directory.
# Presumably this is where train.csv / test.csv (read by later cells via
# relative paths) come from -- confirm the folder contents.
!cp /content/drive/MyDrive/kaggle-nlp-ell/* .

Mounted at /content/drive


In [13]:
from datasets import load_dataset

# Fixed seed so the train/validation partition is identical on every run;
# without it each execution scores the model on a different held-out set.
SPLIT_SEED = 42

# Read the labelled CSV as a single Dataset, then hold out 10% of the rows.
# After this cell `train_dataset` is a DatasetDict with 'train' and 'test' splits.
train_dataset = load_dataset('csv', data_files='train.csv', split='train')
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)



In [15]:
from transformers import AutoTokenizer

# Pretrained checkpoint name; the tokenizer is loaded from it here and the
# model cell reuses the same variable.
checkpoint = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Columns dropped from the dataset when the preprocessing function is mapped.
remove_columns = ['text_id']

# Target columns, in the order they are packed into each example's label vector.
label_names = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

def preprocess_fn(example):
    """Tokenize one essay and attach its regression targets.

    Args:
        example: A single dataset row (the function is mapped with
            ``batched=False``) containing ``'full_text'`` and one column per
            entry in ``label_names``.

    Returns:
        dict: The tokenizer outputs for ``example['full_text']`` (truncation
        enabled) plus a ``'label'`` entry holding the target values as floats,
        in ``label_names`` order.
    """
    features = dict(tokenizer(example['full_text'], truncation=True))
    # Return the targets as part of the result instead of mutating `example`
    # in place, so the new column does not depend on input-side mutations
    # being carried through by `map`. The float cast keeps the labels
    # floating-point for the regression loss even if a column parses as int.
    features['label'] = [float(example[name]) for name in label_names]
    return features

# Run preprocess_fn on every row (one example at a time) and drop the columns
# listed in remove_columns.
# NOTE(review): this rebinds train_dataset, so re-running this cell without
# re-running the loading cell presumably fails once 'text_id' is gone -- confirm.
train_dataset = train_dataset.map(preprocess_fn, batched=False, remove_columns=remove_columns)

Map:   0%|          | 0/3519 [00:00<?, ? examples/s]

Map:   0%|          | 0/392 [00:00<?, ? examples/s]

In [72]:
from transformers import AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding, TrainingArguments
import evaluate
import numpy as np

# Sequence-classification head configured for regression. The number of
# outputs is derived from label_names so it cannot drift from the label vector
# built during preprocessing.
model = AutoModelForSequenceClassification.from_pretrained(
  checkpoint,
  num_labels=len(label_names),
  problem_type='regression',
)
# Collator that pads the tokenized examples when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Mean-squared-error metric object from the `evaluate` library.
mse_metric = evaluate.load('mse')

def compute_metrics(eval_pred):
    """Compute the root-mean-squared error over all predicted values.

    The value is the square root of the mean squared difference between the
    flattened predictions and the flattened labels, so it is reported under
    the key ``'rmse'`` (it was previously logged under the misleading name
    ``'mse'``). It is computed with numpy directly rather than via the
    ``squared=False`` option of the ``evaluate`` MSE metric, which is
    forwarded to scikit-learn and is not accepted by recent releases.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of array-likes with the
            same number of elements.

    Returns:
        dict: ``{'rmse': float}``.
    """
    predictions, labels = eval_pred
    flat_predictions = np.asarray(predictions, dtype=np.float64).reshape(-1)
    flat_labels = np.asarray(labels, dtype=np.float64).reshape(-1)
    squared_errors = np.square(flat_predictions - flat_labels)
    return {'rmse': float(np.sqrt(np.mean(squared_errors)))}

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier.weight', 'pre_classifier.

In [73]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Wire the model, data splits, collator and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset['train'],
    eval_dataset=train_dataset['test'],
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned training summary is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Mse
1,No log,0.268716,0.518378
2,0.558400,0.323968,0.569181
3,0.251100,0.25968,0.509588
4,0.191300,0.276283,0.525626
5,0.155300,0.268728,0.51839


TrainOutput(global_step=2200, training_loss=0.27596072977239433, metrics={'train_runtime': 886.0491, 'train_samples_per_second': 19.858, 'train_steps_per_second': 2.483, 'total_flos': 2329332140831616.0, 'train_loss': 0.27596072977239433, 'epoch': 5.0})

In [68]:
def preprocess_test_fn(example):
    """Tokenize one unlabelled essay with fixed-length padding.

    Args:
        example: A single dataset row containing ``'full_text'``.

    Returns:
        The tokenizer output for the text, truncated and padded with
        ``padding='max_length'``.
    """
    encoded = tokenizer(
        example['full_text'],
        truncation=True,
        padding='max_length',
    )
    return encoded

# No `split` argument is given, so the rows are accessed through the 'train'
# key below.
test_dataset = load_dataset('csv', data_files='test.csv')

# Capture the ids first: the 'text_id' column is removed by the map call.
text_ids = test_dataset['train']['text_id']

# Tokenize row by row, keep only the tokenizer outputs, and have rows returned
# as torch tensors placed on the first CUDA device.
test_dataset = test_dataset.map(
    preprocess_test_fn,
    batched=False,
    remove_columns=['text_id', 'full_text'],
).with_format('torch', device='cuda:0')



  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [70]:
import torch
import pandas as pd

def label2dict(label):
  """Convert a vector of predicted scores into a ``{label_name: score}`` dict.

  Each entry of ``label`` is read by position (in ``label_names`` order),
  converted to ``float`` and snapped to a multiple of 0.5 by doubling,
  rounding with the built-in ``round`` and halving.
  """
  return {
    name: round(float(label[idx]) * 2.0) / 2.0
    for idx, name in enumerate(label_names)
  }

# Inference: score each test essay and write the predictions to submission.csv.
model.eval()
outputs = []
with torch.no_grad():
    # `example` (not `input`) so the Python builtin is not shadowed.
    for text_id, example in zip(text_ids, test_dataset['train']):
        # Each formatted row holds 1-D tensors. Add the leading batch
        # dimension the model expects and move the tensors to the model's
        # device, rather than relying on the dataset's formatting device
        # happening to match.
        batch = {
            key: value.unsqueeze(0).to(model.device)
            for key, value in example.items()
        }
        # Logits have shape (1, num_labels); take the single row.
        logits = model(**batch)['logits'][0]
        outputs.append({'text_id': text_id, **label2dict(logits)})

pd.DataFrame(outputs).to_csv('submission.csv', index=False)