In [1]:
!pip install transformers[torch] --quiet
!pip install datasets --quiet
!pip install accelerate -U --quiet
!pip install evaluate --quiet

In [2]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/drive/MyDrive/kaggle-nlp-disaster/* .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from datasets import load_dataset

# Load the labelled CSV as a single Dataset, then hold out 10% of it for validation.
train_dataset = load_dataset('csv', data_files='train.csv', split='train')
# A fixed seed keeps the train/validation partition identical across kernel
# restarts, so metrics from different runs are comparable.
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-3b0fff111be24694/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3b0fff111be24694/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


In [4]:
from transformers import AutoTokenizer

# Raw CSV columns that are dropped once the examples have been tokenized.
remove_columns = ["id", "keyword", "location", "target"]

# Pretrained checkpoint name; the model is later loaded from this same name.
checkpoint = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    use_fast=False,  # explicitly request the non-fast tokenizer implementation
)

def preprocess_fn(example):
    """Tokenize a batch of tweets and attach their class labels.

    Args:
        example: Batch handed over by ``map(..., batched=True)``; its
            ``'text'`` and ``'target'`` columns are read.

    Returns:
        The tokenizer output for ``example['text']`` (truncation enabled)
        with an added ``'label'`` entry copied from ``example['target']``.
    """
    encoded = tokenizer(example['text'], truncation=True)
    # Return the label explicitly instead of mutating the input batch in
    # place, so the column does not depend on how `map` merges its inputs.
    encoded['label'] = example['target']
    return encoded

# Tokenize both splits in batches; the raw metadata columns listed in
# `remove_columns` are dropped from the result.
train_dataset = train_dataset.map(
    function=preprocess_fn,
    batched=True,
    remove_columns=remove_columns,
)

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification, Trainer, DataCollatorWithPadding, TrainingArguments
import evaluate
import numpy as np

# Metric object that compute_metrics calls during evaluation.
accuracy = evaluate.load('accuracy')

# Sequence-classification model loaded from the same checkpoint as the
# tokenizer, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# Padding collator built around the tokenizer; handed to the Trainer later.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predicted class of
            each row is taken as the argmax along axis 1 of ``predictions``.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted classes
        against ``labels``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(
        predictions=predicted_classes,
        references=references,
    )

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [6]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# Train on the 'train' split and evaluate on the held-out 'test' split
# produced by train_test_split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset['train'],
    eval_dataset=train_dataset['test'],
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.4777,0.377689,0.860892
2,0.3738,0.418957,0.856955
3,0.301,0.538774,0.843832
4,0.2371,0.608979,0.845144
5,0.1918,0.649611,0.84252


TrainOutput(global_step=4285, training_loss=0.3090856358495866, metrics={'train_runtime': 517.7728, 'train_samples_per_second': 66.158, 'train_steps_per_second': 8.276, 'total_flos': 754341965161020.0, 'train_loss': 0.3090856358495866, 'epoch': 5.0})

In [8]:
import torch
import pandas as pd
from transformers import pipeline

# Load the unlabeled test tweets. No `split` is requested, so the rows are
# reached through the 'train' key of the returned object.
test_dataset = load_dataset('csv', data_files='test.csv')
test_split = test_dataset['train']

classifier = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    # Use the first GPU when one is available; otherwise fall back to CPU (-1)
    # instead of failing on a CPU-only runtime.
    device=0 if torch.cuda.is_available() else -1,
)

# Truncate at inference the same way as during training, so an over-long
# tweet cannot exceed the model's maximum sequence length.
output = classifier(test_split['text'], truncation=True)

# Map the pipeline's label strings back to the integer targets written to the submission.
label2id = {
    'LABEL_0': 0,
    'LABEL_1': 1,
}

# One row per test tweet: its id and the predicted class.
pd.DataFrame({
    'id': test_split['id'],
    'target': [label2id[out['label']] for out in output],
}).to_csv('submission.csv', index=False)



  0%|          | 0/1 [00:00<?, ?it/s]