In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
LABEL_NAMES = ['EDU', 'GAR']

DATA_PATH = '/content/drive/MyDrive/domain-helper/'
LOG_PATH = '/content/drive/MyDrive/domain-helper/logs/'

%env TOKENIZERS_PARALLELISM=false

!pip install Sentencepiece
!pip install torch
!pip install transformers
!pip install nlp

from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, Trainer, TrainingArguments
import torch
from nlp import load_dataset, DatasetDict

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")
camembert = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=len(LABEL_NAMES))

dataset = load_dataset('csv', data_files=str(DATA_PATH) + '/combined_csv.csv', split='train' )

# 90% train, 10% test + validation
train_test_valid = dataset.train_test_split(test_size=0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_valid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

train_set = train_test_valid_dataset['train']
test_set = train_test_valid_dataset['test']
validation_set = train_test_valid_dataset['valid']

def preprocess(data):
    return tokenizer(data['text'], padding=True, truncation=True)

train_set = train_set.map(preprocess, batched=True,
                          batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))

train_set.set_format('torch',
                      columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch',
                     columns=['input_ids', 'attention_mask', 'label'])

batch_size = 8
epochs = 4

warmup_steps = 500
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir=f'{DATA_PATH}sort_results',
    label_names=LABEL_NAMES,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    logging_dir='./logs',
)

trainer = Trainer(
    model=camembert,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

trainer.train()
trainer.evaluate()

env: TOKENIZERS_PARALLELISM=false
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 16.2 MB/s 
[?25hInstalling collected packages: Sentencepiece
Successfully installed Sentencepiece-0.1.97
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 14.8 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 72.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1

Downloading sentencepiece.bpe.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias

Downloading:   0%|          | 0.00/2.75k [00:00<?, ?B/s]



Downloading and preparing dataset csv/default-c5ee9e4a485cff65 (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/csv/default-c5ee9e4a485cff65/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c5ee9e4a485cff65/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b. Subsequent calls will reuse this data.


  0%|          | 0/21 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 20424
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10212


Epoch,Training Loss,Validation Loss
1,0.0871,No log
2,0.0548,No log
3,0.0407,No log
4,0.0244,No log


Saving model checkpoint to /content/drive/MyDrive/domain-helper/sort_results/checkpoint-500
Configuration saved in /content/drive/MyDrive/domain-helper/sort_results/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/domain-helper/sort_results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/domain-helper/sort_results/checkpoint-1000
Configuration saved in /content/drive/MyDrive/domain-helper/sort_results/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/domain-helper/sort_results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/domain-helper/sort_results/checkpoint-1500
Configuration saved in /content/drive/MyDrive/domain-helper/sort_results/checkpoint-1500/config.json
Model weights saved in /content/drive/MyDrive/domain-helper/sort_results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/domain-helper/sort_results/checkpoint-2000
Configu

{'eval_runtime': 3.8419,
 'eval_samples_per_second': 295.426,
 'eval_steps_per_second': 36.961,
 'epoch': 4.0}

In [4]:
Y_test = validation_set["label"]
X_test_tokenized = tokenizer(validation_set["text"], padding=True, truncation=True)

!pip install sklearn
import numpy as np

from sklearn.metrics import precision_score

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])


validation_dataset = Dataset(X_test_tokenized)

# Make prediction
raw_pred, a, b = trainer.predict(validation_dataset)

# Preprocess raw predictions
Y_pred = np.argmax(raw_pred, axis=1)

print(precision_score(Y_test, Y_pred, average='micro'))

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=13428f5eab23f5dbd29fdab0cbfe3724640a3db4865f6be2dd67eaef2ac71a70
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


***** Running Prediction *****
  Num examples = 1135
  Batch size = 8


0.9850220264317181
