In [None]:
# Mount Google Drive at /content/drive; the DATA_PATH and LOG_PATH folders used
# by the following cells are located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Names of the 17 target classes (school subjects).
# NOTE(review): presumably the integer in the CSV `label` column is an index
# into this list -- confirm against the script that produced the CSV.
LABEL_NAMES = [
    'Arts plastiques',
    'Biotech-ST2S',
    'Documentation',
    'EMC',
    'EPS',
    'Français',
    'Grec ancien',
    'Géographie',
    'Histoire',
    'Langues vivantes',
    'Latin',
    'Mathématiques',
    'Physique - Chimie',
    'SES',
    'SVT',
    'Technologie',
    'Économie et gestion',
]

# Project folder on the mounted Google Drive, and its log sub-folder.
DATA_PATH = '/content/drive/MyDrive/domain-helper/'
LOG_PATH = DATA_PATH + 'logs/'

%env TOKENIZERS_PARALLELISM=false

!pip install Sentencepiece
!pip install torch
!pip install transformers
!pip install nlp

from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, Trainer, TrainingArguments
import torch
from nlp import load_dataset

# Pretrained CamemBERT tokenizer, and a sequence-classification model whose
# head has one output per entry of LABEL_NAMES.
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")
camembert = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=len(LABEL_NAMES))

# Load the labeled CSV as a single split. Trailing slashes are stripped from
# DATA_PATH first so the joined path does not contain a doubled '/'.
dataset = load_dataset('csv', data_files=str(DATA_PATH).rstrip('/') + '/edubases_labeled_data.csv', split='train')

# Hold out 30% of the rows for evaluation. The seed is fixed so that the same
# rows end up in each split on every run; without it the reported metrics are
# not comparable between runs.
SPLIT_SEED = 42
dataset = dataset.train_test_split(test_size=0.3, seed=SPLIT_SEED)

train_set = dataset['train']
test_set = dataset['test']

def preprocess(data):
    """Tokenize the `text` field of a batch with the notebook-level tokenizer.

    Args:
        data: Batch mapping that provides the raw strings under the key 'text'.

    Returns:
        The tokenizer's output for those strings, with padding and truncation
        both enabled.
    """
    texts = data['text']
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

# Tokenize each split as one single batch (batch_size equals the split size),
# so that `padding=True` in `preprocess` is applied across the whole split.
train_set = train_set.map(preprocess, batch_size=len(train_set), batched=True)
test_set = test_set.map(preprocess, batch_size=len(test_set), batched=True)

# Expose only the model inputs and the label column, formatted as torch tensors.
for _split in (train_set, test_set):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

# Training hyperparameters.
batch_size = 8
epochs = 8

warmup_steps = 500
weight_decay = 0.01

training_args = TrainingArguments(
    # Checkpoints are written under the project folder on Drive. Trailing
    # slashes are stripped from DATA_PATH to avoid a doubled '/' in the path.
    output_dir=DATA_PATH.rstrip('/') + '/results',
    # `label_names` is intentionally NOT set. It is the list of keys in the
    # model-input dict that hold the labels (default: ["labels"]), not the
    # class names. Passing LABEL_NAMES here made the Trainer believe the
    # evaluation batches had no labels, so no validation loss was computed
    # ("No log") and evaluate() returned no `eval_loss`.
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    # Write logs to the Drive log folder defined next to DATA_PATH instead of
    # a directory relative to the current working directory.
    logging_dir=LOG_PATH,
)

trainer = Trainer(
    model=camembert,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
)

trainer.train()
# Last expression of the cell: the evaluation metrics are displayed as output.
trainer.evaluate()

env: TOKENIZERS_PARALLELISM=false


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 

Downloading:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset csv/default-b704a75bf62949e8 (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/csv/default-b704a75bf62949e8/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b...


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-b704a75bf62949e8/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b. Subsequent calls will reuse this data.


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 9801
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 9808


Epoch,Training Loss,Validation Loss
1,0.8658,No log
2,0.4166,No log
3,0.2504,No log
4,0.1413,No log
5,0.0763,No log
6,0.0388,No log
7,0.0151,No log
8,0.0072,No log


Saving model checkpoint to /content/drive/MyDrive/domain-helper//results/checkpoint-500
Configuration saved in /content/drive/MyDrive/domain-helper//results/checkpoint-500/config.json
Model weights saved in /content/drive/MyDrive/domain-helper//results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/domain-helper//results/checkpoint-1000
Configuration saved in /content/drive/MyDrive/domain-helper//results/checkpoint-1000/config.json
Model weights saved in /content/drive/MyDrive/domain-helper//results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4201
  Batch size = 8
Saving model checkpoint to /content/drive/MyDrive/domain-helper//results/checkpoint-1500
Configuration saved in /content/drive/MyDrive/domain-helper//results/checkpoint-1500/config.json
Model weights saved in /content/drive/MyDrive/domain-helper//results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/domain-helper//r

{'epoch': 8.0,
 'eval_runtime': 40.7322,
 'eval_samples_per_second': 103.137,
 'eval_steps_per_second': 12.914}

In [None]:
# Re-run evaluation on the trainer's eval dataset. The result is stored under a
# name that does not shadow the builtin `eval`, and shown as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

***** Running Evaluation *****
  Num examples = 4201
  Batch size = 8


In [None]:
# Debug check: prints the default object representation of `trainer`, which
# only confirms that the variable is still defined in the kernel.
print(trainer)

<transformers.trainer.Trainer object at 0x7fe885faad90>


In [None]:
!pip install Sentencepiece
!pip install torch
!pip install transformers
!pip install nlp

import pandas as pd
import numpy as np
import transformers
import nlp
from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, Trainer, TrainingArguments
import torch
from nlp import load_dataset

# Project folder on the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/domain-helper/'

# Names of the 17 target classes; only the length is used in this cell, to size
# the classification head when the model is loaded.
LABEL_NAMES = [
    'Arts plastiques',
    'Biotech-ST2S',
    'Documentation',
    'EMC',
    'EPS',
    'Français',
    'Grec ancien',
    'Géographie',
    'Histoire',
    'Langues vivantes',
    'Latin',
    'Mathématiques',
    'Physique - Chimie',
    'SES',
    'SVT',
    'Technologie',
    'Économie et gestion',
]

# Pretrained CamemBERT tokenizer used to encode the evaluation texts.
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")


# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence. It must contain the key "input_ids", which defines the
            dataset length.
        labels: Optional per-example labels. Leave as None (or pass an empty
            sequence) for unlabeled data.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, plus "labels" if labels were given."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:`. Truthiness
        # raises ValueError for a multi-element NumPy array or tensor, while an
        # empty sequence is still treated as "no labels", as before.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the "input_ids" feature."""
        return len(self.encodings["input_ids"])


# Read the labeled evaluation CSV; its `text` and `label` columns are used below.
test_data = pd.read_csv(f'{DATA_PATH}/gar_labeled_data2.csv')
X_test = list(test_data["text"])
Y_test = list(test_data["label"])

# Encode all texts in one call, padded and truncated to at most 512 tokens.
X_test_tokenized = tokenizer(X_test, max_length=512, truncation=True, padding=True)

# Wrap encodings and labels in the torch dataset defined above in this cell.
test_dataset = Dataset(X_test_tokenized, Y_test)

# Load the fine-tuned model from the checkpoint directory on Drive. Training
# wrote its checkpoints under DATA_PATH/results, so the previous relative path
# "./results/checkpoint-9500" was not found locally and was then looked up as a
# Hub model id (404 / OSError).
model_path = DATA_PATH.rstrip('/') + '/results/checkpoint-9500'
model = CamembertForSequenceClassification.from_pretrained(model_path, num_labels=len(LABEL_NAMES))

# Trainer with default arguments, used here only to run batched prediction.
test_trainer = Trainer(model)

# predict() returns the raw model outputs, the label ids and a metrics dict.
raw_pred, label_ids, metrics = test_trainer.predict(test_dataset)
# Short aliases kept because the following cell still prints `a` and `b`.
a, b = label_ids, metrics

# Predicted class id per example = index of the largest score in each row.
y_pred = np.argmax(raw_pred, axis=1)

print(y_pred)
print(label_ids, metrics)



404 Client Error: Not Found for url: https://huggingface.co/./results/checkpoint-9500/resolve/main/config.json


OSError: ignored

In [None]:
# Print the two extra values returned by predict(), then one
# (row index, predicted class id) tuple per line for every prediction.
print(a, b)
for row_idx, class_id in enumerate(y_pred):
    print((row_idx, class_id))

[12 11  8 ... 14  8  3] {'test_loss': 1.8375413417816162, 'test_runtime': 24.9237, 'test_samples_per_second': 98.782, 'test_steps_per_second': 12.358}
(0, 12)
(1, 9)
(2, 3)
(3, 5)
(4, 11)
(5, 14)
(6, 8)
(7, 5)
(8, 12)
(9, 12)
(10, 8)
(11, 5)
(12, 5)
(13, 3)
(14, 5)
(15, 13)
(16, 5)
(17, 12)
(18, 15)
(19, 5)
(20, 3)
(21, 14)
(22, 3)
(23, 13)
(24, 5)
(25, 5)
(26, 5)
(27, 13)
(28, 9)
(29, 8)
(30, 14)
(31, 8)
(32, 8)
(33, 8)
(34, 12)
(35, 5)
(36, 5)
(37, 5)
(38, 6)
(39, 12)
(40, 11)
(41, 14)
(42, 8)
(43, 1)
(44, 13)
(45, 12)
(46, 14)
(47, 5)
(48, 12)
(49, 13)
(50, 11)
(51, 14)
(52, 11)
(53, 12)
(54, 12)
(55, 11)
(56, 14)
(57, 14)
(58, 11)
(59, 5)
(60, 5)
(61, 13)
(62, 8)
(63, 14)
(64, 14)
(65, 6)
(66, 13)
(67, 5)
(68, 11)
(69, 12)
(70, 11)
(71, 8)
(72, 11)
(73, 11)
(74, 9)
(75, 11)
(76, 9)
(77, 12)
(78, 8)
(79, 9)
(80, 12)
(81, 5)
(82, 9)
(83, 5)
(84, 14)
(85, 12)
(86, 12)
(87, 5)
(88, 8)
(89, 12)
(90, 5)
(91, 11)
(92, 2)
(93, 5)
(94, 11)
(95, 14)
(96, 14)
(97, 11)
(98, 5)
(99, 5)
(100, 14