In [None]:
# Mount Google Drive at /content/drive. The data and log paths used by the
# later cells live under /content/drive/MyDrive. `google.colab` is only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [13]:
from nlp import DatasetDict

LABEL_NAMES = ['Arts plastiques', 'Biotech-ST2S', 'Documentation', 'EMC', 'EPS', 'Français', 'Grec ancien', 'Géographie',
            'Histoire', 'Langues vivantes', 'Latin', 'Mathématiques', 'Physique - Chimie', 'SES', 'SVT', 'Technologie',
            'Économie et gestion']
DATA_PATH = '/content/drive/MyDrive/domain-helper/'
LOG_PATH = '/content/drive/MyDrive/domain-helper/logs/'

%env TOKENIZERS_PARALLELISM=false

!pip install Sentencepiece
!pip install torch
!pip install transformers
!pip install nlp

from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, Trainer, TrainingArguments
import torch
from nlp import load_dataset

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")
camembert = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=len(LABEL_NAMES))

dataset = load_dataset('csv', data_files=str(DATA_PATH) + '/edubases_labeled_data.csv', split='train' )

# 90% train, 10% test + validation
train_test_valid = dataset.train_test_split(test_size=0.1)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_valid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

train_set = train_test_valid_dataset['train']
test_set = train_test_valid_dataset['test']
validation_set = train_test_valid_dataset['valid']

def preprocess(data):
    """Tokenise the ``text`` field of ``data`` with the module-level tokenizer.

    The tokenizer is called with both padding and truncation enabled, and
    its result is returned unchanged.
    """
    texts = data['text']
    encoded = tokenizer(texts, padding=True, truncation=True)
    return encoded

# Tokenise every split. Each split is mapped as ONE batch (batch_size equals
# the split length) so that `padding=True` pads all of its rows to a common
# length.
train_set = train_set.map(preprocess, batched=True,
                          batch_size=len(train_set))
validation_set = validation_set.map(preprocess, batched=True,
                                    batch_size=len(validation_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))

# Expose only the model inputs and the label column, as torch tensors.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']
for split in (train_set, validation_set, test_set):
    split.set_format('torch', columns=TORCH_COLUMNS)

batch_size = 8
epochs = 8

warmup_steps = 500
weight_decay = 0.01

# Keep the human-readable class names with the model (they are saved in the
# checkpoint config). This is where class names belong, not in
# `TrainingArguments.label_names`.
camembert.config.id2label = dict(enumerate(LABEL_NAMES))
camembert.config.label2id = {name: idx for idx, name in enumerate(LABEL_NAMES)}

# NOTE: `label_names` is intentionally NOT passed. That argument is the list
# of input-dict keys holding the labels (default: ["labels"]), not the class
# names; passing LABEL_NAMES stops the Trainer from finding the labels at
# evaluation time, so no validation loss is reported.
training_args = TrainingArguments(
    output_dir=f'{DATA_PATH}/results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    # Write logs next to the data on Drive instead of the ephemeral
    # working directory of the runtime.
    logging_dir=LOG_PATH,
)

# The per-epoch evaluation uses the validation split; the test split is kept
# untouched for the final evaluation below.
trainer = Trainer(
    model=camembert,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=validation_set
)

trainer.train()
# Final evaluation on the held-out test split.
trainer.evaluate(eval_dataset=test_set)

env: TOKENIZERS_PARALLELISM=false


Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight'

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 12601
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12608


Epoch,Training Loss,Validation Loss



KeyboardInterrupt



In [None]:
# Evaluate with the trainer's configured evaluation dataset. The result is
# stored under a name that does not shadow the built-in `eval`, and shown as
# the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Print the Trainer object's string representation.
# NOTE(review): looks like a leftover debugging cell — confirm it is still needed.
print(trainer)

In [None]:
!pip install Sentencepiece
!pip install torch
!pip install transformers
!pip install nlp

import pandas as pd
import numpy as np
import transformers
import nlp
from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, Trainer, TrainingArguments
import torch
from nlp import load_dataset

LABEL_NAMES = ['Arts plastiques', 'Biotech-ST2S', 'Documentation', 'EMC', 'EPS', 'Français', 'Grec ancien', 'Géographie',
            'Histoire', 'Langues vivantes', 'Latin', 'Mathématiques', 'Physique - Chimie', 'SES', 'SVT', 'Technologie',
            'Économie et gestion']

tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

DATA_PATH = '/content/drive/MyDrive/domain-helper/'


# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Torch dataset wrapping tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from field name (it must include ``"input_ids"``,
            which ``__len__`` reads) to an indexable sequence holding one
            entry per example.
        labels: Optional indexable sequence with one label per example.
            ``None`` (the default) yields items without a ``"labels"`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field.

        A ``"labels"`` tensor is added when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Compare against None explicitly: a truthiness test raises for numpy
        # arrays / pandas Series and cannot tell "no labels" from an empty
        # container.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the ``input_ids`` field."""
        return len(self.encodings["input_ids"])


# Local import so this block does not depend on the cell's import list.
import os

# Load the labelled evaluation CSV. os.path.join avoids the doubled '/'
# produced by concatenating onto a DATA_PATH that already ends with one.
test_data = pd.read_csv(os.path.join(DATA_PATH, 'gar_labeled_data2.csv'))
X_test = list(test_data["text"])
Y_test = list(test_data["label"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized, Y_test)

# Load trained model. The training cell writes its checkpoints under
# `<DATA_PATH>/results`, so the checkpoint is read from there rather than
# from a `./results` directory relative to the working directory.
model_path = os.path.join(DATA_PATH, "results", "checkpoint-9500")
model = CamembertForSequenceClassification.from_pretrained(model_path, num_labels=len(LABEL_NAMES))

# Define test trainer (default arguments; used for prediction only)
test_trainer = Trainer(model)

# Make prediction: `predict` returns (predictions, label_ids, metrics)
raw_pred, label_ids, metrics = test_trainer.predict(test_dataset)

# Turn the per-class scores into predicted class ids
y_pred = np.argmax(raw_pred, axis=1)

print(y_pred)
print(label_ids, metrics)

In [None]:
# Show [true label name, predicted label name, text] for the first rows.
y_test = list(test_data["label"])
# Upper bound on the number of rows shown. Slicing plus zip stops at the
# shortest sequence, so a test set with fewer rows does not raise IndexError.
N_PREVIEW = 100
[[LABEL_NAMES[true_id], LABEL_NAMES[pred_id], text]
 for true_id, pred_id, text in zip(y_test[:N_PREVIEW], y_pred[:N_PREVIEW], X_test[:N_PREVIEW])]