In [None]:
# Mount the user's Google Drive into the Colab filesystem so that the data and
# result paths under /content/drive/MyDrive used later in the notebook resolve.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Numeric suffixes of the label identifiers. Every identifier shares the
# 'scolomfr-voc-015-num-' prefix, so only the suffix is spelled out here.
# NOTE(review): the order presumably matches the integer class ids in the
# CSV `label` column -- confirm before reordering.
_LABEL_NUMBERS = (
    1179, 1548, 1032, 1333,
    6360, 980, 1430, 919,
    7755, 1831, 6364, 1832,
    6365, 1834, 6369, 7816,
)
LABEL_NAMES = [f'scolomfr-voc-015-num-{number}' for number in _LABEL_NUMBERS]

# Google Drive folder holding the labelled CSV files and the training results.
DATA_PATH = '/content/drive/MyDrive/domain-helper/'
# Sub-folder of DATA_PATH reserved for logs.
LOG_PATH = DATA_PATH + 'logs/'

%env TOKENIZERS_PARALLELISM=false

!pip install Sentencepiece
!pip install torch
!pip install transformers
!pip install nlp

from transformers import CamembertForSequenceClassification, CamembertTokenizerFast, Trainer, TrainingArguments
import torch
from nlp import load_dataset, DatasetDict

# The tokenizer and the classifier are both loaded from the same pretrained
# checkpoint; the classification head gets one output per entry of LABEL_NAMES.
PRETRAINED_CHECKPOINT = "camembert-base"

tokenizer = CamembertTokenizerFast.from_pretrained(PRETRAINED_CHECKPOINT)
camembert = CamembertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=len(LABEL_NAMES),
)

# Fixed seed so the train/test/valid partition is identical on every run;
# without it each execution trains and scores on different rows.
SPLIT_SEED = 42

# Build the CSV paths independently of whether DATA_PATH ends with a slash
# (plain concatenation produced '.../domain-helper//file.csv').
data_root = str(DATA_PATH).rstrip('/')
csv_files = [
    f'{data_root}/edubases_domain_labeled_data.csv',
    f'{data_root}/gar_domain_labeled_data.csv',
]

# Both CSV files are loaded as a single 'train' split.
dataset = load_dataset('csv', data_files=csv_files, split='train')

# 90% train, 10% test + validation
train_test_valid = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
# gather everyone if you want to have a single DatasetDict
train_test_valid_dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

# Convenience handles on the three splits used by the rest of the notebook.
train_set = train_test_valid_dataset['train']
test_set = train_test_valid_dataset['test']
validation_set = train_test_valid_dataset['valid']

def preprocess(data):
    """Tokenize the ``title`` field of ``data`` with the module-level tokenizer.

    Padding and truncation are both enabled. The tokenizer's output is
    returned unchanged.
    """
    titles = data['title']
    encoded = tokenizer(titles, truncation=True, padding=True)
    return encoded

# Columns exposed as torch tensors once a split has been tokenized.
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']


def _tokenize_split(split):
    """Tokenize ``split`` in one single batch and switch it to torch format.

    Using the whole split as the batch means ``preprocess`` sees every title of
    the split in one tokenizer call.
    """
    encoded_split = split.map(preprocess, batched=True, batch_size=len(split))
    encoded_split.set_format('torch', columns=TORCH_COLUMNS)
    return encoded_split


train_set = _tokenize_split(train_set)
test_set = _tokenize_split(test_set)

# Training hyper-parameters.
batch_size = 8   # used for both the per-device train and eval batch size
epochs = 4

warmup_steps = 500
weight_decay = 0.01

training_args = TrainingArguments(
    output_dir=f'{DATA_PATH}/results',
    # `label_names` is intentionally not set. That argument is the list of
    # *input keys* that carry the labels, not the class names; passing
    # LABEL_NAMES made the Trainer look for 16 non-existent input keys, so it
    # treated evaluation batches as unlabeled. The default is left in place.
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluation_strategy='epoch',
    # Write logs to the Drive folder declared in LOG_PATH instead of the
    # notebook's working directory ('./logs'), next to the results.
    logging_dir=LOG_PATH,
)

# Wire the model, the training arguments and the two tokenized splits together.
# `test_set` is the split evaluated by the Trainer.
trainer_config = dict(
    model=camembert,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
)
trainer = Trainer(**trainer_config)

# Fine-tune, then run one final evaluation pass; the bare last expression lets
# the notebook display the returned metrics.
trainer.train()
trainer.evaluate()

In [None]:
# `validation_set` is the split that was not handed to the Trainer above; its
# labels are kept aside as ground truth and its titles are tokenized with the
# same padding/truncation settings as in `preprocess`.
validation_titles = validation_set["title"]
Y_test = validation_set["label"]
X_test_tokenized = tokenizer(validation_titles, truncation=True, padding=True)

!pip install sklearn
import numpy as np

from sklearn.metrics import precision_score

class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over pre-tokenized encodings, with optional labels.

    Args:
        encodings: mapping from feature name to an indexable sequence with one
            entry per example. It must contain the key ``"input_ids"``, which
            defines the dataset length.
        labels: optional indexable sequence (list, numpy array, tensor, ...)
            with one label per example. ``None`` or an empty sequence yields
            items without a ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:`: truth-testing
        # a numpy array or tensor with more than one element raises, while an
        # empty sequence must still mean "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` encoding."""
        return len(self.encodings["input_ids"])


# Wrap the tokenized held-out titles; no labels are attached because the
# ground truth (Y_test) is compared separately below.
validation_dataset = Dataset(X_test_tokenized)

# Make prediction. Only the `predictions` field of the returned tuple is
# needed, so it is read by name instead of unpacking into throwaway globals.
prediction_output = trainer.predict(validation_dataset)
raw_pred = prediction_output.predictions

# Preprocess raw predictions: keep the index of the highest score per example.
# NOTE(review): assumes raw_pred is (n_examples, n_labels) -- confirm.
Y_pred = np.argmax(raw_pred, axis=1)

# Micro-averaged precision over all held-out examples.
print(precision_score(Y_test, Y_pred, average='micro'))