In [1]:
# Model Training
from datasets import load_from_disk

datasets = load_from_disk("./encoded_data4")
datasets

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['String', 'Algorithm Design', 'Basic Machine Organisation', 'Computer System', 'Data Manipulation and Analysis', 'Data Organisation and Data Control', 'Elementary Web Authoring', 'Health and Ethical Issues', 'Information Processing', 'Intellectual Property', 'Internet Services and Applications', 'Multimedia Elements', 'Networking and Internet Basics', 'Program Development', 'Spreadsheets and Databases', 'Threats and Security on the Internet', '__index_level_0__'],
        num_rows: 644
    })
    valid: Dataset({
        features: ['String', 'Algorithm Design', 'Basic Machine Organisation', 'Computer System', 'Data Manipulation and Analysis', 'Data Organisation and Data Control', 'Elementary Web Authoring', 'Health and Ethical Issues', 'Information Processing', 'Intellectual Property', 'Internet Services and Applications', 'Multimedia Elements', 'Networking and Internet Basics', 'Program Development', 'Spreadsheets and Databases', '

In [2]:
datasets = datasets.remove_columns(['__index_level_0__'])

In [3]:
datasets

DatasetDict({
    train: Dataset({
        features: ['String', 'Algorithm Design', 'Basic Machine Organisation', 'Computer System', 'Data Manipulation and Analysis', 'Data Organisation and Data Control', 'Elementary Web Authoring', 'Health and Ethical Issues', 'Information Processing', 'Intellectual Property', 'Internet Services and Applications', 'Multimedia Elements', 'Networking and Internet Basics', 'Program Development', 'Spreadsheets and Databases', 'Threats and Security on the Internet'],
        num_rows: 644
    })
    valid: Dataset({
        features: ['String', 'Algorithm Design', 'Basic Machine Organisation', 'Computer System', 'Data Manipulation and Analysis', 'Data Organisation and Data Control', 'Elementary Web Authoring', 'Health and Ethical Issues', 'Information Processing', 'Intellectual Property', 'Internet Services and Applications', 'Multimedia Elements', 'Networking and Internet Basics', 'Program Development', 'Spreadsheets and Databases', 'Threats and Security 

In [4]:
model_labels = [label for label in datasets['train'].features.keys() if label not in ['Unnamed: 0', 'String', '__index_level_0__']]

In [5]:
model_labels

['Algorithm Design',
 'Basic Machine Organisation',
 'Computer System',
 'Data Manipulation and Analysis',
 'Data Organisation and Data Control',
 'Elementary Web Authoring',
 'Health and Ethical Issues',
 'Information Processing',
 'Intellectual Property',
 'Internet Services and Applications',
 'Multimedia Elements',
 'Networking and Internet Basics',
 'Program Development',
 'Spreadsheets and Databases',
 'Threats and Security on the Internet']

In [6]:
datasets["train"].column_names

['String',
 'Algorithm Design',
 'Basic Machine Organisation',
 'Computer System',
 'Data Manipulation and Analysis',
 'Data Organisation and Data Control',
 'Elementary Web Authoring',
 'Health and Ethical Issues',
 'Information Processing',
 'Intellectual Property',
 'Internet Services and Applications',
 'Multimedia Elements',
 'Networking and Internet Basics',
 'Program Development',
 'Spreadsheets and Databases',
 'Threats and Security on the Internet']

In [7]:
datasets["train"].features

{'String': Value(dtype='string', id=None),
 'Algorithm Design': Value(dtype='int32', id=None),
 'Basic Machine Organisation': Value(dtype='int32', id=None),
 'Computer System': Value(dtype='int32', id=None),
 'Data Manipulation and Analysis': Value(dtype='int32', id=None),
 'Data Organisation and Data Control': Value(dtype='int32', id=None),
 'Elementary Web Authoring': Value(dtype='int32', id=None),
 'Health and Ethical Issues': Value(dtype='int32', id=None),
 'Information Processing': Value(dtype='int32', id=None),
 'Intellectual Property': Value(dtype='int32', id=None),
 'Internet Services and Applications': Value(dtype='int32', id=None),
 'Multimedia Elements': Value(dtype='int32', id=None),
 'Networking and Internet Basics': Value(dtype='int32', id=None),
 'Program Development': Value(dtype='int32', id=None),
 'Spreadsheets and Databases': Value(dtype='int32', id=None),
 'Threats and Security on the Internet': Value(dtype='int32', id=None)}

In [8]:
#print(f"{datasets.train.column_names}\n{datasets.train.features}")

In [9]:
model_labels

['Algorithm Design',
 'Basic Machine Organisation',
 'Computer System',
 'Data Manipulation and Analysis',
 'Data Organisation and Data Control',
 'Elementary Web Authoring',
 'Health and Ethical Issues',
 'Information Processing',
 'Intellectual Property',
 'Internet Services and Applications',
 'Multimedia Elements',
 'Networking and Internet Basics',
 'Program Development',
 'Spreadsheets and Databases',
 'Threats and Security on the Internet']

In [10]:
id2label = {idx:label for idx, label in enumerate(model_labels)}
label2id = {label:idx for idx, label in enumerate(model_labels)}

In [11]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

Downloading (…)lve/main/config.json: 100%|██████████| 480/480 [00:00<00:00, 104kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading (…)olve/main/vocab.json: 100%|██████████| 899k/899k [00:01<00:00, 551kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 523kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.36M/1.36M [00:01<00:00, 799kB/s]


In [12]:
def encode_data(dataset):
    text = dataset["String"]
    # tokenize string
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=70)
    # create encoded array list with labels
    labelsBatch = {x: dataset[x] for x in dataset.keys() if x in model_labels}
    labels_matrix = np.zeros((len(text), len(model_labels)))
    for idx, label in enumerate(model_labels):
        labels_matrix[:, idx] = labelsBatch[label]
    
    encoding["labels"] = labels_matrix.tolist()
    return encoding

In [13]:
# tokenize the dataset
encoded_ds = datasets.map(encode_data, batched=True, remove_columns=datasets['train'].column_names)

100%|██████████| 1/1 [00:00<00:00, 19.27ba/s]
100%|██████████| 1/1 [00:00<00:00, 125.33ba/s]
100%|██████████| 1/1 [00:00<00:00, 126.13ba/s]


In [14]:
encoded_ds['train']['labels'][0]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]

In [15]:
[id2label[idx] for idx, label in enumerate(encoded_ds['train'][15]['labels']) if label == 1.0]

['Data Organisation and Data Control']

In [16]:
import torch
from torch import cuda
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

encoded_ds.set_format("torch")

Using device: cuda



In [17]:
encoded_ds["train"]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 644
})

In [18]:

# define model
model = AutoTokenizer.from_pretrained("distilroberta-base", problem_type="multi_label_classification", num_labels=len(model_labels), id2label=id2label, label2id=label2id)

In [19]:
# train model
batch_size = 8
metric_name = "f1"

In [20]:
import evaluate
metric  = evaluate.load("accuracy")

In [21]:
import time

train_name_dir = time.strftime(f"%Y%m%d_%H%M%S$-train-model-{model.name_or_path}")
train_name_dir

'20230329_074604$-train-model-distilroberta-base'

In [22]:
from transformers import TrainingArguments, Trainer


args = TrainingArguments(
    output_dir="./RBert-v1/"+ train_name_dir,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs= 81,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='./RBert-v1/logging/'+ train_name_dir
)

In [23]:
args.learning_rate

2e-05

In [24]:
# from transformers import AdamW
# optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

In [25]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, balanced_accuracy_score, hamming_loss
from transformers import EvalPrediction


def multi_label_metrics(predictions, labels, threshold=0.50):
    # apply sigmoid on predictions fitting (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    # logits, labels = predictions
    # prediction = np.argmax(logits, axis=-1)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy,
               'balanced_accuracy': balanced_accuracy,
               }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

def compute_metrics_new(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [26]:
encoded_ds['train'][0]['labels'].type()

'torch.FloatTensor'

In [27]:
# model.to(device)

In [30]:
outputs = model(input_ids=encoded_ds['train']['input_ids'][0].unsqueeze(0), labels=encoded_ds['train'][0]['labels'].unsqueeze(0))
outputs

ValueError: You need to specify either `text` or `text_target`.

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_ds["train"],
    eval_dataset=encoded_ds["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
prediction_result = trainer.predict(encoded_ds['test'])

In [None]:
print(prediction_result.metrics)

In [None]:
trainer.save_model("./RBert-model-v1")

In [None]:
tokenizer.save_pretrained("./RBert-model-v1/tokenizer")