In [29]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = "./distlBert-model-v9/"

#model = AutoModelForSequenceClassification.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                        local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.to('cuda')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [30]:
# Model Training
from datasets import load_from_disk

datasets = load_from_disk("./encoded_data4")
datasets

DatasetDict({
    train: Dataset({
        features: ['String', 'Algorithm Design', 'Basic Machine Organisation', 'Computer System', 'Data Manipulation and Analysis', 'Data Organisation and Data Control', 'Elementary Web Authoring', 'Health and Ethical Issues', 'Information Processing', 'Intellectual Property', 'Internet Services and Applications', 'Multimedia Elements', 'Networking and Internet Basics', 'Program Development', 'Spreadsheets and Databases', 'Threats and Security on the Internet', '__index_level_0__'],
        num_rows: 644
    })
    valid: Dataset({
        features: ['String', 'Algorithm Design', 'Basic Machine Organisation', 'Computer System', 'Data Manipulation and Analysis', 'Data Organisation and Data Control', 'Elementary Web Authoring', 'Health and Ethical Issues', 'Information Processing', 'Intellectual Property', 'Internet Services and Applications', 'Multimedia Elements', 'Networking and Internet Basics', 'Program Development', 'Spreadsheets and Databases', '

In [31]:
datasets = datasets.remove_columns(['__index_level_0__'])

In [32]:
model_labels = ['Algorithm Design',
 'Basic Machine Organisation',
 'Computer System',
 'Data Manipulation and Analysis',
 'Data Organisation and Data Control',
 'Elementary Web Authoring',
 'Health and Ethical Issues',
 'Information Processing',
 'Intellectual Property',
 'Internet Services and Applications',
 'Multimedia Elements',
 'Networking and Internet Basics',
 'Program Development',
 'Spreadsheets and Databases',
 'Threats and Security on the Internet']

In [33]:
id2label = {idx:label for idx, label in enumerate(model_labels)}
label2id = {label:idx for idx, label in enumerate(model_labels)}

In [34]:
Text = 'A bus company provides an online service for passengers to find bus routes. What information should passengers input?'


In [35]:
model.device 

device(type='cuda', index=0)

In [36]:
import torch
import numpy as np
from transformers import pipeline
from datasets import load_dataset
from evaluate import evaluator
import evaluate

In [63]:
def encode_data(dataset):
    text = dataset["String"]
    # tokenize string
    encoding = tokenizer(text, padding="max_length", truncation=True, max_length=70)
    # create encoded array list with labels
    labelsBatch = {x: dataset[x] for x in dataset.keys() if x in model_labels}
    labels_matrix = np.zeros((len(text), len(model_labels)))
    for idx, label in enumerate(model_labels):
        labels_matrix[:, idx] = labelsBatch[label]
    
    encoding["labels"] = labels_matrix.tolist()
    return encoding

In [64]:
encoded_ds = datasets.map(encode_data, batched=True, remove_columns=datasets['train'].column_names)

100%|██████████| 1/1 [00:00<00:00, 28.65ba/s]
100%|██████████| 1/1 [00:00<00:00, 125.40ba/s]
100%|██████████| 1/1 [00:00<00:00, 125.40ba/s]


In [41]:
encoding = tokenizer(Text, return_tensors="pt")
encoding = {k: v.to(model.device) for k,v in encoding.items()}

In [42]:
outputs = model(**encoding)
logits = outputs.logits
logits.shape

torch.Size([1, 15])

In [43]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['Information Processing']


In [51]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, balanced_accuracy_score, hamming_loss
from transformers import EvalPrediction


def multi_label_metrics(predictions, labels, threshold=0.50):
    # apply sigmoid on predictions fitting (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_true.argmax(axis=1), y_pred.argmax(axis=1))
    # logits, labels = predictions
    # prediction = np.argmax(logits, axis=-1)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy,
               'balanced_accuracy': balanced_accuracy,
               }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [65]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
# Model Training
from datasets import load_from_disk

# Load the test dataset
test_dataset = load_from_disk("./encoded_data4")

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Define the model labels
model_labels = ['Algorithm Design',
                'Basic Machine Organisation',
                'Computer System',
                'Data Manipulation and Analysis',
                'Data Organisation and Data Control',
                'Elementary Web Authoring',
                'Health and Ethical Issues',
                'Information Processing',
                'Intellectual Property',
                'Internet Services and Applications',
                'Multimedia Elements',
                'Networking and Internet Basics',
                'Program Development',
                'Spreadsheets and Databases',
                'Threats and Security on the Internet']

# Define the metric for evaluation
metric = load_metric("accuracy")

# Define the TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=1,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
)

# Evaluate the model on the test dataset
eval_results = trainer.evaluate(encoded_ds['test'])

# Print the evaluation results
print(eval_results)


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./distlBert-model-v9/config.json
Model config DistilBertConfig {
  "_name_or_path": "./distlBert-model-v9/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Algorithm Design",
    "1": "Basic Machine Organisation",
    "2": "Computer System",
    "3": "Data Manipulation and Analysis",
    "4": "Data Organisation and Data Control",
    "5": "Elementary Web Authoring",
    "6": "Health and Ethical Issues",
    "7": "Information Processing",
    "8": "Intellectual Property",
    "9": "Internet Services and Applications",
    "10": "Multimedia Elements",
    "11": "Networking and Internet Basics",
    "12": "Program Development",
    "13": "Spreadsheets and Datab

{'eval_loss': 0.16424967348575592, 'eval_f1': 0.7878787878787878, 'eval_roc_auc': 0.8840574311379794, 'eval_accuracy': 0.7530864197530864, 'eval_balanced_accuracy': 0.8126695526695527, 'eval_runtime': 0.2352, 'eval_samples_per_second': 344.332, 'eval_steps_per_second': 46.761}





In [66]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
# Model Training
from datasets import load_from_disk

model_path = "./distlBert-model-v10/"
# Load the test dataset
test_dataset = load_from_disk("./encoded_data4")

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Define the model labels
model_labels = ['Algorithm Design',
                'Basic Machine Organisation',
                'Computer System',
                'Data Manipulation and Analysis',
                'Data Organisation and Data Control',
                'Elementary Web Authoring',
                'Health and Ethical Issues',
                'Information Processing',
                'Intellectual Property',
                'Internet Services and Applications',
                'Multimedia Elements',
                'Networking and Internet Basics',
                'Program Development',
                'Spreadsheets and Databases',
                'Threats and Security on the Internet']

# Define the metric for evaluation
metric = load_metric("accuracy")

# Define the TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    num_train_epochs=1,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
)

# Evaluate the model on the test dataset
eval_results = trainer.evaluate(encoded_ds['test'])

# Print the evaluation results
print(eval_results)


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ./distlBert-model-v10/config.json
Model config DistilBertConfig {
  "_name_or_path": "./distlBert-model-v10/",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "Algorithm Design",
    "1": "Basic Machine Organisation",
    "2": "Computer System",
    "3": "Data Manipulation and Analysis",
    "4": "Data Organisation and Data Control",
    "5": "Elementary Web Authoring",
    "6": "Health and Ethical Issues",
    "7": "Information Processing",
    "8": "Intellectual Property",
    "9": "Internet Services and Applications",
    "10": "Multimedia Elements",
    "11": "Networking and Internet Basics",
    "12": "Program Development",
    "13": "Spreadsheets and Dat

{'eval_loss': 0.15585118532180786, 'eval_f1': 0.7701863354037267, 'eval_roc_auc': 0.8664268380944272, 'eval_accuracy': 0.7283950617283951, 'eval_balanced_accuracy': 0.779071669071669, 'eval_runtime': 0.1941, 'eval_samples_per_second': 417.398, 'eval_steps_per_second': 56.684}



