In [1]:
import csv
count = 0  # NOTE(review): not referenced in any visible cell; kept in case a later cell reads it
# Rows are only kept when they have exactly this many fields.
# Presumably this is the column count of a well-formed export row — TODO confirm.
EXPECTED_COLUMNS = 47

# Collect one dict per kept CSV row: theme(s), type and description.
buffer = []
# newline='' is what the csv module asks for: it lets the reader itself handle
# line endings, so newlines embedded in quoted fields reach the parser untranslated.
with open('producten_en_diensten_2024-09-13_21-47-37.csv', 'r', encoding='latin-1', newline='') as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        # Skip rows with any other field count.
        if len(row) == EXPECTED_COLUMNS:
            buffer.append({
                'thema': row[5],
                'tpe': row[7],
                'beschrijving': row[1]
            })
# Drop the first kept row — presumably the header line; verify against the file.
buffer = buffer[1:]


In [2]:
import mlflow
# Make 'thema-ipdc-model' the active MLflow experiment. The logged output of this
# cell shows that the experiment is created when it does not exist yet.
mlflow.set_experiment('thema-ipdc-model')

2024/10/01 11:20:02 INFO mlflow.tracking.fluent: Experiment with name 'thema-ipdc-model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/182', creation_time=1727781602503, experiment_id='182', last_update_time=1727781602503, lifecycle_stage='active', name='thema-ipdc-model', tags={}>

In [3]:
from transformers import AutoTokenizer
from datasets import Dataset
import pandas as pd
# Turn the list of row dicts in `buffer` into a DataFrame with the columns
# 'thema', 'tpe' and 'beschrijving'.
df = pd.DataFrame(buffer)

In [None]:
from transformers import AutoTokenizer


# Only rows that carry at least one theme are usable as training examples.
# The fixed seed makes the 85/15 train/test partition identical on every run,
# so evaluation numbers from different runs are comparable.
dataset = Dataset.from_pandas(df[df.thema != '']).train_test_split(test_size=0.15, seed=42)

# 'thema' holds one or more theme names separated by ', '.
# The dummy-column names are the distinct theme names, i.e. the label vocabulary.
classes = df.thema.str.get_dummies(sep=', ').columns
# Two-way mapping between theme name and integer label index.
class2id = {class_: idx for idx, class_ in enumerate(classes)}
id2class = {idx: class_ for class_, idx in class2id.items()}

# Checkpoint used both for the tokenizer here and for the model weights later on.
model_path = 'papluca/xlm-roberta-base-language-detection'
tokenizer = AutoTokenizer.from_pretrained(model_path)

def preprocess_function(example):
    """Tokenize one example and attach its multi-hot label vector.

    Args:
        example: mapping with a 'beschrijving' text and a 'thema' string
            holding one or more theme names separated by ', '.

    Returns:
        The tokenizer output for the description, extended with a 'labels'
        entry: a list of floats, one per entry of ``classes``, set to 1.0 at
        the positions of the example's themes and 0.0 everywhere else.

    Raises:
        KeyError: if a theme name is not a key of ``class2id``.
    """
    description = example['beschrijving']
    # Indices of the themes that apply to this example.
    active_ids = {class2id[name] for name in example['thema'].split(', ')}
    multi_hot = [1. if idx in active_ids else 0. for idx in range(len(classes))]

    encoded = tokenizer(description, truncation=True)
    encoded['labels'] = multi_hot
    return encoded

# Run preprocess_function over the split dataset. The function reads
# example['thema'] as a single string, so it expects one example per call.
tokenized_dataset = dataset.map(preprocess_function)



Map:   0%|          | 0/19540 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report

# Batch collator built from the tokenizer. Presumably it pads every batch to
# its longest sequence — confirm against the transformers documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# The four metrics that compute_metrics reports together.
clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))`` without overflow.

    The direct formula evaluates ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for strongly negative inputs. Here ``exp`` is only ever
    applied to a non-positive number, so it cannot overflow.

    Args:
        x: a NumPy array or scalar of logits.

    Returns:
        Values in [0, 1] with the same shape as ``x``. Floating-point input
        keeps its dtype.
    """
    z = np.exp(-np.abs(x))  # always in (0, 1]
    # For x >= 0 use 1 / (1 + e^-x); for x < 0 use the equivalent e^x / (1 + e^x).
    result = np.where(np.asarray(x) >= 0, 1 / (1 + z), z / (1 + z))
    # [()] turns a 0-d result back into a scalar and leaves real arrays untouched.
    return result[()]

def compute_metrics(eval_pred):
    """Score thresholded multi-label predictions with the combined metrics.

    Both predictions and labels are flattened to one dimension before
    scoring, so every (example, label) slot counts as one independent binary
    decision.

    Args:
        eval_pred: a pair ``(predictions, labels)`` of arrays. The predictions
            are passed through ``sigmoid`` and cut at 0.5.

    Returns:
        Whatever ``clf_metrics.compute`` returns for the flattened 0/1 arrays.
    """
    logits, references = eval_pred
    probabilities = sigmoid(logits)
    hard_predictions = (probabilities > 0.5).astype(int).reshape(-1)
    flat_references = references.astype(int).reshape(-1)
    return clf_metrics.compute(predictions=hard_predictions, references=flat_references)


In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint as a sequence classifier with one output per theme.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path, 
    # One output per entry in `classes`, with both name<->index mappings attached to the model.
    num_labels=len(classes), 
    id2label=id2class, 
    label2id=class2id,
    # Multi-label setup: an example may carry several themes at once.
    problem_type = "multi_label_classification",
    # Presumably the checkpoint's classification head has a different number of
    # outputs than len(classes); this flag lets loading proceed anyway — confirm.
    ignore_mismatched_sizes=True
)

In [None]:
# Hyperparameters and checkpointing schedule for the fine-tuning run.
training_args = TrainingArguments(
   # Directory passed as output location for this run.
   output_dir="thema_ipdc_model",
   learning_rate=2e-5,
   # Small per-device batches of 3 for both training and evaluation.
   per_device_train_batch_size=3,
   per_device_eval_batch_size=3,
   # A single pass over the training split.
   num_train_epochs=1,
   weight_decay=0.01,
   # Evaluate and save on the same 1000-step cadence.
   eval_strategy="steps",
   eval_steps=1000,
   save_strategy="steps",
   save_steps=1000,
   # NOTE(review): no metric_for_best_model is given, so "best" is decided by
   # the library default — confirm which metric that is.
   load_best_model_at_end=True,
)

# Wire model, arguments, tokenized splits, collator and metric function together.
trainer = Trainer(
   model=model,
   args=training_args,
   # The "train" split is used for fitting, the "test" split for the periodic evaluations.
   train_dataset=tokenized_dataset["train"],
   eval_dataset=tokenized_dataset["test"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()