In [None]:
!pip install accelerate
!pip install datasets
!pip install transformers
!pip install pandas

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding

# Locations of the three recipe CSV files (one per split).
URL_training = "https://raw.githubusercontent.com/laurenzbrahner/BigDataTask2/main/data/Recipes_Training.csv"
URL_test = "https://raw.githubusercontent.com/laurenzbrahner/BigDataTask2/main/data/Recipes_Test.csv"
URL_validation = "https://raw.githubusercontent.com/laurenzbrahner/BigDataTask2/main/data/Recipes_Validation.csv"

# Read each semicolon-separated file into its own DataFrame
# (read order: training, test, validation).
df_train, df_test, df_val = (
    pd.read_csv(split_url, sep=";")
    for split_url in (URL_training, URL_test, URL_validation)
)


# Integer class id assigned to each cuisine name; these ids are used as labels.
cuisine_mapping = {
    "cajun_creole": 0,
    "chinese": 1,
    "french": 2,
    "indian": 3,
    "italian": 4,
    "mexican": 5,
    "southern_us": 6,
    "thai": 7
}


def _encode_cuisine(df, mapping=cuisine_mapping):
    """Return ``df`` with its ``cuisine`` column converted to integer class ids.

    Args:
        df: DataFrame with a ``cuisine`` column.
        mapping: Dict from cuisine name to integer class id.

    Returns:
        ``df`` itself when the column is already numeric (so re-encoding is a
        no-op), otherwise a new DataFrame whose ``cuisine`` column is int64.

    Raises:
        ValueError: If a cuisine value has no entry in ``mapping``; a plain
            ``Series.map`` would silently turn such rows into NaN labels.
    """
    labels = df["cuisine"]
    if pd.api.types.is_numeric_dtype(labels):
        # Already encoded: mapping again would turn every id into NaN.
        return df
    encoded = labels.map(mapping)
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = sorted(labels[unmapped].astype(str).unique())
        raise ValueError(f"cuisine values without a class id: {unknown}")
    return df.assign(cuisine=encoded.astype("int64"))


# Encode every split, not only the test split, so that all three datasets
# carry the same integer labels.
df_train = _encode_cuisine(df_train)
df_test = _encode_cuisine(df_test)
df_val = _encode_cuisine(df_val)

# Wrap each DataFrame in a Dataset, keyed by split name ('train', 'test', 'val').
raw_datasets = {
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (
        ('train', df_train),
        ('test', df_test),
        ('val', df_val),
    )
}


# Identifier of the pretrained checkpoint. Only the tokenizer is loaded here;
# the same identifier is passed again later when the classification model is loaded.
checkpoint = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
# Tokenizer that belongs to the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of recipes and attach their cuisine values as labels.

    Args:
        examples: Batch of rows as a mapping from column name to a list of
            values; must contain the "ingredients" and "cuisine" columns.

    Returns:
        The tokenizer output for the "ingredients" texts with an additional
        "labels" entry copied from the "cuisine" column.
    """
    # Only truncate here. Padding is left to the DataCollatorWithPadding that
    # the notebook builds from the same tokenizer, which pads each batch at
    # collate time instead of padding every example to the maximum length.
    tokenized_inputs = tokenizer(examples["ingredients"], truncation=True)
    tokenized_inputs["labels"] = examples["cuisine"]
    return tokenized_inputs

# Apply tokenize_function to every split in batches, keeping the split names as keys.
tokenized_datasets = {
    split_name: split_data.map(tokenize_function, batched=True)
    for split_name, split_data in raw_datasets.items()
}

# Padding collator built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)





In [None]:
# Number of training recipes per cuisine value, most frequent first.
df_train.loc[:, 'cuisine'].value_counts()

In [None]:
import inspect

from transformers import TrainingArguments

# Newer transformers releases call this keyword `eval_strategy`; older ones only
# know `evaluation_strategy`. Pick whichever the installed version accepts so the
# cell runs regardless of which release the unpinned install resolved to.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define the training arguments:
# - "test-trainer" is the output directory,
# - evaluation runs on a step schedule,
# - training lasts 6 epochs,
# - the checkpoint with the best "accuracy" is reloaded when training ends.
training_args = TrainingArguments(
    "test-trainer",
    num_train_epochs=6,
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    **{_eval_strategy_kwarg: "steps"},
)

In [None]:
from sklearn.metrics import f1_score
import numpy as np

# Metric callback handed to the Trainer
def compute_metrics(p):
    """Compute accuracy and macro-averaged F1 from predictions and labels.

    Args:
        p: Object exposing ``predictions`` (scores whose argmax along axis 1
            is taken as the predicted class) and ``label_ids`` (true classes).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    true_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=1)

    return {
        "accuracy": (predicted_ids == true_ids).mean(),
        "f1": f1_score(true_ids, predicted_ids, average='macro'),
    }

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint as a sequence classifier with one output class per cuisine.
# The class count and the id <-> name tables are derived from cuisine_mapping so
# they cannot drift from the label encoding, and predictions carry cuisine names.
# ignore_mismatched_sizes=True is kept from the original call; presumably the
# checkpoint's own classification head has a different number of classes.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(cuisine_mapping),
    id2label={class_id: cuisine for cuisine, class_id in cuisine_mapping.items()},
    label2id=dict(cuisine_mapping),
    ignore_mismatched_sizes=True,
)

In [None]:
from transformers import Trainer

# Assemble the Trainer from the model, training arguments, tokenized splits,
# padding collator, tokenizer and metric callback.
# NOTE(review): evaluation (and therefore best-checkpoint selection, since
# load_best_model_at_end is set) uses the "test" split, while the "val" split is
# not used in the visible cells. Confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model: start the training run on the configured trainer.
# As the last expression in the cell, the call's return value is displayed.
trainer.train()