In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!pip install transformers datasets evaluate scikit-learn --quiet
!pip install --upgrade transformers



In [12]:
import ast
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Load dataset
df = pd.read_csv('/content/drive/MyDrive/LLM ASSIGNMENT/cocktails_recipe.csv')

# Parse ingredients from stringified lists to actual lists
df['parsed_ingredients'] = df['ingredients'].apply(lambda x: [item[1] for item in ast.literal_eval(x)])

# Create tags manually based on common categories (you can expand this list)
tags = ['vodka', 'gin', 'rum', 'tequila', 'sweet', 'sour', 'whiskey', 'brandy', 'lime', 'lemon', 'orange']

# Create a function to assign tags
def extract_tags(ingredient_list):
    tags_found = []
    for ing in ingredient_list:
        for tag in tags:
            if tag.lower() in ing.lower():
                tags_found.append(tag)
    return list(set(tags_found))

# Apply tagging
df['tags'] = df['parsed_ingredients'].apply(extract_tags)

# Remove empty tags
df = df[df['tags'].map(len) > 0].reset_index(drop=True)

# Binarize the tags for multi-label classification
mlb = MultiLabelBinarizer()
tag_matrix = mlb.fit_transform(df['tags'])

# Add a column with full ingredients as a string for BERT input
df['text'] = df['parsed_ingredients'].apply(lambda x: ' '.join(x))


In [13]:
from datasets import Dataset
import torch

# Create HuggingFace dataset
hf_dataset = Dataset.from_pandas(df[['text', 'tags']])
hf_dataset = hf_dataset.train_test_split(test_size=0.1)

# Add multi-hot encoded labels
hf_dataset = hf_dataset.map(lambda x: {
    'labels': mlb.transform([x['tags']])[0]
})


Map:   0%|          | 0/5655 [00:00<?, ? examples/s]

Map:   0%|          | 0/629 [00:00<?, ? examples/s]

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True)

hf_dataset = hf_dataset.map(tokenize, batched=True)
hf_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])


Map:   0%|          | 0/5655 [00:00<?, ? examples/s]

Map:   0%|          | 0/629 [00:00<?, ? examples/s]

In [15]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load BERT for multi-label classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

# Define basic training arguments (safe across most versions)
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=False  # disable model checkpointing if not supported
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    tokenizer=tokenizer  # safe to include even if unused internally
)

# Start training
trainer.train()


  trainer = Trainer(
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


RuntimeError: WandbCallback requires wandb to be installed. Run `pip install wandb`.

In [None]:
# Pick one example
sample = df.iloc[0]
inputs = tokenizer(sample["text"], return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
probs = torch.sigmoid(outputs.logits).detach().numpy()[0]
predicted_tags = [mlb.classes_[i] for i, p in enumerate(probs) if p > 0.5]

print("Cocktail:", sample["title"])
print("Ingredients:", sample["text"])
print("True Tags:", sample["tags"])
print("Predicted Tags:", predicted_tags)
