In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch


In [None]:
def extract_keywords_bert(data, model_name="bert-base-cased"):
    """Tag every ad line with BIO keyword labels and collect the keyword phrases.

    A tokenizer and a 3-label token-classification model (0 = O, 1 = B-KEY,
    2 = I-KEY) are loaded from ``model_name``. Each value of
    ``data['ad_line']`` is run through the model and the decoded phrases are
    stored in a new ``'bert_keywords'`` column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``'ad_line'`` column. It is modified in place.
    model_name : str, default "bert-base-cased"
        Checkpoint name or path passed to ``from_pretrained``. The default
        base checkpoint has no trained classification head (the library warns
        that ``classifier.weight``/``classifier.bias`` are newly initialized),
        so its predictions are arbitrary; pass a fine-tuned checkpoint to get
        meaningful keywords.

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``; ``'bert_keywords'`` holds a list of
        strings per row (empty for missing, non-string or blank ad lines).

    Notes
    -----
    Decoding works on whole words, not sub-tokens: a word takes the label
    predicted for its first sub-token, which mirrors the alignment used by
    ``tokenize_and_align_labels`` (continuation pieces are ignored there too).
    Words of one keyword are joined with single spaces. Sub-token merging
    strips the WordPiece ``##`` continuation marker, so it assumes a
    BERT-style tokenizer; ``word_ids()`` requires a fast tokenizer.
    """
    # 1. Load Tokenizer and Model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3)  # O, B-KEY, I-KEY

    # 2. Preprocess Data (Only needed if fine-tuning)
    def tokenize_and_align_labels(examples):
        """Tokenize a batch and spread word-level labels onto sub-tokens.

        Not called by the inference path below; kept for fine-tuning.
        Special tokens and continuation pieces get -100 so the loss skips
        them; the first piece of each word gets ``label[word_idx]``.
        """
        # NOTE(review): with is_split_into_words=False the word indices come
        # from the tokenizer's own pre-tokenization, so examples["labels"]
        # must be aligned to those words - confirm before fine-tuning.
        tokenized_inputs = tokenizer(examples["ad_line"], truncation=True, is_split_into_words=False)
        labels = []
        for i, label in enumerate(examples["labels"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            label_ids = []
            prev_word_idx = None
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)  # Special token, ignore loss
                elif word_idx != prev_word_idx:
                    label_ids.append(label[word_idx])  # Assign label
                else:
                    label_ids.append(-100)  # Subword of keyword, ignore loss
                prev_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def _extract_keywords(text):
        """Return the list of keyword phrases predicted for one ad line."""
        # Missing values (None/NaN) and blank strings have nothing to tag.
        if not isinstance(text, str) or not text.strip():
            return []

        # truncation=True keeps over-long inputs within the model's limit.
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
        word_ids = inputs.word_ids(batch_index=0)

        # Rebuild whole words from sub-tokens. Each entry is [text, label],
        # where label is the prediction for the word's first sub-token.
        words = []
        prev_word_idx = None
        for token, label, word_idx in zip(tokens, predictions, word_ids):
            if word_idx is None:
                # [CLS]/[SEP]-style special token: never part of a keyword.
                prev_word_idx = None
                continue
            if word_idx != prev_word_idx:
                words.append([token, label])
            else:
                # Continuation piece: glue it on without the "##" marker.
                words[-1][0] += token[2:] if token.startswith("##") else token
            prev_word_idx = word_idx

        # BIO decoding over words.
        keywords = []
        current = []
        for word, label in words:
            if label == 1:  # B-KEY: close the open phrase, start a new one
                if current:
                    keywords.append(" ".join(current))
                current = [word]
            elif label == 2:  # I-KEY: extend (an orphan I-KEY starts a phrase)
                current.append(word)
            elif current:  # O: close the open phrase
                keywords.append(" ".join(current))
                current = []
        if current:
            keywords.append(" ".join(current))
        return keywords

    data['bert_keywords'] = data['ad_line'].apply(_extract_keywords)
    return data


In [None]:
# Three sample ad lines used to try out the keyword extractor.
data = pd.Series(
    [
        "Buy the best running shoes online",
        "Affordable wireless headphones",
        "Watch 4k smart TV sale",
    ],
    name='ad_line',
).to_frame()

In [None]:
# Extract keywords using BERT.
# This adds a 'bert_keywords' column to `data` and rebinds the name to the
# returned frame. The first run downloads the bert-base-cased files; the
# library also warns that the classifier weights are newly initialized, i.e.
# the model has not been trained for this task, so the keywords it produces
# here are not meaningful predictions.
data = extract_keywords_bert(data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Leave the frame as the cell's last expression so the notebook renders it as
# a table instead of a line-wrapped plain-text dump.
data[['ad_line', 'bert_keywords']]

                             ad_line  \
0  Buy the best running shoes online   
1     Affordable wireless headphones   
2             Watch 4k smart TV sale   

                                     bert_keywords  
0                [thebest, running, shoes, online]  
1  [##ff, ##ord, ##able, wireless, head, ##phones]  
2                             [4, smart, TV, sale]  
