In [None]:
!pip install transformers spacy pandas
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import re

import pandas as pd
import spacy
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

In [None]:
def load_data_from_csv(csv_file):
    """Read the ad-line CSV at ``csv_file`` into a DataFrame.

    Returns the DataFrame when the file can be read and has an 'ad' column.
    If the file is missing, or a ValueError occurs (including the missing-column
    check below), an error message is printed and None is returned.
    """
    try:
        frame = pd.read_csv(csv_file)
        if 'ad' in frame.columns:
            return frame
        # Raised inside the try so it is reported by the ValueError handler below.
        raise ValueError("CSV must contain an 'ad' column.")
    except FileNotFoundError:
        print(f"Error: File not found: {csv_file}")
    except ValueError as err:
        print(f"Error: {err}")
    # Only reached after one of the handlers above has reported a problem.
    return None


In [None]:
def extract_keywords_roberta(data, model_name="roberta-base", tokenizer=None, model=None):
    """Extracts keywords from ad lines using a RoBERTa token-classification model.

    Each ad is tagged token by token with the labels 0 = O, 1 = B-KEY, 2 = I-KEY;
    consecutive B/I tokens are merged into one keyword string.

    NOTE: the default "roberta-base" checkpoint has no trained classification
    head (transformers warns that 'classifier.weight'/'classifier.bias' are newly
    initialized), so its predictions are effectively random. Pass a fine-tuned
    checkpoint via ``model_name`` (or ready objects via ``tokenizer``/``model``)
    to get meaningful keywords.

    Args:
        data: DataFrame with an 'ad' column of ad-line strings.
        model_name: Checkpoint to load when ``tokenizer``/``model`` are not given.
        tokenizer: Optional pre-loaded tokenizer; loaded from ``model_name`` if None.
        model: Optional pre-loaded 3-label token-classification model; loaded
            from ``model_name`` if None. It is switched to eval mode.

    Returns:
        The same DataFrame object, with a new 'roberta_keywords' column holding
        a list of keyword strings per row (empty list for empty/non-string ads).
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    if model is None:
        model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=3)  # O, B-KEY, I-KEY
    model.eval()  # inference only: make sure dropout is disabled for injected models too

    # Special tokens (<s>, </s>, padding, ...) must never become part of a keyword.
    special_ids = set(tokenizer.all_special_ids)

    def _extract_keywords(text):
        # Missing values read from CSV arrive as NaN floats; the tokenizer only accepts text.
        if not isinstance(text, str) or not text.strip():
            return []

        # truncation=True keeps over-long ads within the model's maximum input length.
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)

        token_ids = inputs["input_ids"][0].tolist()
        labels = torch.argmax(outputs.logits, dim=2)[0].tolist()

        keywords = []
        current_ids = []

        def _flush():
            # Decode through the tokenizer: RoBERTa's byte-level BPE pieces do not use
            # the WordPiece "##" marker, so stripping "##" and concatenating raw tokens
            # loses word boundaries and leaks BPE marker characters into the keyword.
            if current_ids:
                keyword = tokenizer.decode(current_ids).strip()
                if keyword:
                    keywords.append(keyword)
                current_ids.clear()

        for token_id, label in zip(token_ids, labels):
            if token_id in special_ids:
                _flush()
            elif label == 1:  # B-KEY: close any open keyword and start a new one
                _flush()
                current_ids.append(token_id)
            elif label == 2:  # I-KEY: extend the open keyword (or start one)
                current_ids.append(token_id)
            else:  # O: outside any keyword
                _flush()
        _flush()
        return keywords

    data['roberta_keywords'] = data['ad'].apply(_extract_keywords)
    return data


In [None]:
def process_keywords_contextual(data, nlp=None):
    """Processes the *entire ad line* with SpaCy to get contextual information.

    For every keyword in a row's 'roberta_keywords' list, the keyword is located
    in the ad text by character offset (first exact-case occurrence, falling back
    to the first case-insensitive one). The tokens overlapping that span provide
    the lemma, and the named entities overlapping it are reported once each.

    Args:
        data: DataFrame with an 'ad' column (ad-line strings) and a
            'roberta_keywords' column (list of keyword strings per row).
        nlp: Optional callable turning text into a SpaCy ``Doc``; defaults to
            ``spacy.load("en_core_web_sm")``.

    Returns:
        The same DataFrame object, with a new 'processed_keywords' column. Each
        cell is a list of dicts ``{'keyword', 'lemmatized', 'entities'}`` where
        'lemmatized' is '' and 'entities' is [] when the keyword is not found.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    def _locate(keyword, text):
        """Returns the (start, end) offsets of the keyword's first occurrence, or None."""
        if not isinstance(keyword, str) or not keyword.strip():
            return None
        pattern = re.escape(keyword)
        # Prefer an exact-case hit: a bare substring test against every token made
        # short keywords such as 'C' resolve to unrelated words like 'Watch'.
        match = re.search(pattern, text) or re.search(pattern, text, re.IGNORECASE)
        return match.span() if match else None

    def _process_ad_line(ad_line, keywords):
        """Processes the ad line and matches tokens/entities to extracted keywords."""
        # Missing ads arrive as NaN floats; treat them as empty text instead of crashing.
        text = ad_line if isinstance(ad_line, str) else ""
        doc = nlp(text)
        processed_keywords = []
        for keyword in keywords:
            lemmatized_keyword = ""
            entities = []
            span = _locate(keyword, text)
            if span is not None:
                start, end = span
                # Every token overlapping the span contributes, so multi-word
                # keywords are lemmatized as a whole.
                lemmatized_keyword = " ".join(
                    token.lemma_
                    for token in doc
                    if token.idx < end and token.idx + len(token.text) > start
                )
                # Each entity is tested once, so it cannot be listed twice.
                entities = [
                    (ent.text, ent.label_)
                    for ent in doc.ents
                    if ent.start_char < end and ent.end_char > start
                ]
            processed_keywords.append({
                'keyword': keyword,
                'lemmatized': lemmatized_keyword,
                'entities': entities,
            })
        return processed_keywords

    # Built as a plain list (one entry per row, in row order) so an empty frame
    # still yields an empty column instead of failing on a row-wise apply.
    data['processed_keywords'] = [
        _process_ad_line(ad_line, keywords)
        for ad_line, keywords in zip(data['ad'], data['roberta_keywords'])
    ]
    return data


In [None]:
# Path of the CSV holding the ad lines (must contain an 'ad' column).
csv_file = "amazon_women-fashion_watches_watch-bands.csv"  # Replace with your CSV file path
data = load_data_from_csv(csv_file)

# load_data_from_csv returns None after reporting a problem; skip the pipeline then.
if data is not None:
    # Keyword extraction first, then contextual SpaCy processing of those keywords.
    data = data.pipe(extract_keywords_roberta).pipe(process_keywords_contextual)

    # Preview the first rows of the result (or use it for your optimization).
    print(data[['ad', 'processed_keywords']].head())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                                  ad  \
0  20mm Rubber Watchband Strap w/Tang Buckle Fit ...   
1  CIVO Genuine Leather Watch Bands Top Calf Grai...   
2  RuenTech Compatible for Fossil Gen 4 Venture H...   
3  MLQSS Soft Silicone Watch Band with Quick Rele...   
4  YISUYA Shark MESH 20MM 22MM 24MM Full Stainles...   

                                  processed_keywords  
0  [{'keyword': '20', 'lemmatized': '20', 'entiti...  
1  [{'keyword': 'C', 'lemmatized': 'Watch', 'enti...  
2  [{'keyword': 'Ru', 'lemmatized': 'RuenTech', '...  
3  [{'keyword': 'ML', 'lemmatized': 'MLQSS', 'ent...  
4  [{'keyword': 'Y', 'lemmatized': 'Duty', 'entit...  
