<a href="https://colab.research.google.com/github/kanchanraiii/Amazon-Ads---LOE/blob/main/Ranking_Keywords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install spacy  keybert transformers pandas nltk

Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading nvi

In [4]:
import pandas as pd
import spacy
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from keybert import KeyBERT
from collections import Counter
from sentence_transformers import SentenceTransformer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [5]:
# Download necessary resources
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [11]:
# Load CSV File
file_path = "/content/amazon_combined_scrapped_data.csv"  # Change this path
df = pd.read_csv(file_path)
df['ad'] = df['ad'].fillna('')

In [12]:
column_name = df.columns[0]
df.rename(columns={column_name: "ad"}, inplace=True)

In [13]:
bert_model = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight transformer
kw_extractor = KeyBERT(bert_model)

In [14]:
vectorizer = TfidfVectorizer(stop_words="english")

In [15]:
tfidf_matrix = vectorizer.fit_transform(df["ad"])
feature_names = vectorizer.get_feature_names_out()

In [17]:
import numpy as np

# Take only the first 10,000 rows
subset_tfidf = tfidf_matrix[:10000]

# Convert to NumPy array for fast computation
tfidf_array = subset_tfidf.toarray()

# Compute sum across selected rows
tfidf_sums = np.sum(tfidf_array, axis=0)

# Create dictionary of words and their scores
tfidf_scores = dict(zip(feature_names, tfidf_sums))


In [19]:
word_freq = Counter(" ".join(df["ad"]).split())


In [20]:
def extract_ner(text):
    doc = nlp(text)
    return [ent.text.lower() for ent in doc.ents]


In [21]:
def extract_bert_keywords(text, top_n=5):
    keywords = kw_extractor.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words="english", top_n=top_n)
    return [kw[0] for kw in keywords]


In [22]:
def compute_score(word):
    ner_bonus = 1.5 if word in named_entities else 1.0
    tfidf_score = tfidf_scores.get(word, 0)
    bert_score = 1.2 if word in bert_keywords else 1.0
    freq_score = word_freq.get(word, 1)

    # Weighted formula
    final_score = (tfidf_score * 0.4) + (freq_score * 0.3) + (bert_score * 0.2) + (ner_bonus * 0.1)
    return final_score


In [27]:
nltk.download('punkt_tab')
all_scores = []
for desc in df["ad"][:1000]:
    named_entities = extract_ner(desc)
    bert_keywords = extract_bert_keywords(desc)

    tokens = word_tokenize(desc.lower())  # Tokenization
    token_scores = {word: compute_score(word) for word in tokens}

    sorted_keywords = sorted(token_scores.items(), key=lambda x: x[1], reverse=True)[:10]  # Top 10 keywords
    all_scores.append(sorted_keywords)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [29]:
# Add Ranked Keywords to DataFrame
# Limit DataFrame to 1000
df_subset = df.iloc[:1000]
df_subset["Ranked_Keywords"] = all_scores
# Display results
print(df_subset[["ad", "Ranked_Keywords"]].head()) # Changed to ad, as there is no column 'Product_Description'

# Save to CSV
df_subset.to_csv("/content/keyword_ranking_output.csv", index=False)

                                                  ad  \
0                     DL380 Gen10 4112 1P 16G 8LFF S   
1  Мícrоsoft Wíndоws Server Enterprise 2008 R2 SP...   
2  QNAP TS-831XU-4G-US 8-Bay Arm-Based 10G NAS, Q...   
3     HPE RAM Memory - 8GB - DDR4 SDRAM (815097-B21)   
4  SuperMicro SuperServer 5019D-FN8TP - Rack-Moun...   

                                     Ranked_Keywords  
0  [(16g, 575.5319156735715), (s, 100.19999999999...  
1  [(25, 3342.0979791804793), (2008, 160.32310045...  
2  [(x, 46228.2), (2, 35572.85), (,, 1449.0), (po...  
3  [(-, 144474.90000000002), ((, 387.9), (), 342....  
4  [(-, 144474.90000000002), (0, 842.7), (gb, 31....  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["Ranked_Keywords"] = all_scores
