<a href="https://colab.research.google.com/github/kanchanraiii/Amazon-Ads---LOE/blob/main/Extracting_Keywords_NLP_Techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing Necessary Libraries

In [1]:
!pip install pandas nltk spacy keybert scikit-learn transformers sentence-transformers
!python -m spacy download en_core_web_sm


Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

#### Importing Necessary Libraries

In [2]:
import pandas as pd
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np


In [9]:
# Fetch the NLTK data packages needed by the stopword, tokenizer and lemmatizer
# imports used in this notebook, in a fixed order.
NLTK_RESOURCES = ("stopwords", "punkt", "wordnet", "punkt_tab")
download_results = [nltk.download(resource) for resource in NLTK_RESOURCES]
download_results[-1]  # show the status of the last download as the cell output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
# Load NLP models
# `nlp` is the spaCy pipeline that extract_entities calls for named-entity
# recognition; "en_core_web_sm" is the model fetched in the install cell.
nlp = spacy.load("en_core_web_sm")

##### Loading the CSV File

In [33]:
# Build the working frame with a single text column, "ad", that every later
# cell reads from.
# NOTE(review): `data` is not defined in any cell visible here, so on a fresh
# kernel this line fails unless an unseen cell creates it. The heading refers
# to a CSV file - presumably `data` was read from it; confirm and add the load.
df = pd.DataFrame(data, columns=["ad"])

#### Applying Named Entity Recognition

In [34]:

def extract_entities(text):
    """Run the spaCy pipeline on `text` and collect its named entities.

    Returns a list of ``(entity_text, entity_label)`` tuples in the order the
    pipeline reports them; the list is empty when no entities are found.
    """
    entities = []
    for span in nlp(text).ents:
        entities.append((span.text, span.label_))
    return entities

# Run NER on every ad; each row receives the list of (text, label) tuples
# returned by extract_entities.
df["NER_Entities"] = df["ad"].apply(extract_entities)

#### Stemming & Lemmatization

In [35]:

# Shared NLTK stemmer and lemmatizer instances, created once and reused by
# preprocess_text for every row.
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stopword sets loaded so far, keyed by language, so the corpus is read once.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess_text(text):
    """Tokenize, clean, stem and lemmatize one piece of text.

    Steps: lowercase and tokenize with NLTK, keep purely alphabetic tokens,
    drop English stopwords, then stem and lemmatize the surviving tokens.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    tuple of (str, str, str)
        Space-joined cleaned tokens, stemmed tokens, and lemmatized tokens,
        in that order. All three are empty strings when no token survives.
    """
    # Looking the stopwords up once (as a set) avoids rebuilding the stopword
    # list and scanning it linearly for every single token.
    stop_words = _english_stopwords()

    tokens = [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]

    stemmed = [ps.stem(word) for word in tokens]
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]

    return " ".join(tokens), " ".join(stemmed), " ".join(lemmatized)

# Names of the three strings returned by preprocess_text, in return order.
PREPROCESSED_COLUMNS = ["Cleaned_Text", "Stemmed_Text", "Lemmatized_Text"]

# Build the three columns in one DataFrame construction instead of
# `.apply(pd.Series)`, which creates a Series object per row. Passing `columns`
# explicitly also keeps the assignment valid when `df` has no rows.
df[PREPROCESSED_COLUMNS] = pd.DataFrame(
    df["ad"].apply(preprocess_text).tolist(),
    index=df.index,
    columns=PREPROCESSED_COLUMNS,
)


#### TF-IDF Keyword Extraction

In [37]:
# How many of the highest-weighted TF-IDF terms to keep for each ad.
TFIDF_TOP_N = 5

# Fit on the whole corpus so the IDF weights reflect every ad. The vocabulary is
# not capped: capping it at five features and copying that one array into every
# row gave each ad the same five corpus-wide terms instead of its own keywords.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["Lemmatized_Text"])
tfidf_keywords = tfidf_vectorizer.get_feature_names_out()  # full vocabulary


def top_tfidf_terms(row_index, top_n=TFIDF_TOP_N):
    """Return up to `top_n` terms with the highest TF-IDF weight in one row.

    Reads the row's stored (non-zero) entries straight from the sparse matrix,
    so a row with no vocabulary terms yields an empty list.
    """
    start = tfidf_matrix.indptr[row_index]
    stop = tfidf_matrix.indptr[row_index + 1]
    weights = tfidf_matrix.data[start:stop]
    term_ids = tfidf_matrix.indices[start:stop]
    # Stable sort on negated weights: highest weight first, ties keep order.
    ranked = np.argsort(-weights, kind="stable")[:top_n]
    return [str(tfidf_keywords[term_id]) for term_id in term_ids[ranked]]


# One independent list per row (no shared array object between rows).
df["TFIDF_Keywords"] = pd.Series(
    [top_tfidf_terms(i) for i in range(tfidf_matrix.shape[0])],
    index=df.index,
    dtype=object,
)

##### KeyBERT for Keyword Extraction

In [38]:
kw_model = KeyBERT()


def keybert_keywords(text):
    """Return KeyBERT's keywords for `text`, considering 1- and 2-word phrases."""
    return kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 2),
        stop_words="english",
    )


# Keywords are extracted from the lemmatized text, one row at a time.
df["KeyBERT_Keywords"] = df["Lemmatized_Text"].apply(keybert_keywords)


In [39]:
# BERT embeddings for similarity-based keyword ranking.
# `bert_model` is the sentence-embedding model that extract_bert_keywords uses
# to embed both the whole text and each individual word.
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [40]:
def extract_bert_keywords(text, top_n=5):
    """Rank the words of `text` by cosine similarity to the whole text.

    Each distinct whitespace-separated word and the full text are embedded
    with `bert_model`; words are ordered by the cosine similarity between
    their embedding and the text embedding.

    Parameters
    ----------
    text : str
        Text whose words are candidate keywords.
    top_n : int, default 5
        Maximum number of keywords to return.

    Returns
    -------
    list of str
        Up to `top_n` distinct words, most similar first. Empty when `text`
        contains no words or `top_n` is not positive.
    """
    # Deduplicate while keeping first-seen order so a repeated word cannot
    # occupy several keyword slots (and is only embedded once).
    words = list(dict.fromkeys(text.split()))
    # Check before encoding so empty rows cost no model call; the explicit
    # top_n guard avoids the `[-0:]` slice pitfall that returns every word.
    if not words or top_n <= 0:
        return []

    doc_embedding = np.asarray(bert_model.encode([text]))[0]
    word_embeddings = np.asarray(bert_model.encode(words))

    # Cosine similarity = dot product divided by the product of vector norms.
    # A raw inner product would favour words with large embedding norms.
    dot_products = word_embeddings @ doc_embedding
    norms = np.linalg.norm(word_embeddings, axis=1) * np.linalg.norm(doc_embedding)
    similarities = dot_products / np.maximum(norms, np.finfo(float).eps)

    # Stable sort on negated scores: best first, ties keep original word order.
    top_indices = np.argsort(-similarities, kind="stable")[:top_n]
    return [words[i] for i in top_indices]

# Embedding-based keywords for each row's lemmatized text (default top_n).
df["BERT_Keywords"] = df["Lemmatized_Text"].apply(extract_bert_keywords)

In [41]:
# Columns to show side by side: the raw ad followed by each extraction result.
RESULT_COLUMNS = [
    "ad",
    "NER_Entities",
    "Stemmed_Text",
    "Lemmatized_Text",
    "TFIDF_Keywords",
    "KeyBERT_Keywords",
    "BERT_Keywords",
]
results_view = df[RESULT_COLUMNS]
print(results_view)

                                                     ad  \
0                        DL380 Gen10 4112 1P 16G 8LFF S   
1     Мícrоsoft Wíndоws Server Enterprise 2008 R2 SP...   
2     QNAP TS-831XU-4G-US 8-Bay Arm-Based 10G NAS, Q...   
3        HPE RAM Memory - 8GB - DDR4 SDRAM (815097-B21)   
4     SuperMicro SuperServer 5019D-FN8TP - Rack-Moun...   
...                                                 ...   
9995  WiFi Extender, Aigital 2.4G Wireless Internet ...   
9996  NETGEAR 52-Port Gigabit Ethernet Smart Managed...   
9997  BUFFALO TeraStation 3410DN Desktop 16 TB NAS H...   
9998  CISCO SYSTEMS Sg250-10P 10-Port Gigabit PoE Sw...   
9999  Nyrius 5.8GHz 4 Channel Wireless Video & Audio...   

                                           NER_Entities  \
0                      [(1P 16, DATE), (G 8LFF S, ORG)]   
1     [(Мícrоsoft Wíndоws Server Enterprise, ORG), (...   
2     [(US, GPE), (10, CARDINAL), (Quad Core 1.7GHz,...   
3            [(RAM Memory - 8, PERSON), (815097, DATE)]