In [20]:
# prompt: OPEN GOOGLE DRIVE

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# ✅ Install required libraries
!pip install keybert -q
!pip install -U sentence-transformers -q
# ✅ Import necessary libraries
import pandas as pd
from google.colab import drive
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# ✅ Read Excel file and extract only 'Category Name' column (skip 'ID')
file_path = '/content/drive/MyDrive/EVENT/Social_Event.xlsx'
df = pd.read_excel(file_path)
category_names = df['Category Name'][df['Category Name'] != 'ID'].dropna().tolist()
# ✅ Extract keywords using KeyBERT
kw_model = KeyBERT(model='all-MiniLM-L6-v2')


In [22]:
# ✅ Paragraph to test
paragraph = """
The Oncology Department is pleased to host a seminar focused on raising awareness about colon cancer, its early detection, and advancements in treatment.
This event aims to educate the public and healthcare professionals on the importance of timely screening, preventive measures, and supportive care.
Featuring expert talks by leading oncologists and real-life experiences from survivors, the seminar will serve as a platform for knowledge sharing
and community engagement. Attendees will have the opportunity to ask questions, access resources, and contribute to spreading awareness that could help save lives.
"""


keywords = kw_model.extract_keywords(paragraph, keyphrase_ngram_range=(1, 3), stop_words='english', top_n=10)
keyword_list = [kw[0] for kw in keywords]
keyword_text = " ".join(keyword_list)

# ✅ Match keywords with category names using TF-IDF similarity
texts = category_names + [keyword_text]
vectorizer = TfidfVectorizer().fit_transform(texts)
cosine_sim = cosine_similarity(vectorizer[-1], vectorizer[:-1])

# ✅ Get best and top 5 matching categories
best_index = cosine_sim.argmax()
top5_indices = cosine_sim[0].argsort()[-5:][::-1]
top5_matches = [category_names[i] for i in top5_indices]

# ✅ Print results
print("🔍 Extracted Keywords:", keyword_list)
print("✅ Best Matching Category:", category_names[best_index])
print("✅ Top 5 Matching Categories:", top5_matches)


🔍 Extracted Keywords: ['awareness colon cancer', 'colon cancer', 'colon cancer early', 'raising awareness colon', 'talks leading oncologists', 'oncology', 'oncology department pleased', 'survivors seminar serve', 'oncologists real life', 'oncology department']
✅ Best Matching Category: Colon Cancer Seminar
✅ Top 5 Matching Categories: ['Colon Cancer Seminar', 'Colon Cancer Forum', 'Colon Cancer Forum', 'Colon Cancer Forum', 'Colon Cancer Forum']
