In [1]:
! pip install sentence_transformers keybert bertopic spacy yake




[notice] A new release of pip is available: 25.1.1 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import yake




In [3]:
# Forward slashes keep this relative path valid on Windows, macOS and Linux alike.
df = pd.read_csv("data/Emotion NLP/journal_texts/journal_texts.csv")   # or read_excel
# Lower-case every entry up front; both keyword extractors read this column.
df["text"] = df["text"].str.lower()

# KeyBERT

In [4]:
# spaCy pipeline used by the noun-phrase filtering helpers (POS tags, lemmas, dependency labels).
nlp = spacy.load("en_core_web_sm")

# Sentence-embedding model handed to KeyBERT as its backend.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embedding_model)

In [5]:
def extract_keybert_keywords(text, top_n=5):
  """Return the top KeyBERT keyphrases for one document.

  Args:
    text: Document to analyse. Anything that is not a non-blank string
      (for example NaN coming from a missing CSV cell) yields no keywords.
    top_n: Maximum number of keyphrases requested from KeyBERT.

  Returns:
    List of keyphrase strings in the order KeyBERT returned them;
    the scores are dropped.
  """
  # Same guard as extract_yake_keywords: skip missing or blank entries
  # instead of handing them to the model.
  if not isinstance(text, str) or not text.strip():
    return []
  kws = kw_model.extract_keywords(
    text,
    keyphrase_ngram_range=(1, 2),  # one- and two-word candidates
    stop_words="english",
    top_n=top_n
  )
  return [kw for kw, _score in kws]

In [6]:
# Run KeyBERT extraction on each text entry and keep the result as a new column.
df["keybert_keywords"] = df["text"].map(extract_keybert_keywords)


In [7]:
# Spot-check the KeyBERT keyphrases stored for the row with index label 0.
df['keybert_keywords'][0]

['felt heavier', 'heavier expected', 'heavier', 'today felt', 'conversation']

# YAKE

In [8]:
# Shared YAKE extractor: English text, n-gram size 2, 10 keywords requested,
# deduplication limit 0.9.
yake_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=0.9, top=10)

In [9]:
def extract_yake_keywords(text):
  """Return YAKE keywords for one document, best-scoring first.

  Non-string or blank input yields an empty list. Otherwise the shared
  `yake_extractor` is run on the text and only the keyword strings are kept.
  """
  has_content = isinstance(text, str) and bool(text.strip())
  if not has_content:
    return []
  ranked = list(yake_extractor.extract_keywords(text))
  # YAKE returns (keyword, score) pairs - a lower score is better.
  ranked.sort(key=lambda pair: pair[1])
  return [phrase for phrase, _ in ranked]


In [10]:
# Run YAKE extraction on each text entry and keep the result as a new column.
df["yake_keywords"] = df["text"].map(extract_yake_keywords)

In [11]:
# Spot-check the YAKE keywords stored for the row with index label 0.
df['yake_keywords'][0]

['today felt',
 'felt heavier',
 'today',
 'expected',
 'felt',
 'heavier',
 'head',
 'wondering',
 'replaying',
 'conversation']

# Keyword Filtering

In [12]:
# POS-tag sequences accepted as noun phrases: a lone NOUN or PROPN,
# optionally preceded by exactly one ADJ.
ALLOWED_PATTERNS = {
  prefix + (head,)
  for prefix in ((), ("ADJ",))
  for head in ("NOUN", "PROPN")
}

In [13]:
def get_head_noun_lemma(phrase):
  """Return the lemma of the phrase's ROOT token when that token is a noun.

  The phrase is parsed with the module-level spaCy pipeline `nlp`. The first
  token whose dependency label is "ROOT" and whose POS tag is NOUN or PROPN
  supplies the lemma; None is returned when no token qualifies.
  """
  noun_root_lemmas = (
    tok.lemma_
    for tok in nlp(phrase)
    if tok.dep_ == "ROOT" and tok.pos_ in ("NOUN", "PROPN")
  )
  return next(noun_root_lemmas, None)

In [14]:
def is_valid_noun_phrase(phrase):
  """Tell whether the phrase's POS-tag sequence is one of ALLOWED_PATTERNS.

  The phrase is tagged with the module-level spaCy pipeline `nlp`; the tags
  of all its tokens, in order, must match an allowed pattern exactly.
  """
  tag_sequence = tuple(tok.pos_ for tok in nlp(phrase))
  return tag_sequence in ALLOWED_PATTERNS

In [None]:
def select_best_noun_phrases(keywords):
  """Keep one noun phrase per head-noun lemma, preserving input order.

  Phrases rejected by is_valid_noun_phrase, or for which get_head_noun_lemma
  returns nothing, are dropped. When several phrases share a head lemma,
  only the first one encountered is kept.

  Returns:
    List of the retained phrases, in first-seen order.
  """
  phrase_by_head = {}

  for candidate in keywords:
    if is_valid_noun_phrase(candidate):
      head_lemma = get_head_noun_lemma(candidate)
      if head_lemma:
        # setdefault never overwrites, so the earliest phrase for a head wins.
        phrase_by_head.setdefault(head_lemma, candidate)

  # NOTE(review): an earlier comment here read "should be v[0]". The stored
  # values are the phrases themselves, so indexing them looks unintended -
  # confirm before changing.
  return list(phrase_by_head.values())

In [16]:
# Reduce each extractor's keyword list to one noun phrase per head lemma.
df["keybert_best"] = df["keybert_keywords"].map(select_best_noun_phrases)
df["yake_best"] = df["yake_keywords"].map(select_best_noun_phrases)

In [17]:
# Display the filtered KeyBERT noun phrases for every row.
df["keybert_best"]

0                                 [conversation]
1                   [slow breathing, tense body]
2                                     [thoughts]
3                           [frustrated session]
4                                         [calm]
5                                             []
6                               [tired sleeping]
7     [defensive feel, defensive, misunderstood]
8                                     [feelings]
9                                             []
10                          [small interactions]
11                           [uncomfortable sit]
12                                [restlessness]
13                  [disconnected conversations]
14                                 [reassurance]
15                                        [mood]
16                               [terrible wasn]
17                                            []
18                             [relief, evening]
19                                            []
Name: keybert_best, 

In [18]:
# Display the filtered YAKE noun phrases for every row.
df['yake_best']

0                           [today, head, conversation]
1                            [morning, body, shoulders]
2                               [head, thoughts, paper]
3                                             [session]
4                                 [calm, moment, today]
5                                      [choice, things]
6                                                    []
7     [today, pattern, reactions, misunderstood, def...
8                                            [feelings]
9                                          [day, sense]
10                          [small interactions, today]
11                                           [emotions]
12                           [day, sense, restlessness]
13                                      [conversations]
14                                  [reassurance, part]
15                                        [mood, plans]
16                                              [today]
17                                             [

In [20]:
# Forward slashes keep the output path portable across operating systems.
# NOTE(review): the keyword columns hold Python lists, which CSV stores as their
# string representation - parse them back (e.g. ast.literal_eval) when reloading.
df.to_csv("data/Keyword_NLP/Keyword_NLP.csv", index=False)