In [1]:
import pandas as pd
import os
import re

In [2]:
# Input/output locations (relative paths)
output_path = "../insights/problems.csv"  # keyword-tagging results are written here
data_path = "../data/processed/cleaned_tiktok_data.csv"  # cleaned TikTok posts to analyse

In [3]:
# Load cleaned TikTok data
# Reads the CSV at data_path into df, the frame the keyword-tagging cells below operate on.
df = pd.read_csv(data_path)

In [4]:
# ✅ Use the correct text source
# Prefer the canonical "text" column; otherwise fall back to the first column
# whose name looks like a description / caption / transcript field.
if "text" in df.columns:
    text_col = "text"
else:
    candidate_cols = df.columns[
        df.columns.str.contains("desc|caption|transcript", case=False)
    ].tolist()
    if not candidate_cols:
        # Fail with an actionable message instead of a bare
        # "IndexError: list index out of range".
        raise ValueError(
            "No text column found: expected 'text' or a column name matching "
            f"desc|caption|transcript, but the data has {list(df.columns)}"
        )
    text_col = candidate_cols[0]


In [5]:
# Seed vocabulary of problem phrases to search for (expand this list over time).
# Order matters: matches are reported in the order the keywords appear here.
problem_keywords = [
    "acne",
    "dry skin",
    "oily skin",
    "dark circles",
    "hair loss",
    "burnout",
    "stress",
    "overthinking",
    "low energy",
    "depression",
    "bad wifi",
    "slow internet",
    "sensitive skin",
    "anxiety",
    "breakouts",
    "insomnia",
    "thin hair",
    "bloating",
    "brain fog",
    "irritation",
]


In [6]:
# Normalise the vocabulary to lowercase so it lines up with the lowercased post text
problem_keywords = list(map(str.lower, problem_keywords))

# Function to find problems mentioned
def extract_problems(text, keywords=None):
    """Return the problem keywords that occur in ``text``.

    Matching is a plain substring test against the lowercased text, so a
    keyword also matches inside longer words or hashtags.

    Parameters
    ----------
    text : object
        The post text. Anything that is not a ``str`` (NaN, None, numbers,
        list-like cells, ...) is treated as "no text".
    keywords : iterable of str, optional
        Keywords to look for. Defaults to the module-level
        ``problem_keywords`` list, looked up at call time.

    Returns
    -------
    list of str
        The matching keywords, in the order they appear in ``keywords``.
        Empty when nothing matches or ``text`` is not a string.
    """
    # One isinstance check covers NaN, None and every other non-string value.
    # Calling pd.isna() first would raise ValueError ("truth value of an array
    # is ambiguous") for list-like cells with more than one element.
    if not isinstance(text, str):
        return []
    if keywords is None:
        keywords = problem_keywords
    lowered = text.lower()
    # Compare case-insensitively but report the keyword as given.
    return [kw for kw in keywords if kw.lower() in lowered]

# Apply tagging
df["mentioned_problems"] = df[text_col].apply(extract_problems)
# A post "has a problem" when at least one keyword matched (non-empty list).
df["has_problem"] = df["mentioned_problems"].map(bool)

# Filter down to relevant columns
# Select through text_col, because the text may live in a fallback column
# rather than "text" (hard-coding "text" here raised KeyError in that case).
# Renaming keeps the exported schema stable: the text is always called "text".
# rename() returns a new frame, so problem_df is independent of df.
problem_df = df[
    [text_col, "mentioned_problems", "has_problem", "country", "is_viral"]
].rename(columns={text_col: "text"})


In [7]:
# Save output
# Create the directory that output_path actually points into, so the folder
# being created and the file being written cannot drift apart.
output_dir = os.path.dirname(output_path)
if output_dir:  # os.makedirs("") raises, so skip when there is no directory part
    os.makedirs(output_dir, exist_ok=True)
problem_df.to_csv(output_path, index=False)

print(f"✅ Problems extracted and saved to: {output_path}")



✅ Problems extracted and saved to: ../insights/problems.csv


In [9]:
# Preview up to five flagged posts; the fixed seed keeps the preview reproducible
(
    problem_df
    .loc[problem_df["has_problem"]]
    .sample(n=min(5, problem_df["has_problem"].sum()), random_state=42)
)


Unnamed: 0,text,mentioned_problems,has_problem,country,is_viral
213,social anxiety fears me #outfit #milano #italy...,[anxiety],True,Italy,False


In [10]:
from collections import Counter
from itertools import chain

# Tally keyword occurrences across every post's tag list, most frequent first
problem_counts = Counter(
    keyword
    for tagged in problem_df["mentioned_problems"]
    for keyword in tagged
)
print(problem_counts.most_common())


[('anxiety', 1)]


In [11]:
pip install keybert


Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m530.4 kB/s[0m eta [36m0:00:00[0m00:01[0m
Collecting sentence-transformers>=0.3.8
  Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: sentence-transformers, keybert
Successfully installed keybert-0.9.0 sentence-transformers-5.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
from keybert import KeyBERT
import pandas as pd

# Load data
df = pd.read_csv("../data/processed/cleaned_tiktok_data.csv")
# Drop rows with no text. .copy() detaches the filtered frame so the column
# assignments below write to a standalone DataFrame; assigning onto the
# filtered slice directly can raise pandas' SettingWithCopyWarning.
df = df[df["text"].notna()].copy()

# Init model
kw_model = KeyBERT()

# Run keyword extraction on every remaining post (top 3 keyphrases per post).
# NOTE(review): stop_words="english" only filters English tokens, while the
# previewed posts are largely Japanese — confirm this is intended.
df["keyphrases"] = df["text"].apply(
    lambda post: kw_model.extract_keywords(post, top_n=3, stop_words="english")
)

# Preview
df[["text", "keyphrases"]].head()


2025-07-25 01:45:43.030513: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Unnamed: 0,text,keyphrases
0,みんなのおすすめなTシャツブランド教えて\n#street #ootd #outfit #f...,"[(みんなのおすすめなtシャツブランド教えて, 0.7021), (ストリートファッション,..."
1,夏服に迷ってる人必見個人的におすすめのTシャツ6選✨\n\n#ファッション #ストリートファ...,"[(夏服に迷ってる人必見個人的におすすめのtシャツ6選, 0.8537), (夏服, 0.6..."
2,とうとう無地Tの季節がやってきましたね！\nみなさん好きな無地Tブランドはなんですか！\n#...,"[(みなさん好きな無地tブランドはなんですか, 0.8022), (とうとう無地tの季節がや..."
3,綺麗なAラインシルエットが作れるコスパ最強デニムはここ#デニム #バギーデニム #ストリート...,"[(綺麗なaラインシルエットが作れるコスパ最強デニムはここ, 0.8916), (バギーデニ..."
4,ストリートファッション女性編\n#ファッション #おしゃれ #cityboy #ストリートフ...,"[(ストリートファッション女性編, 0.7955), (ストリートファッション, 0.723..."


In [14]:
# Marker words that flag a keyphrase as describing a problem or complaint
problem_words = [
    # Japanese
    "迷ってる",
    "困る",
    "嫌い",
    "悩み",
    "ストレス",
    "失敗",
    "できない",
    "難しい",
    # English
    "confused",
    "stressed",
    "hate",
    "problem",
    "struggle",
    "fail",
    "cannot",
    "hard",
    "bad",
]


In [15]:
def is_problem_phrase(phrases, words=None):
    """Tell whether any extracted keyphrase contains a problem marker word.

    Matching is a plain, case-sensitive substring test.

    Parameters
    ----------
    phrases : list or tuple of (phrase, score) pairs
        Keyphrases as produced by ``kw_model.extract_keywords``. Any other
        value (NaN, None, ...) is treated as "no keyphrases".
    words : iterable of str, optional
        Marker words to look for. Defaults to the module-level
        ``problem_words`` list, looked up at call time.

    Returns
    -------
    bool
        True if at least one string phrase contains at least one marker word.
    """
    # Cells without a keyphrase list (e.g. NaN) used to raise TypeError when
    # iterated; they simply contain no problem phrase.
    if not isinstance(phrases, (list, tuple)):
        return False
    if words is None:
        words = problem_words
    return any(
        marker in phrase
        for phrase, _score in phrases
        if isinstance(phrase, str)  # skip malformed entries whose phrase is not text
        for marker in words
    )

# Flag each post whose keyphrases contain a problem marker word
df["is_problem"] = df["keyphrases"].map(is_problem_phrase)


In [16]:
# Write the post text, its keyphrases and the problem flag to CSV (no index column)
df.to_csv(
    "../insights/problems_nlp.csv",
    columns=["text", "keyphrases", "is_problem"],
    index=False,
)
print("✅ Saved problem analysis (NLP) to problems_nlp.csv")


✅ Saved problem analysis (NLP) to problems_nlp.csv


In [17]:
# Count how many posts were flagged (True) vs. not flagged (False) by is_problem_phrase
df["is_problem"].value_counts()


is_problem
False    380
True       2
Name: count, dtype: int64