In [1]:
import re
from collections import Counter
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the preprocessed TikTok dataset from disk into a DataFrame
df = pd.read_csv(
    filepath_or_buffer="../data/processed/cleaned_tiktok_data.csv",
)

In [3]:
# Build a single text field per row from the `text` and `videotranscript`
# columns. Missing values are replaced by empty strings first so that the
# concatenation below never produces NaN.
df["text"] = df["text"].fillna("").astype(str)
df["videotranscript"] = df["videotranscript"].fillna("").astype(str)
df["full_text"] = df["text"].str.cat(df["videotranscript"], sep=" ")


In [4]:
# Clean text: lowercase, remove hashtags, mentions, punctuation
def clean_text(text):
    """Normalize a raw text string for keyword extraction.

    The text is lowercased, ``#hashtag`` and ``@mention`` tokens are removed,
    apostrophes are deleted (so a contraction such as "don't" stays one token,
    "dont"), every other punctuation character is turned into a space, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: The raw string to clean. Any non-string value (for example
            ``None`` or a float NaN) is treated as empty text.

    Returns:
        The cleaned string, lowercased, with words separated by single spaces
        and no leading or trailing whitespace.
    """
    # Guard against missing values so the function is safe to apply to a
    # column that has not been through fillna("") / astype(str).
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"#\w+", "", text)  # remove hashtags
    text = re.sub(r"@\w+", "", text)  # remove mentions
    # Delete apostrophes outright so contractions are not split in two.
    text = re.sub(r"['’]", "", text)
    # Replace the remaining punctuation with a space rather than nothing:
    # otherwise words separated only by punctuation ("hello,world") would be
    # fused into one bogus token ("helloworld").
    text = re.sub(r"[^\w\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()  # remove extra spaces
    return text

# Run every combined text through clean_text, one element at a time
df["cleaned_text"] = df["full_text"].map(clean_text)

In [5]:
# Bag-of-words over the cleaned text: English stop words are ignored and the
# vocabulary is limited to at most 1000 terms
vectorizer = CountVectorizer(
    max_features=1000,
    stop_words="english",
)
# X is the document-term count matrix; `keywords` holds the term for each column
X = vectorizer.fit_transform(df["cleaned_text"])
keywords = vectorizer.get_feature_names_out()

In [6]:
# Occurrences of each keyword across all rows: column sums of the
# document-term matrix, flattened to a 1-D array
total_counts = X.sum(axis=0).A1
keyword_freq = pd.DataFrame({"keyword": keywords, "count": total_counts})
# Most frequent keywords first
keyword_freq = keyword_freq.sort_values("count", ascending=False)

In [7]:


# Keyword occurrences restricted to the rows flagged as viral.
# The already-fitted vectorizer is reused so the columns line up with `keywords`.
viral_df = df.loc[df["is_viral"] == True]
viral_counts = vectorizer.transform(viral_df["cleaned_text"]).sum(axis=0).A1
viral_freq = pd.DataFrame({"keyword": keywords, "viral_count": viral_counts})

In [8]:
# Merge both counts: one row per keyword with its overall count, its count
# within viral rows, and the share of its occurrences that are viral
keyword_stats = pd.merge(keyword_freq, viral_freq, on="keyword")
keyword_stats["viral_ratio"] = keyword_stats["viral_count"] / keyword_stats["count"]
# Many keywords share the same ratio (e.g. 0.0 or 1.0). Break those ties by
# overall count so the ordering is deterministic and the better-supported
# keywords come first within each ratio.
keyword_stats = keyword_stats.sort_values(
    by=["viral_ratio", "count"], ascending=False
)

# Save insights
output_path = Path("../insights/viral_keywords.csv")
# Create the output folder if needed so a fresh checkout can run end to end
output_path.parent.mkdir(parents=True, exist_ok=True)
keyword_stats.to_csv(output_path, index=False)

print("✅ Keyword analysis completed and saved to insights/viral_keywords.csv")


✅ Keyword analysis completed and saved to insights/viral_keywords.csv
