In [1]:
import ast
import os
from collections import Counter

import pandas as pd

In [2]:
# Read the cleaned TikTok CSV from the processed-data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_tiktok_data.csv")

In [3]:
# Parse 'cleaned_hashtags' from stringified list to actual list
def _parse_hashtags(value):
    """Return the hashtag list stored in one 'cleaned_hashtags' cell.

    Args:
        value: One cell of the column: the string form of a Python list
            (what the CSV round-trip yields), an already-parsed list, or a
            missing value (NaN / None).

    Returns:
        The parsed list, or an empty list when the value is missing.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            string is not a valid Python literal.
    """
    # Already parsed: this happens when the cell is re-run on a live kernel.
    # The check must come before pd.isna, which returns an array (not a bool)
    # for list input and would make the condition below ambiguous.
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    return ast.literal_eval(value)


# Using the helper keeps this cell idempotent: running it twice leaves the
# column unchanged instead of raising on the already-converted lists.
df["cleaned_hashtags"] = df["cleaned_hashtags"].apply(_parse_hashtags)


In [4]:
# One row per individual hashtag: explode the list column, give it a singular
# name, then keep only rows whose hashtag is present and non-blank.
exploded = (
    df.explode("cleaned_hashtags")
    .rename(columns={"cleaned_hashtags": "hashtag"})
    .loc[lambda frame: frame["hashtag"].notna() & frame["hashtag"].ne("")]
)

# Overall frequency table, most common hashtag first
top_hashtags = (
    exploded["hashtag"]
    .value_counts()
    .rename_axis("hashtag")
    .reset_index(name="count")
)

In [6]:
# Hashtag frequency within each country: countries in ascending order,
# and inside each country the most frequent hashtags first.
country_hashtags = (
    exploded.groupby(["country", "hashtag"])
    .size()
    .rename("count")
    .reset_index()
    .sort_values(by=["country", "count"], ascending=[True, False])
)

# Hashtag frequency split by the 'is_viral' flag: both sort keys descending,
# so the larger 'is_viral' value comes first, most frequent hashtags on top.
viral_hashtags = (
    exploded.groupby(["is_viral", "hashtag"])
    .size()
    .to_frame("count")
    .reset_index()
    .sort_values(by=["is_viral", "count"], ascending=False)
)

# Save to CSV
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise a fresh checkout without
# ../insights fails here with OSError.
os.makedirs("../insights", exist_ok=True)

top_hashtags.to_csv("../insights/hashtags.csv", index=False)
country_hashtags.to_csv("../insights/hashtags_by_country.csv", index=False)
viral_hashtags.to_csv("../insights/hashtags_by_viral.csv", index=False)

# Confirmation listing of the three files written above
print("✅ Hashtag insights saved to:")
print("- insights/hashtags.csv")
print("- insights/hashtags_by_country.csv")
print("- insights/hashtags_by_viral.csv")


✅ Hashtag insights saved to:
- insights/hashtags.csv
- insights/hashtags_by_country.csv
- insights/hashtags_by_viral.csv
