# Exploratory Analysis: Best Time to Post

This notebook inspects engagement patterns (hour/day) and trending hashtags/audio for the cleaned TikTok dataset.


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

# Use seaborn's whitegrid theme for every chart drawn below.
sns.set_theme(style="whitegrid")

# The feature table is read from a parquet file at this relative path; the
# error message below names the script expected to produce it.
data_path = Path("data/features/training_set.parquet")

if data_path.exists():
    df = pd.read_parquet(data_path)
else:
    # Fail early with an actionable hint instead of a bare parquet read error.
    raise FileNotFoundError(
        f"{data_path} not found. Run src/features/build_features.py before this notebook."
    )

# Last expression of the cell: the notebook renders the first rows.
df.head()


In [None]:
# Per posting hour: mean of target_metric and mean of is_viral.
hourly = df.groupby("created_hour").agg(
    mean_target=("target_metric", "mean"),
    viral_rate=("is_viral", "mean"),
)
hourly = hourly.reset_index()

# Two side-by-side bar charts with independent y scales, one per metric.
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=False)
panels = (
    ("mean_target", "Avg target metric by hour", "Avg plays per hour"),
    ("viral_rate", "Viral probability by hour", "P(is_viral)"),
)
for panel_ax, (metric, title, y_label) in zip(axes, panels):
    sns.barplot(x="created_hour", y=metric, data=hourly, ax=panel_ax)
    panel_ax.set_title(title)
    panel_ax.set_xlabel("Hour of day (UTC)")
    panel_ax.set_ylabel(y_label)

plt.tight_layout()
plt.show()


In [None]:
# Short display labels for the integer weekday codes in `created_weekday`.
# Dict insertion order doubles as the display order (Mon .. Sun).
weekday_map = {
    0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"
}
df["weekday_label"] = df["created_weekday"].map(weekday_map)

# groupby sorts labels alphabetically, so reindex back into calendar order.
# Weekdays absent from the data show up as NaN rows rather than disappearing.
weekday = (df.groupby("weekday_label")
           .agg(mean_target=("target_metric", "mean"), viral_rate=("is_viral", "mean"))
           .reindex(list(weekday_map.values())))

fig, ax = plt.subplots(figsize=(8, 4))
# Bind both lines to `ax` explicitly instead of relying on pyplot's implicit
# "current axes", so the chart stays correct if another figure is created
# between the subplots() call and the plotting calls.
sns.lineplot(data=weekday, x=weekday.index, y="mean_target", marker="o",
             label="Avg target", ax=ax)
sns.lineplot(data=weekday, x=weekday.index, y="viral_rate", marker="o",
             label="P(is_viral)", ax=ax)
ax.set_title("Performance by weekday")
ax.set_xlabel("Weekday")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
def explode_hashtags(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` that always has a ``hashtags_list`` column.

    Despite the name, no rows are exploded here; the function only prepares
    the column so that ``DataFrame.explode("hashtags_list")`` can be applied
    to the result.

    Column resolution:

    * ``hashtags_list`` already present: the copy is returned unchanged.
    * only ``hashtags`` present: ``hashtags_list`` is that column with
      missing values replaced by ``""``. Values are otherwise passed through
      as-is (strings are not split into individual tags).
    * neither present: ``hashtags_list`` is an empty list for every row.

    Args:
        df: Input frame. It is never modified.

    Returns:
        A new DataFrame containing every column of ``df`` plus
        ``hashtags_list``.
    """
    # Copy once up front so that no branch can write to the caller's frame.
    out = df.copy()
    if "hashtags_list" in out:
        return out
    if "hashtags" in out:
        out["hashtags_list"] = out["hashtags"].fillna("")
    else:
        # A fresh list per row, so rows never share one mutable object.
        out["hashtags_list"] = [[] for _ in range(len(out))]
    return out

def _normalized_hashtags(frame):
    """Return one lower-cased, stripped hashtag per exploded row of ``frame``.

    Blank entries are dropped: ``explode_hashtags`` writes ``""`` for rows
    whose ``hashtags`` value is missing, and counting that placeholder would
    rank "no hashtag" as if it were a tag.
    """
    tags = frame.explode("hashtags_list")["hashtags_list"].str.lower().str.strip()
    return tags[tags.ne("")]


hashtags_df = explode_hashtags(df)

# Overall ranking. value_counts() skips NaN, which is what explode() emits
# for rows with an empty list.
hashtag_counts = _normalized_hashtags(hashtags_df)
top_hashtags = hashtag_counts.value_counts().head(10)

top_hashtags.plot(kind="bar", figsize=(8,4), title="Top hashtags in dataset")
plt.ylabel("Count")
plt.show()

# Viral-only ranking, normalised exactly like the overall one so the two
# charts are directly comparable (same casing, same whitespace handling).
top_viral = (_normalized_hashtags(hashtags_df[hashtags_df["is_viral"] == 1])
             .value_counts().head(10))

top_viral.plot(kind="bar", figsize=(8,4), title="Top hashtags in viral posts")
plt.ylabel("Count")
plt.show()


In [None]:
# Per audio track: mean target metric and number of rows. dropna=False keeps
# rows with a missing `music` value together as their own group.
audio_stats = df.groupby("music", dropna=False).agg(
    plays_avg=("target_metric", "mean"),
    count=("music", "size"),
)
# Keep the ten tracks with the highest average.
audio_stats = audio_stats.sort_values(by="plays_avg", ascending=False)
audio_stats = audio_stats.head(10)

audio_stats.plot(kind="bar", y="plays_avg", figsize=(8,4), legend=False)
plt.title("Top audio tracks by avg target metric")
plt.xlabel("Audio title")
plt.ylabel("Avg plays per hour")
# Slant the tick labels so longer track titles do not overlap.
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
