In [1]:
import re
from pathlib import Path

import pandas as pd

In [2]:
# Read the cleaned TikTok CSV from the processed-data folder into `df`
cleaned_csv_path = "../data/processed/cleaned_tiktok_data.csv"
df = pd.read_csv(cleaned_csv_path)

In [3]:
# Normalise both text columns: missing values become "" and every entry is cast to str
for text_column in ("text", "videotranscript"):
    df[text_column] = df[text_column].fillna("").astype(str)

In [4]:
# Build one searchable string per row for format detection:
# the "text" column, a single space, then the "videotranscript" column
df["combined_text"] = df["text"].add(" ").add(df["videotranscript"])

# Keyword phrases that signal each video format.
# Entry order matters: detect_format walks this mapping in order and returns
# the first format that has a matching keyword.
format_keywords = {
    "tutorial": [
        "how to",
        "step by step",
        "tutorial",
        "guide",
        "tips",
    ],
    "transition": [
        "transition",
        "before and after",
        "watch me",
        "from this to this",
    ],
    "voiceover": [
        "voice over",
        "narrating",
        "explaining",
        "talk over",
    ],
    "talking_head": [
        "i think",
        "i feel",
        "in my opinion",
        "let me tell you",
    ],
    "unboxing": [
        "unbox",
        "unboxing",
        "open this",
        "got this today",
    ],
    "outfit_showcase": [
        "outfit",
        "ootd",
        "get ready",
        "fit check",
        "lookbook",
    ],
    "vlog": [
        "day in the life",
        "come with me",
        "follow me",
        "my morning",
        "spend the day",
    ],
}

In [5]:
# Format detection function
def detect_format(text, keyword_map=None):
    """Classify a piece of text into a video format by keyword matching.

    The text is lower-cased and every keyword is searched for as a whole
    word/phrase (regex word boundaries on both sides, keyword escaped so it
    is matched literally). Formats are tried in the mapping's iteration order
    and the first format with any matching keyword is returned, so earlier
    entries take priority over later ones.

    Args:
        text: Text to classify. Non-string values (e.g. ``None`` or a float
            NaN from a missing cell) are reported as ``"unknown"`` instead of
            raising ``AttributeError`` on ``.lower()``.
        keyword_map: Optional mapping of format name -> iterable of keyword
            phrases. When omitted, the notebook-level ``format_keywords`` is
            used. Keywords are compared against lower-cased text, so they
            should be lower-case themselves.

    Returns:
        The name of the first matching format, or ``"unknown"`` when no
        keyword matches.
    """
    if not isinstance(text, str):
        return "unknown"

    # Resolve the default at call time so later edits to format_keywords are picked up.
    if keyword_map is None:
        keyword_map = format_keywords

    lowered = text.lower()
    for fmt, keywords in keyword_map.items():
        if any(re.search(rf"\b{re.escape(kw)}\b", lowered) for kw in keywords):
            return fmt
    return "unknown"

# Apply detection: label every row from its combined text
df["video_format"] = df["combined_text"].apply(detect_format)

# Save result
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first (otherwise the write raises OSError).
formats_csv_path = Path("../insights/formats.csv")
formats_csv_path.parent.mkdir(parents=True, exist_ok=True)

export_columns = ["text", "videotranscript", "video_format", "country", "is_viral"]
df[export_columns].to_csv(formats_csv_path, index=False)

print("✅ Video formats extracted and saved to insights/formats.csv")


✅ Video formats extracted and saved to insights/formats.csv
