In [11]:
import pandas as pd
import re
import os



In [12]:
# Paths
data_path = "../data/processed/cleaned_tiktok_data.csv"
output_path = "../insights/hooks.csv"

In [13]:
# Load data
df = pd.read_csv(data_path)

In [14]:
# Preview columns to choose best source
print("Available columns:", df.columns[df.columns.str.contains("transcript|caption|desc", case=False)].tolist())


Available columns: ['videotranscript']


In [15]:
# 🔍 Find potential text columns
text_cols = [col for col in df.columns if any(k in col.lower() for k in ["desc", "transcript", "caption", "text"])]
print("Potential text columns:", text_cols)

# 🔍 Preview content
sample_texts = df[text_cols].dropna(how="all").sample(5, random_state=1)
print("\nSample content from potential text columns:")
print(sample_texts)

# ✅ Choose the column with actual text
text_col = None
for col in text_cols:
    if df[col].dropna().str.len().mean() > 10:
        text_col = col
        print(f"\n✅ Using '{col}' as text source.")
        break

# Fallback if none found
if not text_col:
    print("\n⚠️ No valid text column found! Defaulting to 'desc'")
    text_col = "desc"
    df[text_col] = df.get(text_col, "")


Potential text columns: ['text', 'textlanguage', 'videotranscript']

Sample content from potential text columns:
                                                  text textlanguage  \
81   前後2wayで楽しめるプチプラブラウス♡\n@ClassicalElf  クラシカルエルフ ...           ja   
165  3 business fashion hacks @MOTF  15% off DC C0D...           en   
351  What should I name this series frennnn? This f...           en   
119  3 Outfit Mistakes That Instantly Age you. You ...           en   
379  Linked in LTK wearing @Revolve @Edikted @Tony ...           en   

                                       videotranscript  
81            プチプラで見つけたおしりまで隠れるテプラム全国入絵でつめてとっても可愛いこれだよ  
165   Stop wearing your top like this. It looks bor...  
351   I want to do a series where I show you all ho...  
119   Three outfits mistake that instantly itch you...  
379   I ordered these for the trip but they just ca...  

✅ Using 'text' as text source.


In [16]:
# --- Hook Extraction ---

def extract_hook(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""

    # Use first sentence or first 12 words
    sentence = re.split(r'[.?!\n]', text)[0]  # first sentence
    words = sentence.strip().split()
    return " ".join(words[:12])  # limit to 12 words

def classify_hook(hook):
    if "?" in hook or hook.lower().startswith(("what", "why", "how", "did", "do", "can", "have")):
        return "question"
    elif any(hook.lower().startswith(word) for word in ["stop", "never", "you need", "warning", "don’t"]):
        return "strong statement"
    elif hook.lower().startswith(("storytime", "so this happened", "once", "i remember")):
        return "story"
    elif len(hook.split()) < 4:
        return "short punch"
    else:
        return "neutral"


In [17]:
# Apply extraction and classification
df["hook_text"] = df[text_col].apply(extract_hook)
df["hook_type"] = df["hook_text"].apply(classify_hook)

In [18]:
# Output relevant columns
hook_df = df[["hook_text", "hook_type", "country", "is_viral"]].copy()

In [19]:
# Save to CSV
os.makedirs("../insights", exist_ok=True)
hook_df.to_csv(output_path, index=False)

print(f"✅ Hooks extracted and saved to {output_path}")
hook_df.head()

✅ Hooks extracted and saved to ../insights/hooks.csv


Unnamed: 0,hook_text,hook_type,country,is_viral
0,みんなのおすすめなTシャツブランド教えて,short punch,Japan,False
1,夏服に迷ってる人必見個人的におすすめのTシャツ6選✨,short punch,Japan,False
2,とうとう無地Tの季節がやってきましたね！,short punch,Japan,False
3,綺麗なAラインシルエットが作れるコスパ最強デニムはここ#デニム #バギーデニム #ストリート...,short punch,Japan,False
4,ストリートファッション女性編,short punch,Japan,False


In [20]:
df["hook_type"].value_counts()

hook_type
neutral        233
short punch    133
question        23
story            3
Name: count, dtype: int64