In [1]:
import pandas as pd
import numpy as np
import re
import joblib
from pathlib import Path
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore', category=UserWarning)

# ===== CONFIGURATION =====
INPUT_PATH = "/kaggle/input/data-cleaning/comments_enriched.parquet"
OUTPUT_DIR = "/kaggle/working"
SEGMENT_LABELS_PATH = f"{OUTPUT_DIR}/segments_labels.csv"
ARTIFACTS_DIR = f"{OUTPUT_DIR}/artifacts"
Path(ARTIFACTS_DIR).mkdir(parents=True, exist_ok=True)

# ===== 1. LOAD DATA =====
print(f"[INFO] Loading comments_enriched.parquet...")
df = pd.read_parquet(INPUT_PATH)
print(f"[INFO] Loaded dataset shape: {df.shape}")

[INFO] Loading comments_enriched.parquet...
[INFO] Loaded dataset shape: (4725012, 17)


In [2]:
# ===== 2. FEATURE ENGINEERING =====
print("[INFO] Starting feature engineering...")

# Text length feature
df['text_len'] = df['text_norm'].str.len().fillna(0)

# Time-of-day buckets
def time_of_day(hour):
    if 5 <= hour < 12: return 'morning'
    elif 12 <= hour < 17: return 'afternoon'
    elif 17 <= hour < 22: return 'evening'
    else: return 'night'
df['time_of_day'] = df['hour'].apply(time_of_day)

# Slang detection
def detect_slang(text):
    if not isinstance(text, str) or len(text) < 5:
        return 0
    slang_indicators = [
        r'\bu\b', r'\br\b', r'\bl8r\b', r'\n2\b', r'\bthx\b', r'\bomg\b', r'\byo\b',
        r'\bbtw\b', r'\bimo\b', r'\bfomo\b', r'\bsmh\b', r'\btbh\b', r'\bily\b',
        r'\bfr\b', r'\bngl\b', r'\bdeez\b', r'\bwyd\b', r'\bwyll\b', r'\bwyg\b',
        r'\bperiodt\b', r'\bstan\b', r'\bbussin\b', r'\bsus\b', r'\bglowup\b'
    ]
    score = sum(1 for p in slang_indicators if re.search(p, text.lower()))
    if len(text) < 30:
        score += 1
    return min(score, 5)
df['slang_score'] = df['text_norm'].apply(detect_slang)

# Topic extraction from hashtags
def extract_topics(hashtags):
    if hashtags is None or (isinstance(hashtags, float) and pd.isna(hashtags)):
        return []
    if hasattr(hashtags, 'tolist'):
        hashtags = hashtags.tolist()
    if isinstance(hashtags, str):
        if hashtags.startswith('[') and hashtags.endswith(']'):
            try:
                import ast
                hashtags = ast.literal_eval(hashtags)
            except:
                hashtags = [hashtags]
        else:
            hashtags = [hashtags]
    if not isinstance(hashtags, list):
        return []
    common_tags = {'love', 'like', 'comment', 'share', 'video', 'youtube', 'subscribe'}
    beauty_topics = {
        'skincare': ['skincare', 'skincareroutine', 'skincaretips', 'skincarehacks'],
        'makeup': ['makeup', 'makeuproutine', 'makeuptutorial', 'makeuplook'],
        'haircare': ['haircare', 'hairroutin', 'hairtutorial', 'hairgoals'],
        'beauty': ['beauty', 'beautytips', 'beautyhacks', 'beautyblogger'],
        'fashion': ['fashion', 'outfit', 'style', 'outfitideas'],
        'lifestyle': ['lifestyle', 'dailyroutine', 'dayinmylife', 'lifetips']
    }
    topics = []
    for tag in hashtags:
        if not isinstance(tag, str):
            continue
        tag = tag.lower().replace('#', '')
        if tag in common_tags:
            continue
        for category, keywords in beauty_topics.items():
            if any(keyword in tag for keyword in keywords):
                topics.append(category)
                break
        else:
            topics.append(tag)
    return list(set(topics))
df['topics'] = df['hashtags'].apply(extract_topics)

[INFO] Starting feature engineering...


In [3]:
# ===== 3. FEATURE SELECTION & PREPROCESSING =====
print("[INFO] Preparing features for GMM...")

# Base features
features = df[['emoji_count', 'text_len', 'hour', 'lang', 'time_of_day']].copy()

# --- ENHANCED FEATURE ENGINEERING ---
# Emoji density (better than raw count)
word_counts = df['text_norm'].str.split().str.len().replace(0, 1)
features['emoji_per_word'] = (features['emoji_count'] / word_counts).fillna(0)
features['is_high_emoji'] = (features['emoji_count'] >= 2).astype(int)

# Short comment indicator
features['is_short_comment'] = (df['text_len'] <= 30).astype(int)

# Slang count
slang_terms = [
    'bussin', 'rizz', 'cap', 'no cap', 'gyat', 'gyatt', 'sigma', 'skibidi', 'grimace', 'fanum',
    'periodt', 'slay', 'delulu', 'cheugy', 'vibe', 'ate', 'snatched', 'glowup', 'stan', 'sus',
    'fr', 'tbh', 'omg', 'u', 'r', 'l8r', 'wyd', 'wyll', 'ily', 'bff', 'fomo', 'smh'
]
df['slang_count'] = df['text_norm'].str.lower().apply(
    lambda text: sum(1 for term in slang_terms if term in text)
)
features['slang_count'] = df['slang_count']
features['is_high_slang'] = (df['slang_count'] >= 1).astype(int)

# Night owl behavior
features['is_night_owl'] = ((df['hour'] >= 22) | (df['hour'] <= 4)).astype(int)

# Circular time encoding (important for GMM)
features['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
features['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)

# Beauty keyword focus
beauty_keywords = [
    'skincare', 'makeup', 'routine', 'tutorial', 'review', 'hack', 'tip', 'look', 'glow', 'glass',
    'filter', 'foundation', 'concealer', 'eyeshadow', 'lipstick', 'blush', 'serum', 'moisturizer',
    'cleanser', 'toner', 'mask', 'acne', 'pimple', 'dark spot', 'brighten', 'exfoliate'
]
df['has_beauty_keyword'] = df['text_norm'].str.lower().apply(
    lambda text: any(kw in text for kw in beauty_keywords)
).astype(int)
features['has_beauty_keyword'] = df['has_beauty_keyword']

# Long-form content
features['is_long_comment'] = (df['text_len'] > 100).astype(int)

# Drop raw features
features.drop(['emoji_count'], axis=1, inplace=True)

# Final feature sets
numeric_features = ['emoji_per_word', 'slang_count', 'hour_sin', 'hour_cos']
categorical_features = [
    'is_high_emoji', 'is_short_comment', 'is_high_slang',
    'is_night_owl', 'is_long_comment', 'has_beauty_keyword',
    'lang', 'time_of_day'
]

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
], remainder='drop')

# Fit and transform
X = preprocessor.fit_transform(features)
print(f"[OK] Feature matrix shape: {X.shape}")

[INFO] Preparing features for GMM...
[OK] Feature matrix shape: (4725012, 76)


In [4]:
# ===== 4. OPTIMAL COMPONENT SELECTION (on sample) =====
print("[INFO] Finding optimal number of components with BIC...")

# Downsample for efficiency
SAMPLE_SIZE = 100_000
if len(X) > SAMPLE_SIZE:
    print(f"[INFO] Sampling {SAMPLE_SIZE} rows...")
    np.random.seed(42)
    sample_idx = np.random.choice(len(X), SAMPLE_SIZE, replace=False)
    X_sample = X[sample_idx]
else:
    X_sample = X

# Test components using BIC (better than silhouette for GMM)
n_components_range = range(3, 6)
bics = []

for n in n_components_range:
    print(f"[INFO] Fitting GMM with n_components={n}...")
    gmm = GaussianMixture(n_components=n, covariance_type='full', random_state=42, max_iter=500)
    gmm.fit(X_sample)
    bics.append(gmm.bic(X_sample))
    print(f"  → BIC = {bics[-1]:.2f}")

optimal_k = n_components_range[np.argmin(bics)]
print(f"[OK] Selected optimal k = {optimal_k} (lowest BIC)")

[INFO] Finding optimal number of components with BIC...
[INFO] Sampling 100000 rows...
[INFO] Fitting GMM with n_components=3...
  → BIC = -49974632.39
[INFO] Fitting GMM with n_components=4...
  → BIC = -52188005.14
[INFO] Fitting GMM with n_components=5...
  → BIC = -51069638.67
[OK] Selected optimal k = 4 (lowest BIC)


In [5]:
# ===== 5. FINAL GMM FITTING ON FULL DATA =====
print(f"[INFO] Fitting final GMM on full data with n_components={optimal_k}...")
gmm_final = GaussianMixture(
    n_components=optimal_k,
    covariance_type='full',
    random_state=42,
    max_iter=500
)
gmm_final.fit(X)

# Soft assignment (probabilities) + hard labels
proba = gmm_final.predict_proba(X)  # shape: (N, k)
labels = gmm_final.predict(X)
confidence = proba.max(axis=1)  # Highest probability = confidence

[INFO] Fitting final GMM on full data with n_components=4...


In [6]:
# ===== 6. INTERPRET CLUSTERS & ASSIGN SEGMENTS (Robust Version) =====
print("[INFO] Interpreting clusters using GMM means (Robust Heuristics)...")

# Get cluster centers
cluster_centers = gmm_final.means_
print(f"[INFO] Cluster centers shape: {cluster_centers.shape}")

# Get feature names after preprocessing
numeric_feature_names = numeric_features
categorical_feature_names_out = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = list(numeric_feature_names) + list(categorical_feature_names_out)

# Create index map
feature_index_map = {name: i for i, name in enumerate(feature_names)}

# --- Extract key metrics for all clusters first ---
cluster_profiles = []
print("\n[INFO] Analyzing raw cluster characteristics...")
for i in range(optimal_k):
    center = cluster_centers[i]

    # Extract features
    slang_count = center[feature_index_map['slang_count']]
    emoji_per_word = center[feature_index_map['emoji_per_word']]
    hour_sin = center[feature_index_map['hour_sin']]
    hour_cos = center[feature_index_map['hour_cos']]
    
    # Convert circular time to hour (0–23)
    hour_angle_rad = np.arctan2(hour_sin, hour_cos)
    hour = (hour_angle_rad * 12 / np.pi) % 24
    if hour < 0:
        hour += 24

    # Find binary feature indices (robust lookup)
    def find_feature(name_part):
        for name in feature_names:
            if name_part in name:
                return name
        return None

    long_comment_feature = find_feature('is_long_comment')
    beauty_keyword_feature = find_feature('has_beauty_keyword')
    
    is_long_comment_prob = center[feature_index_map[long_comment_feature]] if long_comment_feature else 0.0
    has_beauty_keyword_prob = center[feature_index_map[beauty_keyword_feature]] if beauty_keyword_feature else 0.0

    # Store profile
    profile = {
        'cluster_id': i,
        'slang_count': slang_count,
        'emoji_per_word': emoji_per_word,
        'hour': hour,
        'is_long_comment': is_long_comment_prob,
        'has_beauty_keyword': has_beauty_keyword_prob
    }
    cluster_profiles.append(profile)
    
    # Print diagnostics
    print(f"  Cluster {i}: "
          f"slang={slang_count:.2f}, "
          f"emoji/word={emoji_per_word:.2f}, "
          f"hour={hour:.1f}, "
          f"is_long_text={is_long_comment_prob:.2f}, "
          f"has_beauty_kw={has_beauty_keyword_prob:.2f}")

# --- Assign labels based on relative differences ---
# 1. Find Gen Z candidate: highest slang, late night, lowest emoji
genz_score = []
for p in cluster_profiles:
    # Gen Z score: high slang, late hour (penalize early hours), low emoji
    score = p['slang_count'] + (1 if (p['hour'] > 20 or p['hour'] < 5) else -1) - p['emoji_per_word']
    genz_score.append((p['cluster_id'], score))

# Sort by score descending, top candidate is Gen Z
genz_score.sort(key=lambda x: x[1], reverse=True)
genz_cluster_id = genz_score[0][0]

# 2. Find Millennial candidates: daytime, moderate slang, moderate emojis
millennial_score = []
for p in cluster_profiles:
    # Millennial score: daytime, moderate slang, moderate emojis
    hour_score = 1 if (8 <= p['hour'] <= 18) else 0
    slang_score = 1 if (0.1 < p['slang_count'] < 1.5) else 0
    emoji_score = 1 if (p['emoji_per_word'] > -0.5) else 0 # More positive than others
    score = hour_score + slang_score + emoji_score
    millennial_score.append((p['cluster_id'], score))

# Sort and pick the best non-Gen Z cluster
millennial_score.sort(key=lambda x: x[1], reverse=True)
millennial_cluster_id = None
for cid, _ in millennial_score:
    if cid != genz_cluster_id:
        millennial_cluster_id = cid
        break

# 3. Assign labels
segment_mapping = {}
for p in cluster_profiles:
    cid = p['cluster_id']
    if cid == genz_cluster_id:
        segment_mapping[cid] = "GenZ"
    elif cid == millennial_cluster_id:
        segment_mapping[cid] = "Millennial"
    elif p['has_beauty_keyword'] > 0.7: # High topic focus
        segment_mapping[cid] = "InterestDriven"
    else:
        segment_mapping[cid] = "Other"

print(f"\n[INFO] Cluster-to-Segment Mapping (Robust): {segment_mapping}")
print(f"[OK] Assigned Gen Z to Cluster {genz_cluster_id}")
if millennial_cluster_id is not None:
    print(f"[OK] Assigned Millennial to Cluster {millennial_cluster_id}")
else:
    print("[WARNING] Could not confidently assign a Millennial cluster.")

# --- Final Validation ---
genz_candidates = [cid for cid, seg in segment_mapping.items() if seg == "GenZ"]
if not genz_candidates:
    print("[CRITICAL] No Gen Z cluster assigned. Review data or features.")

[INFO] Interpreting clusters using GMM means (Robust Heuristics)...
[INFO] Cluster centers shape: (4, 76)

[INFO] Analyzing raw cluster characteristics...
  Cluster 0: slang=-1.21, emoji/word=0.31, hour=5.3, is_long_text=1.00, has_beauty_kw=0.97
  Cluster 1: slang=0.49, emoji/word=-0.14, hour=10.6, is_long_text=0.88, has_beauty_kw=0.80
  Cluster 2: slang=0.56, emoji/word=-0.18, hour=22.6, is_long_text=0.86, has_beauty_kw=0.81
  Cluster 3: slang=-1.20, emoji/word=0.43, hour=16.1, is_long_text=0.98, has_beauty_kw=0.97

[INFO] Cluster-to-Segment Mapping (Robust): {0: 'InterestDriven', 1: 'Millennial', 2: 'GenZ', 3: 'InterestDriven'}
[OK] Assigned Gen Z to Cluster 2
[OK] Assigned Millennial to Cluster 1


In [7]:
# ===== 7. GENERATE segments_labels.csv =====
print("[INFO] Generating segments_labels.csv...")

# Create per-commenter labels DataFrame
# Select only the necessary base columns from the original df to avoid column conflicts
# Assuming 'commentId' and 'videoId' are in the original df
segment_labels = df[["commentId", "videoId"]].copy()
segment_labels["segment_id"] = labels
segment_labels["segment"] = pd.Series(labels).map(segment_mapping)

# Add confidence score (max probability from GMM)
segment_labels["confidence"] = confidence

# Optional: Add back some key features for debugging/diagnostics if needed
# segment_labels["slang_count"] = df["slang_count"]
# segment_labels["emoji_per_word"] = (df['emoji_count'] / df['text_norm'].str.split().str.len().replace(0, 1)).fillna(0)

# Ensure output directory exists
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Save the per-commenter segment labels
segment_labels.to_csv(SEGMENT_LABELS_PATH, index=False)
print(f"[OK] Saved segments_labels.csv to {SEGMENT_LABELS_PATH}")
print(f"      Shape: {segment_labels.shape}")
print(f"      Columns: {list(segment_labels.columns)}")
print(f"      Sample rows:")
print(segment_labels.head())

[INFO] Generating segments_labels.csv...
[OK] Saved segments_labels.csv to /kaggle/working/segments_labels.csv
      Shape: (4725012, 5)
      Columns: ['commentId', 'videoId', 'segment_id', 'segment', 'confidence']
      Sample rows:
   commentId  videoId  segment_id         segment  confidence
0    1781382    74288           2            GenZ         1.0
1     289571    79618           1      Millennial         1.0
2     569077    51826           1      Millennial         1.0
3    2957962    58298           3  InterestDriven         1.0
4     673093     1265           0  InterestDriven         1.0


In [8]:
# ===== 8. GENERATE segments_video.csv (Aggregated per-video) =====
print("[INFO] Generating segments_video.csv (aggregated per video)...")

# Aggregate to get the percentage mix per video
# Using groupby and apply with lambda for flexible calculations
video_segments = segment_labels.groupby("videoId").agg(
    total_comments=("commentId", "count"),
    genz_pct=("segment", lambda x: (x == "GenZ").sum() / len(x) * 100),
    millennial_pct=("segment", lambda x: (x == "Millennial").sum() / len(x) * 100),
    interest_pct=("segment", lambda x: (x == "InterestDriven").sum() / len(x) * 100),
    other_pct=("segment", lambda x: (x == "Other").sum() / len(x) * 100),
    top_interest=("segment", lambda x: x.mode()[0] if not x.mode().empty else "Other")
).reset_index()

# Ensure output directory exists (redundant but safe)
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Save the per-video aggregated segments
VIDEO_SEGMENTS_PATH = f"{OUTPUT_DIR}/segments_video.csv"
video_segments.to_csv(VIDEO_SEGMENTS_PATH, index=False)
print(f"[OK] Saved segments_video.csv to {VIDEO_SEGMENTS_PATH}")
print(f"      Shape: {video_segments.shape}")
print(f"      Columns: {list(video_segments.columns)}")
print(f"      Sample rows:")
print(video_segments.head())

[INFO] Generating segments_video.csv (aggregated per video)...
[OK] Saved segments_video.csv to /kaggle/working/segments_video.csv
      Shape: (39938, 7)
      Columns: ['videoId', 'total_comments', 'genz_pct', 'millennial_pct', 'interest_pct', 'other_pct', 'top_interest']
      Sample rows:
   videoId  total_comments   genz_pct  millennial_pct  interest_pct  \
0        0             526   32.69962       33.460076     33.840304   
1        1               1    0.00000        0.000000    100.000000   
2        2               1  100.00000        0.000000      0.000000   
3        6               5   80.00000       20.000000      0.000000   
4        7               1    0.00000        0.000000    100.000000   

   other_pct    top_interest  
0        0.0  InterestDriven  
1        0.0  InterestDriven  
2        0.0            GenZ  
3        0.0            GenZ  
4        0.0  InterestDriven  


In [9]:
# ===== 9. SAVE MODEL ARTIFACTS =====
print("[INFO] Saving model artifacts...")

# Ensure artifacts directory exists
Path(ARTIFACTS_DIR).mkdir(parents=True, exist_ok=True)

# 1. Save the fitted preprocessor (scaler, encoder)
PREPROCESSOR_PATH = f"{ARTIFACTS_DIR}/scaler.pkl"
joblib.dump(preprocessor, PREPROCESSOR_PATH)
print(f"[OK] Saved preprocessor to {PREPROCESSOR_PATH}")

# 2. Save the fitted final GMM model
GMM_MODEL_PATH = f"{ARTIFACTS_DIR}/segments_model.pkl"
joblib.dump(gmm_final, GMM_MODEL_PATH)
print(f"[OK] Saved GMM model to {GMM_MODEL_PATH}")

# 3. Save the segment mapping dictionary
SEGMENT_MAPPING_PATH = f"{ARTIFACTS_DIR}/segment_mapping.pkl"
joblib.dump(segment_mapping, SEGMENT_MAPPING_PATH)
print(f"[OK] Saved segment mapping to {SEGMENT_MAPPING_PATH}")

# 4. Save feature specification for documentation/reproducibility
FEATURE_SPEC_PATH = f"{ARTIFACTS_DIR}/feature_spec.json"
feature_spec = {
    "numeric_features": numeric_features,
    "categorical_features": list(categorical_feature_names_out), # Save the actual encoded names
    "final_feature_names": feature_names, # Full list of features after preprocessing
    "optimal_k": optimal_k,
    "covariance_type": "full",
    "segment_mapping": segment_mapping,
    "heuristic_notes": "GenZ: High slang, late night, low emoji. Millennial: Daytime, moderate slang/emoji. InterestDriven: High beauty keywords."
}
import json
with open(FEATURE_SPEC_PATH, 'w') as f:
    json.dump(feature_spec, f, indent=2)
print(f"[OK] Saved feature specification to {FEATURE_SPEC_PATH}")

[INFO] Saving model artifacts...
[OK] Saved preprocessor to /kaggle/working/artifacts/scaler.pkl
[OK] Saved GMM model to /kaggle/working/artifacts/segments_model.pkl
[OK] Saved segment mapping to /kaggle/working/artifacts/segment_mapping.pkl
[OK] Saved feature specification to /kaggle/working/artifacts/feature_spec.json


In [10]:
# ===== 10. FINAL REPORTING =====
print("\n" + "="*50)
print("[INFO] AUDIENCE SEGMENTATION COMPLETE")
print("="*50)
print(f"Input Data: {INPUT_PATH}")
print(f"Final Clusters (k): {optimal_k}")
print(f"Model Used: Gaussian Mixture Model (GMM)")
print(f"Output Files Generated:")
print(f"  - Per-commenter labels: {SEGMENT_LABELS_PATH}")
print(f"  - Per-video aggregation: {VIDEO_SEGMENTS_PATH}")
print(f"  - Model artifacts in: {ARTIFACTS_DIR}")
print("-" * 30)
print("Segment Distribution:")
for segment, count in segment_labels['segment'].value_counts().items():
    pct = count / len(segment_labels) * 100
    print(f"  - {segment}: {count:,} ({pct:.2f}%)")
print("="*50)
print("[SUCCESS] You can now proceed to fundamentals.py")
print("="*50)


[INFO] AUDIENCE SEGMENTATION COMPLETE
Input Data: /kaggle/input/data-cleaning/comments_enriched.parquet
Final Clusters (k): 4
Model Used: Gaussian Mixture Model (GMM)
Output Files Generated:
  - Per-commenter labels: /kaggle/working/segments_labels.csv
  - Per-video aggregation: /kaggle/working/segments_video.csv
  - Model artifacts in: /kaggle/working/artifacts
------------------------------
Segment Distribution:
  - Millennial: 1,696,089 (35.90%)
  - GenZ: 1,592,642 (33.71%)
  - InterestDriven: 1,436,281 (30.40%)
[SUCCESS] You can now proceed to fundamentals.py
