# NJ Influencer Ecosystem Analysis

Analysis of New Jersey-based content creators and their influence on the information landscape.

## Overview
- **Data Sources**: TikTok, YouTube, Instagram
- **Scope**: 39 NJ Influencers
- **Content**: Up to 50 most recent posts per platform per influencer

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from datetime import datetime
from collections import Counter
import re

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

print('Libraries loaded successfully')

## 1. Data Loading & Preparation

In [None]:
# Folder holding the consolidated CSV exports
data_dir = Path('data')

# Combined post table and the per-influencer metrics table
all_posts_df, influencer_metrics_df = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ('all_posts.csv', 'influencer_metrics.csv')
)

# One table per platform
tiktok_df, youtube_df, instagram_df = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ('tiktok_posts.csv', 'youtube_posts.csv', 'instagram_posts.csv')
)

# Row counts as a quick sanity check on what was loaded
print(f'Total posts: {len(all_posts_df):,}')
print(f'  - TikTok: {len(tiktok_df):,}')
print(f'  - YouTube: {len(youtube_df):,}')
print(f'  - Instagram: {len(instagram_df):,}')
print(f'\nTotal influencers: {len(influencer_metrics_df)}')

In [None]:
# Quick look at the combined posts table: column dtypes, then a sample of rows
print('All Posts DataFrame Info:', all_posts_df.dtypes, sep='\n')
print('\nFirst few rows:')
# Last expression of the cell, so the notebook renders it as a table
all_posts_df.head()

## 2. Ecosystem Overview

In [None]:
# Platform distribution: share of posts (left) and total engagement (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
share_ax, engagement_ax = axes

# Left panel - how the collected posts split across platforms
platform_counts = all_posts_df['platform'].value_counts()
share_ax.pie(
    platform_counts,
    labels=platform_counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette('husl', len(platform_counts)),
)
share_ax.set_title('Posts by Platform')

# Right panel - likes and comments summed per platform, then added together
engagement_by_platform = (
    all_posts_df
    .groupby('platform')[['like_count', 'comment_count']]
    .sum()
    .sum(axis=1)
)
engagement_ax.bar(
    engagement_by_platform.index,
    engagement_by_platform.values,
    color=sns.color_palette('husl', len(engagement_by_platform)),
)
engagement_ax.set_title('Total Engagement by Platform')
engagement_ax.set_ylabel('Total Engagement (Likes + Comments)')
# Show full numbers on the axis instead of scientific notation
engagement_ax.ticklabel_format(style='plain', axis='y')

plt.tight_layout()
plt.savefig('figures/platform_overview.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Ecosystem-wide totals.
# Views are read from the per-platform tables; the Instagram table uses a
# different column name for them than TikTok and YouTube.
view_sources = (
    (tiktok_df, 'view_count'),
    (youtube_df, 'view_count'),
    (instagram_df, 'video_view_count'),
)
total_views = sum(frame[column].sum() for frame, column in view_sources)

# Likes and comments come from the combined posts table
total_likes, total_comments = (
    all_posts_df[column].sum() for column in ('like_count', 'comment_count')
)

print('=== ECOSYSTEM METRICS ===')
print(f'Total Views: {total_views:,.0f}')
print(f'Total Likes: {total_likes:,.0f}')
print(f'Total Comments: {total_comments:,.0f}')
print(f'Total Engagement: {total_likes + total_comments:,.0f}')

## 3. Influence Rankings

In [None]:
# Top influencers by total engagement.
# Rank explicitly instead of trusting the row order of the metrics CSV, so the
# chart always matches its title. A stable sort keeps the file order for ties.
top_20 = influencer_metrics_df.sort_values(
    'total_engagement', ascending=False, kind='stable'
).head(20)

fig, ax = plt.subplots(figsize=(12, 10))
# barh draws the first entry at the bottom, so reverse to put #1 at the top
bars = ax.barh(top_20['influencer_name'][::-1], top_20['total_engagement'][::-1],
               color=sns.color_palette('viridis', len(top_20))[::-1])
ax.set_xlabel('Total Engagement')
ax.set_title('Top 20 Influencers by Total Engagement')
# Show full numbers on the axis instead of scientific notation
ax.ticklabel_format(style='plain', axis='x')

plt.tight_layout()
plt.savefig('figures/top_influencers.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Platform breakdown for top influencers.
# Select the ten highest by total engagement explicitly rather than relying on
# the metrics CSV already being sorted. A stable sort keeps file order on ties.
top_10 = influencer_metrics_df.sort_values(
    'total_engagement', ascending=False, kind='stable'
).head(10)

fig, ax = plt.subplots(figsize=(14, 8))

# One group of three bars per influencer
x = np.arange(len(top_10))
width = 0.25

# Per-platform engagement; only the TikTok columns include a repost count
tiktok_eng = top_10['tiktok_likes'] + top_10['tiktok_comments'] + top_10['tiktok_reposts']
youtube_eng = top_10['youtube_likes'] + top_10['youtube_comments']
instagram_eng = top_10['instagram_likes'] + top_10['instagram_comments']

# Offset each platform by one bar width around the group centre
ax.bar(x - width, tiktok_eng, width, label='TikTok', color='#00F2EA')
ax.bar(x, youtube_eng, width, label='YouTube', color='#FF0000')
ax.bar(x + width, instagram_eng, width, label='Instagram', color='#E1306C')

ax.set_xlabel('Influencer')
ax.set_ylabel('Total Engagement')
ax.set_title('Platform Engagement Breakdown - Top 10 Influencers')
ax.set_xticks(x)
ax.set_xticklabels(top_10['influencer_name'], rotation=45, ha='right')
ax.legend()
# Show full numbers on the axis instead of scientific notation
ax.ticklabel_format(style='plain', axis='y')

plt.tight_layout()
plt.savefig('figures/platform_breakdown.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Engagement rate analysis
# Keep only influencers with at least one recorded view
has_views = influencer_metrics_df['total_views'] > 0
with_views = influencer_metrics_df.loc[has_views].copy()
with_views['engagement_rate_pct'] = with_views['engagement_rate'] * 100

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    with_views['total_views'],
    with_views['engagement_rate_pct'],
    s=with_views['total_posts'] * 3,  # marker area scales with post count
    alpha=0.6,
)

# Annotate the five influencers with the highest engagement rate
top_rate = with_views.nlargest(5, 'engagement_rate_pct')
labelled_points = zip(
    top_rate['influencer_name'],
    top_rate['total_views'],
    top_rate['engagement_rate_pct'],
)
for name, views, rate_pct in labelled_points:
    ax.annotate(name, (views, rate_pct), fontsize=8)

ax.set_xlabel('Total Views')
ax.set_ylabel('Engagement Rate (%)')
ax.set_title('Engagement Rate vs Total Views\n(bubble size = post count)')
# Log-scaled x axis for the view totals
ax.set_xscale('log')

plt.tight_layout()
plt.savefig('figures/engagement_rate.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Content Analysis

In [None]:
# Content categorization based on keywords
# Keywords per category. Order matters: the first category with a matching
# keyword wins, so earlier entries take precedence over later ones.
_CATEGORY_KEYWORDS = {
    'News/Politics': ['news', 'breaking', 'update', 'politics', 'election', 'vote', 'government', 'mayor', 'governor'],
    'Sports': ['sports', 'game', 'basketball', 'football', 'soccer', 'baseball', 'hockey', 'score', 'win', 'team', 'player', 'coach'],
    'Food/Restaurant': ['food', 'restaurant', 'eat', 'dinner', 'lunch', 'breakfast', 'recipe', 'cook', 'chef', 'menu', 'pizza', 'bagel'],
    'Lifestyle': ['lifestyle', 'home', 'family', 'kids', 'parent', 'mom', 'dad', 'fashion', 'style', 'beauty'],
    'Entertainment': ['comedy', 'funny', 'joke', 'laugh', 'music', 'concert', 'show', 'movie', 'podcast'],
    'Local/Community': ['jersey', 'nj', 'newark', 'trenton', 'atlantic city', 'shore', 'community', 'local', 'neighborhood'],
    'Emergency/Public Safety': ['fire', 'police', 'emergency', 'accident', 'crash', 'rescue', 'ems', 'ambulance']
}

# Keywords this short turn up inside unrelated words far too often
# ('win' in "window", 'nj' in "enjoy", 'eat' in "great", 'ems' in "items"),
# so they must appear as a word of their own.
_WHOLE_WORD_MAX_LEN = 3


def _compile_category_pattern(keywords):
    """Build one regex that matches any of the given keywords.

    Short keywords (up to ``_WHOLE_WORD_MAX_LEN`` characters) only match as
    standalone words, optionally followed by "s" or "ing". Longer keywords
    still match anywhere, which keeps hits on compounds and hashtags such as
    "cooking", "firefighter" or "#newjersey".
    """
    alternatives = []
    for keyword in keywords:
        escaped = re.escape(keyword)
        if len(keyword) <= _WHOLE_WORD_MAX_LEN:
            alternatives.append(rf'\b{escaped}(?:s|ing)?\b')
        else:
            alternatives.append(escaped)
    return re.compile('|'.join(alternatives))


# Compiled once at definition time instead of rebuilding the table per post
_CATEGORY_PATTERNS = {
    category: _compile_category_pattern(keywords)
    for category, keywords in _CATEGORY_KEYWORDS.items()
}


def categorize_content(text):
    """Categorize content based on keywords.

    Args:
        text: Post text (title, description, caption, ...). Missing values
            (None / NaN) are accepted.

    Returns:
        The name of the first category, in the order of
        ``_CATEGORY_KEYWORDS``, that has a keyword in the lower-cased text,
        or 'Other' when the text is missing or nothing matches.
    """
    if pd.isna(text):
        return 'Other'

    text = str(text).lower()

    for category, pattern in _CATEGORY_PATTERNS.items():
        if pattern.search(text):
            return category

    return 'Other'

# Apply categorization
def _post_text(row):
    """Join the text fields present on a post, skipping missing values.

    Formatting a missing value straight into a string would insert the
    literal word "nan" into the text being classified, so absent columns and
    NaN cells are dropped before joining.
    """
    fields = (row.get(name) for name in ('title', 'description', 'caption'))
    return ' '.join(str(value) for value in fields if pd.notna(value))


all_posts_df['content_category'] = all_posts_df.apply(
    lambda row: categorize_content(_post_text(row)),
    axis=1
)

# Category distribution
category_counts = all_posts_df['content_category'].value_counts()
print('Content Categories:')
print(category_counts)

In [None]:
# Content categories: post volume (left) and total engagement (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
count_ax, engagement_ax = axes

# Left panel - number of posts per category, largest bar at the top
count_palette = sns.color_palette('Set2', len(category_counts))
count_ax.barh(
    category_counts.index[::-1],
    category_counts.values[::-1],
    color=count_palette[::-1],
)
count_ax.set_xlabel('Number of Posts')
count_ax.set_title('Posts by Content Category')

# Right panel - likes plus comments per category, sorted ascending so the
# largest bar also ends up at the top
category_engagement = (
    all_posts_df
    .groupby('content_category')[['like_count', 'comment_count']]
    .sum()
    .sum(axis=1)
    .sort_values(ascending=True)
)
engagement_ax.barh(
    category_engagement.index,
    category_engagement.values,
    color=sns.color_palette('Set2', len(category_engagement)),
)
engagement_ax.set_xlabel('Total Engagement')
engagement_ax.set_title('Engagement by Content Category')
# Show full numbers on the axis instead of scientific notation
engagement_ax.ticklabel_format(style='plain', axis='x')

plt.tight_layout()
plt.savefig('figures/content_categories.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Temporal Analysis

In [None]:
# Parse upload dates in place; values that cannot be parsed become NaT
all_posts_df['upload_date'] = pd.to_datetime(all_posts_df['upload_date'], errors='coerce')

# Count posts per calendar day, leaving out rows without a usable date
posts_with_dates = all_posts_df[all_posts_df['upload_date'].notna()]
upload_day = posts_with_dates['upload_date'].dt.date
posts_by_date = posts_with_dates.groupby(upload_day).size()

# Thin line with a translucent fill underneath
fig, ax = plt.subplots(figsize=(14, 6))
days, daily_posts = posts_by_date.index, posts_by_date.values
ax.plot(days, daily_posts, linewidth=0.5, alpha=0.7)
ax.fill_between(days, daily_posts, alpha=0.3)
ax.set_xlabel('Date')
ax.set_ylabel('Number of Posts')
ax.set_title('Posting Activity Over Time')

plt.tight_layout()
plt.savefig('figures/temporal_activity.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Ecosystem Map

In [None]:
# Create ecosystem heatmap
# Platform activity by influencer.
# "Top 20" is taken by total engagement (as in the ranking charts) with an
# explicit sort, so the result does not depend on the CSV row order.
# The heatmap annotations use fmt='d', which only accepts integers: a missing
# post count would make its column float and break the formatting, so missing
# counts are treated as 0 and the matrix is cast to int.
platform_matrix = (
    influencer_metrics_df
    .sort_values('total_engagement', ascending=False, kind='stable')
    .head(20)
    .set_index('influencer_name')[['tiktok_posts', 'youtube_posts', 'instagram_posts']]
    .fillna(0)
    .astype(int)
)

fig, ax = plt.subplots(figsize=(10, 12))
sns.heatmap(platform_matrix, annot=True, fmt='d', cmap='YlOrRd', ax=ax)
ax.set_title('Platform Activity Matrix - Top 20 Influencers')
ax.set_xlabel('Platform')

plt.tight_layout()
plt.savefig('figures/ecosystem_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Platform specialization
def get_primary_platform(row):
    """Return the platform on which an influencer has the most posts.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'tiktok_posts', 'youtube_posts' and 'instagram_posts'.

    Returns:
        'TikTok', 'YouTube' or 'Instagram'. Ties, including the case where
        every count is zero or missing, go to the first platform in that
        order.
    """
    platforms = {
        'TikTok': row['tiktok_posts'],
        'YouTube': row['youtube_posts'],
        'Instagram': row['instagram_posts']
    }

    def post_count(name):
        # NaN compares False against every number, so max() would keep
        # whichever platform it looked at first; count missing values as 0.
        value = platforms[name]
        return 0 if pd.isna(value) else value

    return max(platforms, key=post_count)

# Tag every influencer with the platform they post on most, then count them
influencer_metrics_df['primary_platform'] = influencer_metrics_df.apply(get_primary_platform, axis=1)
platform_specialization = influencer_metrics_df['primary_platform'].value_counts()

# One fixed colour per platform, looked up in the order the slices are drawn
colors = {'TikTok': '#00F2EA', 'YouTube': '#FF0000', 'Instagram': '#E1306C'}
slice_colors = [colors[name] for name in platform_specialization.index]

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    platform_specialization,
    labels=platform_specialization.index,
    autopct='%1.1f%%',
    colors=slice_colors,
)
ax.set_title('Influencer Primary Platforms')

plt.tight_layout()
plt.savefig('figures/platform_specialization.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Key Findings Summary

In [None]:
# Generate summary report
print('=' * 60)
print('NJ INFLUENCER ECOSYSTEM - KEY FINDINGS')
print('=' * 60)

print('\n1. SCALE')
print(f'   - Total influencers analyzed: {len(influencer_metrics_df)}')
print(f'   - Total posts collected: {len(all_posts_df):,}')
print(f'   - Total reach (views): {total_views:,.0f}')
print(f'   - Total engagement: {total_likes + total_comments:,.0f}')

print('\n2. PLATFORM DISTRIBUTION')
# Keys are the values matched in the 'platform' column; the labels carry the
# proper brand capitalisation (str.capitalize() would print "Tiktok"/"Youtube").
platform_labels = {'tiktok': 'TikTok', 'youtube': 'YouTube', 'instagram': 'Instagram'}
total_posts = len(all_posts_df)
for platform, label in platform_labels.items():
    count = int((all_posts_df['platform'] == platform).sum())
    # Guard the share calculation against an empty posts table
    pct = count / total_posts * 100 if total_posts else 0.0
    print(f'   - {label}: {count:,} posts ({pct:.1f}%)')

print('\n3. TOP INFLUENCERS BY ENGAGEMENT')
# Rank explicitly and number the rows by position: the DataFrame index label
# only equals the rank when the table is sorted and has a default index.
top_5 = influencer_metrics_df.sort_values(
    'total_engagement', ascending=False, kind='stable'
).head(5)
ranked = enumerate(zip(top_5['influencer_name'], top_5['total_engagement']), start=1)
for rank, (name, engagement) in ranked:
    print(f'   {rank}. {name}: {engagement:,.0f}')

print('\n4. CONTENT CATEGORIES')
for cat, count in category_counts.head(5).items():
    print(f'   - {cat}: {count:,} posts')

print('\n5. PLATFORM SPECIALIZATION')
for platform, count in platform_specialization.items():
    print(f'   - {platform}: {count} influencers')

In [None]:
# Write the enriched tables into the data folder under new file names
outputs = (
    (all_posts_df, 'all_posts_categorized.csv'),
    (influencer_metrics_df, 'influencer_metrics_updated.csv'),
)
for frame, filename in outputs:
    frame.to_csv(data_dir / filename, index=False)
print('Updated data saved successfully')