In [19]:
import json
import sys
import re
from collections import Counter
from datetime import datetime, timedelta
import glob
import os
import tqdm

# Root folder of the tweet archive dump; replace the placeholder before running.
# End it with a path separator so that relative parts such as
# 'downloads/archives/*/*.json' and 'tweet_data.pkl' can be appended to it.
DATA_DIRECTORY = "<FILL IN>"

def process_tweet_text(text):
    """Normalise raw tweet text before tokenising.

    Removes t.co short links (``http://`` and ``https://`` forms alike),
    collapses every run of whitespace into one space, trims both ends and
    lowercases the result.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercased text. This is an empty string when the tweet
        held nothing but links and whitespace.
    """
    # The scheme's "s" is optional so that plain-http t.co links are stripped too.
    text = re.sub(r'https?://t\.co/\w+', '', text)
    # Dropping a link can leave doubled spaces; newlines/tabs are folded as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

def find_trending_tokens(recent_tokens, old_tokens, total_recent, total_old, top_n=100):
    """Rank tokens by how much more frequent they are recently than before.

    The score of a token is ``recent_freq - expected`` where ``recent_freq`` is
    its share of the recent corpus and ``expected`` is its add-one-smoothed
    share of the old corpus.

    Args:
        recent_tokens: Mapping token -> count in the recent window.
        old_tokens: Mapping token -> count in the older window (a ``Counter``
            or a plain ``dict``; missing tokens count as 0).
        total_recent: Total number of tokens in the recent window.
        total_old: Total number of tokens in the older window.
        top_n: How many of the highest-scoring tokens to return.

    Returns:
        A list of ``(token, score)`` tuples, highest score first, at most
        ``top_n`` long. Empty when there is no recent data.
    """
    if total_recent <= 0:
        # Nothing recent to rank (and nothing to divide by).
        return []

    trending_scores = {}
    for token, recent_count in recent_tokens.items():
        recent_freq = recent_count / total_recent
        if total_old > 0:
            # Add 1 for smoothing so unseen tokens still get a small baseline.
            expected = (old_tokens.get(token, 0) + 1) / total_old
        else:
            # No historical corpus at all: there is no baseline to subtract.
            expected = 0.0
        trending_scores[token] = recent_freq - expected

    ranked = sorted(trending_scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]


def load_tweet_data(json_files):
    """Load all tweet data into memory for reuse.

    Each file is expected to be a JSON object with a ``tweets`` list whose
    items look like ``{"tweet": {"full_text": ..., "created_at": ...}}``.
    The username is taken from the name of the directory that contains the
    file (``.../<username>/<file>.json``).

    Args:
        json_files: Iterable of paths to archive JSON files.

    Returns:
        List of ``(text, date, username)`` tuples for non-retweet, non-reply
        tweets, where ``text`` has been cleaned by ``process_tweet_text`` and
        ``date`` is a timezone-aware ``datetime``.

    Raises:
        ValueError: If a ``created_at`` value does not match the expected
            ``'%a %b %d %H:%M:%S %z %Y'`` format.
    """
    tweet_data = []

    for json_file in tqdm.tqdm(json_files):
        # Parent directory name == username. os.path copes with the platform's
        # separators and yields '' (instead of raising) for a bare filename.
        username = os.path.basename(os.path.dirname(json_file))
        if not username:
            # Every tweet in this file would be discarded, so skip parsing it.
            continue

        # Be explicit about the encoding rather than relying on the locale default.
        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tweets = data.get('tweets', [])
        print(f"Loaded {len(tweets)} tweets")

        for tweet_obj in tweets:
            if 'tweet' not in tweet_obj or 'full_text' not in tweet_obj['tweet']:
                continue

            text = tweet_obj['tweet']['full_text']
            # Skip retweets and replies
            if text.startswith('RT @') or text.startswith('@'):
                continue

            # Get tweet timestamp
            created_at = tweet_obj['tweet'].get('created_at')
            if not created_at:
                continue

            tweet_date = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
            text = process_tweet_text(text)

            if text:  # Only include if there's text after filtering
                tweet_data.append((text, tweet_date, username))

    return tweet_data
def analyze_trending_tokens(tweet_data, window_days=360, min_growth=10,
                            min_recent_count=10, now=None):
    """Analyze trending tokens from preprocessed tweet data.

    Tweets newer than ``now - window_days`` form the "recent" corpus, all
    others the "old" corpus. A token is reported when its recent frequency is
    more than ``min_growth`` times its add-one-smoothed old frequency and it
    occurs at least ``min_recent_count`` times recently. The result is also
    printed.

    Args:
        tweet_data: Iterable of ``(text, date, username)`` tuples; ``date``
            must be timezone-aware because it is compared with an aware cutoff.
        window_days: Length of the recent window in days.
        min_growth: Minimum recent/old frequency ratio (exclusive).
        min_recent_count: Minimum number of recent occurrences.
        now: Reference "current" time (aware ``datetime``); defaults to the
            current local time. Mainly useful for reproducible runs.

    Returns:
        ``(trending_tokens, recent_tokens, old_tokens, total_recent, total_old)``
        where ``trending_tokens`` is a list of dicts sorted by ``recent_count``
        descending and the two counters/totals describe both corpora.
    """
    recent_tokens = Counter()
    old_tokens = Counter()
    total_recent = 0
    total_old = 0

    if now is None:
        now = datetime.now().astimezone()
    cutoff_date = now - timedelta(days=window_days)

    # Split every tweet's whitespace tokens into the recent or the old corpus.
    for text, tweet_date, _ in tweet_data:
        tokens = text.split()
        if tweet_date > cutoff_date:
            recent_tokens.update(tokens)
            total_recent += len(tokens)
        else:
            old_tokens.update(tokens)
            total_old += len(tokens)

    # With an empty old corpus there is no baseline; a denominator of 1 makes
    # the smoothed old frequency 1.0, so nothing can be reported as "grown"
    # (instead of dividing by zero).
    old_denominator = max(total_old, 1)

    trending_tokens = []
    for token, recent_count in recent_tokens.items():
        recent_freq = recent_count / total_recent
        old_freq = (old_tokens[token] + 1) / old_denominator  # Add 1 for smoothing

        if recent_freq > (old_freq * min_growth) and recent_count >= min_recent_count:
            trending_tokens.append({
                'token': token,
                'recent_count': recent_count,
                'old_count': old_tokens[token],
                'recent_freq': recent_freq,
                'old_freq': old_freq,
                'growth': recent_freq / old_freq
            })

    # Sort by recent popularity (raw recent count)
    trending_tokens.sort(key=lambda x: x['recent_count'], reverse=True)

    print(f"\nTokens that grew >{min_growth}x, sorted by recent popularity:")
    print("\ntoken: recent_count (old_count) - growth_factor")
    for t in trending_tokens:
        print(f"{t['token']}: {t['recent_count']} ({t['old_count']}) - {t['growth']:.1f}x")

    return trending_tokens, recent_tokens, old_tokens, total_recent, total_old

In [None]:
import pickle

# Archives are laid out as <DATA_DIRECTORY>/downloads/archives/<username>/<file>.json.
# os.path.join inserts the separator when DATA_DIRECTORY has no trailing slash.
archive_pattern = os.path.join(DATA_DIRECTORY, 'downloads/archives/*/*.json')
json_files = glob.glob(archive_pattern)
if not json_files:
    # Report the pattern that was actually searched, not a hard-coded guess.
    print(f"No JSON files found matching {archive_pattern}")

print("Loading tweet data...")
tweet_data = load_tweet_data(json_files)
print(f"Loaded {len(tweet_data)} tweets")

# Save tweet data to disk as a pickle file so later sessions can skip parsing.
tweet_data_path = os.path.join(DATA_DIRECTORY, 'tweet_data.pkl')
print("Saving tweet data to disk...")
with open(tweet_data_path, 'wb') as f:
    pickle.dump(tweet_data, f)


# Contagion modelling

In [None]:
# Sanity check: how many (text, date, username) tuples are held in memory.
len(tweet_data)

In [97]:
def process_tweet_text(text):
    """Normalise raw tweet text before tokenising.

    Note: this re-defines the function of the same name from the first cell.

    Removes t.co short links (``http://`` and ``https://`` forms alike),
    collapses every run of whitespace into one space, trims both ends and
    lowercases the result.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercased text. This is an empty string when the tweet
        held nothing but links and whitespace.
    """
    # The scheme's "s" is optional so that plain-http t.co links are stripped too.
    text = re.sub(r'https?://t\.co/\w+', '', text)
    # Dropping a link can leave doubled spaces; newlines/tabs are folded as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

def tokenize(text):
    """Break text into lowercase word tokens.

    The input is lowercased and cut at every run of whitespace and/or the
    punctuation characters listed in the pattern below; empty pieces that
    arise at the edges are discarded.

    Returns:
        List of non-empty token strings, in order of appearance.
    """
    pieces = re.split(r'[\s\.,!?;:"\'()\[\]{}|/\\+=\-_~`@#$%^&*]+', text.lower())
    # filter(None, ...) drops the '' entries produced by leading/trailing separators
    return list(filter(None, pieces))

def analyze_contagions(tweet_data, min_year=2018, min_users=30):
    """Find tokens that first appeared recently and were adopted by many users.

    Tweets are processed in chronological order. For every token this records
    who used it first ("patient zero"), the set of users who ever used it and
    the number of tweets containing it (a token is counted once per tweet).
    The resulting list is also printed.

    Args:
        tweet_data: Iterable of ``(text, date, username)`` tuples.
        min_year: Tokens whose first use is before this year are skipped.
        min_users: Tokens used by fewer distinct users than this are skipped.

    Returns:
        List of dicts with keys ``token``, ``patient_zero``, ``first_date``,
        ``total_users`` and ``total_uses``, sorted by ``total_users`` descending.
    """
    token_first_use = {}    # token -> (date, username)
    token_users = {}        # token -> set of usernames
    token_uses = Counter()  # token -> number of tweets containing it

    sorted_tweets = sorted(tweet_data, key=lambda x: x[1])

    for text, date, username in tqdm.tqdm(sorted_tweets):
        # set(): handle each token once per tweet. tokenize() never yields
        # empty strings, so no extra emptiness check is needed here.
        for token in set(tokenize(text)):
            if token not in token_first_use:
                token_first_use[token] = (date, username)
                token_users[token] = set()

            token_users[token].add(username)
            token_uses[token] += 1

    # Keep tokens that are new enough and spread widely enough
    contagions = []
    for token, (first_date, first_user) in token_first_use.items():
        if first_date.year < min_year or len(token_users[token]) < min_users:
            continue

        contagions.append({
            'token': token,
            'patient_zero': first_user,
            'first_date': first_date,
            'total_users': len(token_users[token]),
            'total_uses': token_uses[token]
        })

    # Sort by number of eventual users
    contagions.sort(key=lambda x: x['total_users'], reverse=True)

    # The header states the filter that was actually applied.
    print(f"\nViral tokens first seen in {min_year} or later, used by at least {min_users} users:")
    print("\ntoken: first_user (first_date) - total_users/total_uses")
    for c in contagions:
        print(f"{c['token']}: {c['patient_zero']} ({c['first_date'].strftime('%Y-%m-%d')}) - {c['total_users']}/{c['total_uses']} users/uses")

    return contagions

In [None]:
import numpy as np
import scipy
from scipy import stats
import pandas as pd
from collections import defaultdict

def analyze_contagions_detailed(tweet_data, min_users=30, min_year=2018):
    """Collect per-token adoption histories and an early-adopter leaderboard.

    For every token the chronological list of tweets containing it is kept.
    Tokens first seen in ``min_year`` or later and used by at least
    ``min_users`` distinct users are considered viral. For each viral token
    every adopting user gets a percentile rank in [0, 1] (0 = first adopter,
    1 = last adopter); the leaderboard averages these ranks per user.

    Args:
        tweet_data: Iterable of ``(text, date, username)`` tuples.
        min_users: Minimum number of distinct users for a token to count.
        min_year: Minimum year of a token's first use.

    Returns:
        ``(viral_tokens, leaderboard)``. ``viral_tokens`` is a list of dicts
        (``token``, ``first_use``, ``second_user``, ``total_users``,
        ``unique_users``, ``full_history``) sorted by ``total_users``
        descending. ``leaderboard`` is a list of dicts (``username``,
        ``mean_percentile``, ``num_words``) sorted by ``mean_percentile``
        ascending, i.e. earliest adopters first.
    """
    token_first_use = {}                     # token -> (date, username)
    token_history = defaultdict(list)        # token -> list of (date, username, full_text)
    user_adoption_ranks = defaultdict(list)  # username -> list of percentile ranks

    sorted_tweets = sorted(tweet_data, key=lambda x: x[1])

    # First pass: collect the full history. Tweets are visited in date order,
    # so each history list is already chronological and needs no re-sorting.
    for text, date, username in sorted_tweets:
        # tokenize() never yields empty strings; set() counts a token once per tweet
        for token in set(tokenize(text)):
            if token not in token_first_use:
                token_first_use[token] = (date, username)
            token_history[token].append((date, username, text))

    # Second pass: analyze viral tokens and compute adoption ranks
    viral_tokens = []
    for token, history in token_history.items():
        # Distinct users in order of first use (dict keys keep insertion order)
        unique_users = list(dict.fromkeys(username for _, username, _ in history))
        total_users = len(unique_users)

        if total_users < min_users or token_first_use[token][0].year < min_year:
            continue

        # Percentile rank relative to the final number of adopters. The guard
        # avoids a division by zero for a single-user token (min_users <= 1).
        span = total_users - 1
        for rank, username in enumerate(unique_users):
            percentile = rank / span if span else 0.0
            user_adoption_ranks[username].append(percentile)

        viral_tokens.append({
            'token': token,
            'first_use': token_first_use[token],
            'second_user': unique_users[1] if len(unique_users) > 1 else None,
            'total_users': total_users,
            'unique_users': unique_users,
            'full_history': history
        })

    # Per-user average adoption percentile over all viral tokens they used
    leaderboard = [
        {
            'username': username,
            'mean_percentile': np.mean(ranks),
            'num_words': len(ranks),
        }
        for username, ranks in user_adoption_ranks.items()
    ]

    # Sort viral tokens by total users and leaderboard by mean percentile
    viral_tokens.sort(key=lambda x: x['total_users'], reverse=True)
    leaderboard.sort(key=lambda x: x['mean_percentile'])

    return viral_tokens, leaderboard

def generate_html_report(viral_tokens, leaderboard):
    """Render the viral-token analysis as a standalone HTML page.

    Every value that originates from tweet data (tokens, usernames, tweet
    text) is HTML-escaped before it is inserted into the page.

    Args:
        viral_tokens: List of dicts with keys ``token``, ``first_use``
            (``(date, username)``), ``second_user``, ``total_users`` and
            ``full_history`` (list of ``(date, username, text)``).
        leaderboard: List of dicts with keys ``username``,
            ``mean_percentile`` and ``num_words``.

    Returns:
        The complete HTML document as a string.
    """
    # Local import: the accumulator used to be called `html`, which would
    # shadow the stdlib module; importing only `escape` avoids the clash.
    from html import escape

    def esc(value):
        """Escape any value for safe inclusion in HTML text."""
        return escape(str(value))

    # Collect fragments and join once at the end instead of repeated `+=`.
    parts = ["""
    <html>
    <head>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            .token-card { 
                border: 1px solid #ddd; 
                margin: 5px 0; 
                padding: 10px; 
            }
            .token-header { 
                display: flex; 
                justify-content: space-between; 
                align-items: center;
                gap: 20px;
            }
            .token-header > * {
                margin: 0;
                white-space: nowrap;
            }
            .details { display: none; margin-top: 10px; }
            .show-details { cursor: pointer; color: blue; }
            .leaderboard { margin-top: 30px; }
            table { border-collapse: collapse; width: 100%; }
            th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        </style>
        <script>
            function toggleDetails(id) {
                var details = document.getElementById(id);
                // The initial "display: none" comes from the stylesheet, so the
                // inline style starts out empty; compare against 'block' so the
                // very first click already opens the panel.
                details.style.display = details.style.display === 'block' ? 'none' : 'block';
            }
        </script>
    </head>
    <body>
        <h1>Viral Token Analysis</h1>
        
        <h2>Token Timeline</h2>
    """]

    # Add token cards
    for i, token in enumerate(viral_tokens):
        first_date, first_user = token['first_use']
        parts.append(f"""
        <div class="token-card">
            <div class="token-header">
                <strong style="min-width: 150px;">{esc(token['token'])}</strong>
                <span>First: {esc(first_user)} ({first_date.strftime('%Y-%m-%d')})</span>
                <span>Second: {esc(token['second_user'] or 'N/A')}</span>
                <span>Total Users: {token['total_users']}</span>
                <span class="show-details" onclick="toggleDetails('details-{i}')">Show Details</span>
            </div>
            <div id="details-{i}" class="details">
                <h4>All Users (in order):</h4>
                <ol>
        """)
        # Distinct users in order of first appearance in the history
        ordered_users = dict.fromkeys(username for _, username, _ in token['full_history'])
        parts.extend(f"<li>{esc(username)}</li>" for username in ordered_users)

        parts.append("""
                </ol>
                <h4>All Tweets (chronological):</h4>
                <ul>
        """)

        for date, username, text in token['full_history']:
            parts.append(
                f"<li><b>{esc(username)}</b> ({date.strftime('%Y-%m-%d')}): {esc(text)}</li>"
            )

        parts.append("""
                </ul>
            </div>
        </div>
        """)

    # Add leaderboard
    parts.append("""
        <h2>Early Adopter Leaderboard</h2>
        <table class="leaderboard">
            <tr>
                <th>Username</th>
                <th>Average Percentile</th>
                <th>Number of Words</th>
            </tr>
    """)

    for user in leaderboard:
        parts.append(f"""
            <tr>
                <td>{esc(user['username'])}</td>
                <td>{user['mean_percentile']:.3f}</td>
                <td>{user['num_words']}</td>
            </tr>
        """)

    parts.append("""
        </table>
    </body>
    </html>
    """)

    return "".join(parts)

# Generate and save the report.
# The analysis is computed here so the cell also works on a fresh kernel
# (Restart & Run All) instead of relying on variables from an earlier session.
viral_tokens, leaderboard = analyze_contagions_detailed(tweet_data)
html_report = generate_html_report(viral_tokens, leaderboard)

# Single source of truth for the output path so the message cannot drift from it.
report_path = 'viral_tokens_report2.html'
# Tweets contain emoji and other non-ASCII text; write UTF-8 explicitly.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(html_report)

print(f"Report generated as {report_path}")

In [None]:
# Preview only the earliest adopters; the full list has one entry per user and
# would flood the cell output.
leaderboard[:20]

# Identity analysis

In [None]:
from collections import defaultdict
import pandas as pd

# Per-month aggregates, keyed by "YYYY-MM" (e.g. "2024-01"):
#   monthly_users  -> set of usernames that tweeted in that month
#   monthly_tweets -> number of tweets posted in that month
monthly_users = defaultdict(set)
monthly_tweets = defaultdict(int)

for _, tweet_date, username in tweet_data:
    month_key = tweet_date.strftime("%Y-%m")
    monthly_users[month_key].add(username)
    monthly_tweets[month_key] += 1

# One row per month, ordered chronologically
user_counts = {month: len(names) for month, names in monthly_users.items()}
df = pd.DataFrame({'unique_users': user_counts, 'total_tweets': monthly_tweets})
df.index.name = 'month'
df = df.sort_index()

# Average activity per active user
df['tweets_per_user'] = df['total_tweets'] / df['unique_users']

print("\nMonthly statistics:")
print(df)

# Report the month in which the most distinct users were active
max_month = df['unique_users'].idxmax()
print(f"\nMonth with most unique users: {max_month}")
print(f"Unique users: {df.loc[max_month, 'unique_users']:,}")
print(f"Total tweets: {df.loc[max_month, 'total_tweets']:,}")
print(f"Tweets per user: {df.loc[max_month, 'tweets_per_user']:.2f}")

In [None]:
!pip install sentence-transformers

In [None]:
import torch
from sentence_transformers import SentenceTransformer
import numpy as np
# Import the module rather than `from tqdm import tqdm`: other cells call
# `tqdm.tqdm(...)`, which breaks if the name `tqdm` is rebound to the function.
import tqdm

# Get all tweets from August 2024 (in tweet_data order; row k of the
# embedding matrix below therefore belongs to the k-th August tweet)
august_tweets = []
for text, date, username in tweet_data:
    if date.strftime("%Y-%m") == "2024-08":
        august_tweets.append(text)

print(f"Found {len(august_tweets)} tweets from August 2024")

if not august_tweets:
    # Fail early with a clear message instead of loading the model and then
    # hitting np.vstack's "need at least one array" error.
    raise ValueError("No tweets from 2024-08 in tweet_data; nothing to embed")

# Load the model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = SentenceTransformer('all-mpnet-base-v2')
model.to(device)

# Embed tweets in batches with progress bar
batch_size = 256
embeddings = []

for i in tqdm.tqdm(range(0, len(august_tweets), batch_size)):
    batch = august_tweets[i:i + batch_size]
    batch_embeddings = model.encode(batch, device=device)
    embeddings.append(batch_embeddings)

# Combine all batches into one (n_tweets, embedding_dim) array
embeddings = np.vstack(embeddings)

print(f"Generated embeddings with shape: {embeddings.shape}")

# Save embeddings to disk
np.save('august_embeddings.npy', embeddings)

In [None]:
import numpy as np
import seaborn as sns
import tqdm
import matplotlib.pyplot as plt
from collections import defaultdict
import torch

# Map August tweets to their row in `embeddings`. Rows were produced in
# tweet_data order over the same month filter, so a running counter matches.
august_index_map = {}  # Maps original index to embedding index
august_user_tweets = defaultdict(list)  # Maps username to list of embedding indices

embedding_idx = 0
for orig_idx, (text, date, username) in enumerate(tweet_data):
    if date.strftime("%Y-%m") == "2024-08":
        august_index_map[orig_idx] = embedding_idx
        august_user_tweets[username].append(embedding_idx)
        embedding_idx += 1

# Get list of users with enough tweets
min_tweets = 5
users = [user for user, indices in august_user_tweets.items() if len(indices) >= min_tweets]

# Use the GPU when present, but fall back to the CPU instead of crashing.
sim_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embeddings_tensor = torch.tensor(embeddings, device=sim_device)
# Unit-normalise rows so that a dot product equals the cosine similarity
embeddings_tensor = embeddings_tensor / embeddings_tensor.norm(dim=1, keepdim=True)

# Create similarity matrix between users
n_users = len(users)
user_sim_matrix = np.zeros((n_users, n_users))

# Slice every user's rows once instead of re-indexing inside the pair loop
user_vectors = [embeddings_tensor[august_user_tweets[user]] for user in users]

# Iterating over a sized range lets tqdm show a total / ETA.
for i in tqdm.tqdm(range(n_users)):
    tweets1 = user_vectors[i]

    # Same user: average similarity between *different* tweets, i.e. the
    # strict upper triangle of the pairwise matrix (self-pairs masked out).
    sim_matrix = torch.mm(tweets1, tweets1.T)
    mask = torch.triu(torch.ones_like(sim_matrix), diagonal=1)
    similarities = sim_matrix[mask.bool()]
    user_sim_matrix[i, i] = similarities.mean().cpu().item() if len(similarities) > 0 else 0

    # Different users: average over all cross pairs. The measure is symmetric,
    # so compute each pair once and mirror it.
    for j in range(i + 1, n_users):
        cross_similarity = torch.mm(tweets1, user_vectors[j].T).mean().cpu().item()
        user_sim_matrix[i, j] = cross_similarity
        user_sim_matrix[j, i] = cross_similarity


In [None]:

# Heatmap of the user-by-user similarity matrix (diagonal = within-user)
fig, ax = plt.subplots(figsize=(22, 20))
sns.heatmap(
    user_sim_matrix,
    xticklabels=users,
    yticklabels=users,
    cmap='viridis',
    ax=ax,
)
plt.xticks(rotation=90)
plt.yticks(rotation=0)
plt.title('Average Tweet Similarity Between Users')
plt.tight_layout()
plt.show()

# Summary statistics: diagonal entries versus everything off the diagonal
within_user_values = np.diag(user_sim_matrix)
off_diagonal_mask = ~np.eye(user_sim_matrix.shape[0], dtype=bool)
diagonal_mean = np.mean(within_user_values)
offdiagonal_mean = np.mean(user_sim_matrix[off_diagonal_mask])
print(f"Average within-user similarity: {diagonal_mean:.3f}")
print(f"Average between-user similarity: {offdiagonal_mean:.3f}")
print(f"Identity strength ratio: {diagonal_mean/offdiagonal_mean:.2f}x")

In [None]:
# For each user, count how many OTHER users are on average more similar to
# them than their own tweets are to each other.
self_similarities = np.diag(user_sim_matrix)
# The comparison is strict, so the diagonal entry (which equals the
# self-similarity) is never counted; no "-1 to exclude self" is needed.
more_similar_counts = [
    int(np.sum(user_sim_matrix[:, i] > self_similarities[i]))
    for i in range(len(users))
]

# Plot distribution
plt.figure(figsize=(10, 6))
plt.hist(more_similar_counts, bins=30, edgecolor='black')
plt.title('Distribution of Users with Higher Cross-Similarity than Self-Similarity')
plt.xlabel('Number of Users More Similar')
plt.ylabel('Count')
plt.axvline(np.mean(more_similar_counts), color='red', linestyle='--', 
            label=f'Mean: {np.mean(more_similar_counts):.1f}')
plt.legend()
plt.show()

print(f"Average number of users more similar than self: {np.mean(more_similar_counts):.1f}")
print(f"Median number of users more similar than self: {np.median(more_similar_counts):.1f}")
print(f"Percentage of users with at least one more similar user: {np.mean(np.array(more_similar_counts) > 0) * 100:.1f}%")

# Create leaderboard data, reusing the counts computed above. It has its own
# name so the early-adopter `leaderboard` from the contagion section survives.
identity_leaderboard = [
    {
        'username': user,
        'more_similar_users': more_similar_counts[i],
        'self_similarity': user_sim_matrix[i, i],
        'tweet_count': len(august_user_tweets[user]),
    }
    for i, user in enumerate(users)
]

# Sort by number of more similar users
identity_leaderboard.sort(key=lambda x: x['more_similar_users'], reverse=True)

# Print top and bottom of leaderboard
print("Users with LEAST distinct tweet identity (most users more similar than self):")
print("\nRank  Username                More Similar Users  Self-Similarity  Tweet Count")
print("-" * 75)
for i, entry in enumerate(identity_leaderboard[:20]):
    print(f"{i+1:4d}  {entry['username']:<22} {entry['more_similar_users']:17d} {entry['self_similarity']:14.3f} {entry['tweet_count']:11d}")

print("\n\nUsers with MOST distinct tweet identity (fewest users more similar than self):")
print("\nRank  Username                More Similar Users  Self-Similarity  Tweet Count")
print("-" * 75)
# Order by what the header promises (fewest more-similar users first); ties,
# which are common at zero, are broken by higher self-similarity.
most_distinct = sorted(
    identity_leaderboard,
    key=lambda x: (x['more_similar_users'], -x['self_similarity']),
)
for i, entry in enumerate(most_distinct[:20]):
    print(f"{i+1:4d}  {entry['username']:<22} {entry['more_similar_users']:17d} {entry['self_similarity']:14.3f} {entry['tweet_count']:11d}")

In [None]:
import torch
import plotly.graph_objects as go
import numpy as np
from collections import defaultdict
from scipy.cluster.hierarchy import linkage  # was used below without being imported
from scipy.spatial.distance import squareform

# Pick the busiest week among the August 2024 tweets.
# NOTE: "%W" is the Monday-based week of the year (00-53), not the ISO-8601 week.
weekly_counts = defaultdict(int)
for text, date, username in tweet_data:
    if date.strftime("%Y-%m") == "2024-08":
        week = date.strftime("%Y-%W")
        weekly_counts[week] += 1

# Get the busiest week
busiest_week = max(weekly_counts.items(), key=lambda x: x[1])[0]
print(f"Using week {busiest_week} with {weekly_counts[busiest_week]} tweets")

# Collect that week's tweets. Only August tweets have embeddings, and row k of
# `embeddings_tensor` belongs to the k-th August tweet in tweet_data order, so
# the August row is tracked alongside the week-local row.
week_index_map = {}                    # tweet_data index -> week-local row
week_user_tweets = defaultdict(list)   # username -> list of week-local rows
week_orig_indices = []                 # week-local row -> tweet_data index
week_embedding_rows = []               # week-local row -> row in embeddings_tensor

august_row = 0
for orig_idx, (text, date, username) in enumerate(tweet_data):
    if date.strftime("%Y-%m") != "2024-08":
        continue  # no embedding exists for tweets outside August
    if date.strftime("%Y-%W") == busiest_week:
        week_row = len(week_orig_indices)
        week_index_map[orig_idx] = week_row
        week_user_tweets[username].append(week_row)
        week_orig_indices.append(orig_idx)
        week_embedding_rows.append(august_row)
    august_row += 1

# Get embeddings for just that week (rows selected by their August position)
week_embeddings = embeddings_tensor[week_embedding_rows]

# Compute full cosine-similarity matrix (rows are unit-normalised)
print("Computing similarity matrix...")
sim_matrix = torch.mm(week_embeddings, week_embeddings.T)

# Convert to distances (1 - similarity since vectors are normalized)
dist_matrix = 1 - sim_matrix.cpu().numpy()
# Floating-point noise can make the matrix slightly asymmetric or negative,
# which squareform's validity check rejects; symmetrise and clip.
dist_matrix = np.clip((dist_matrix + dist_matrix.T) / 2, 0, None)

# Ensure diagonal is exactly zero
np.fill_diagonal(dist_matrix, 0)

# Convert to condensed form (upper triangular, no diagonal)
condensed_dist = squareform(dist_matrix)

# Compute linkage using the condensed distance matrix
print("Computing hierarchical clustering...")
linkage_matrix = linkage(condensed_dist, method='average')

# Fixed seed so the (purely cosmetic) jitter is reproducible between runs
rng = np.random.default_rng(0)

cluster_points = []
n_samples = len(week_embeddings)

print(f"Starting with {n_samples} samples")

# Initialize: each point starts as its own cluster with one tweet
cluster_tweets = {
    row: {(tweet_data[orig_idx][0], tweet_data[orig_idx][2])}  # (text, username)
    for row, orig_idx in enumerate(week_orig_indices)
}

# Each linkage row is (left id, right id, distance, size of merged cluster).
# Ids below n_samples are original tweets; merge i creates id n_samples + i,
# which is exactly the key earlier merges were stored under.
for i, merge in enumerate(linkage_matrix):
    left_idx = int(merge[0])
    right_idx = int(merge[1])
    new_idx = n_samples + i

    # Store merged tweets
    cluster_tweets[new_idx] = cluster_tweets[left_idx] | cluster_tweets[right_idx]

    # Only create visualization points for clusters >= 3
    if merge[3] >= 3:
        cluster_points.append({
            'size': int(merge[3]),
            'similarity': float(1 - merge[2]),
            'hover_text': f"Natural Cluster<br>Size: {int(merge[3])}<br>Similarity: {1-merge[2]:.3f}",
            'tweets': [f"@{username}: {text}" for text, username in cluster_tweets[new_idx]]
        })

# Add jitter to similarities so overlapping markers stay distinguishable
jitter = rng.normal(0, 0.0001, len(cluster_points))
for i, p in enumerate(cluster_points):
    p['similarity'] = float(p['similarity'] + jitter[i])

print(f"Found {len(cluster_points)} natural clusters of size >= 3")

# Create user points with jitter. A dedicated name is used so the `users`
# list that labels the August similarity matrix is not overwritten.
user_points = []
week_users = [u for u, tweets in week_user_tweets.items() if len(tweets) >= 5]
jitter = rng.normal(0, 0.0001, len(week_users))

for i, username in enumerate(week_users):
    indices = week_user_tweets[username]
    n_tweets = len(indices)

    # Average similarity over distinct pairs of this user's tweets
    user_sim = sim_matrix[indices][:, indices]
    mask = torch.triu(torch.ones_like(user_sim), diagonal=1)
    similarities = user_sim[mask.bool()]
    avg_similarity = float(similarities.mean().item() + jitter[i])

    # Get the actual tweets for this user (direct lookup, no list scans)
    user_tweets = [f"{tweet_data[week_orig_indices[row]][0]}" for row in indices]

    user_points.append({
        'size': n_tweets,
        'similarity': avg_similarity,
        'username': username,
        'hover_text': f"User: {username}<br>Tweets: {n_tweets}<br>Similarity: {avg_similarity:.3f}",
        'tweets': user_tweets
    })

# Label only the outliers: top 5% by tweet count or by similarity.
# The thresholds are computed once rather than once per point.
if user_points:
    size_cutoff = np.percentile([u['size'] for u in user_points], 95)
    similarity_cutoff = np.percentile([u['similarity'] for u in user_points], 95)
else:
    size_cutoff = similarity_cutoff = 0
user_labels = [
    p['username'] if (p['size'] > size_cutoff or p['similarity'] > similarity_cutoff) else ''
    for p in user_points
]

# Create figure
fig = go.Figure()

# Add natural clusters (customdata = index into cluster_points)
fig.add_trace(go.Scatter(
    x=[p['similarity'] for p in cluster_points],
    y=[p['size'] for p in cluster_points],
    mode='markers',
    name='Natural Clusters',
    marker=dict(color='gray', size=8, opacity=0.6),
    hovertext=[p['hover_text'] for p in cluster_points],
    hoverinfo='text',
    customdata=list(range(len(cluster_points)))
))

# Add users (customdata continues after the cluster indices)
fig.add_trace(go.Scatter(
    x=[p['similarity'] for p in user_points],
    y=[p['size'] for p in user_points],
    mode='markers+text',
    name='Users',
    marker=dict(color='red', size=10, opacity=0.7),
    hovertext=[p['hover_text'] for p in user_points],
    hoverinfo='text',
    text=user_labels,
    textposition="top right",
    customdata=list(range(len(cluster_points), len(cluster_points) + len(user_points)))
))

# Update layout
fig.update_layout(
    title=f'Natural Clusters vs User Clusters (Week {busiest_week})',
    xaxis_title='Average Similarity',
    yaxis_title='Cluster Size (# tweets)',
    yaxis_type="log",
    hovermode='closest',
    width=1200,
    height=800,
    showlegend=True,
    template='plotly_white',
    margin=dict(r=300)
)

# Add grid
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')


def _json_for_script(payload):
    """Make a JSON string safe to inline in a <script> element.

    A literal "</script>" inside tweet text would otherwise end the script
    block early; "<\\/" is an equivalent JSON/JS escape for "</".
    """
    return payload.replace("</", "<\\/")


# Standalone page: plot on the left, tweets of the clicked point on the right.
# Tweet text is untrusted, so the click handler inserts it via textContent
# (never innerHTML).
html_content = """
<html>
<head>
    <meta charset="utf-8">
    <style>
        .container { display: flex; }
        .plot { flex: 7; }
        .tweets { 
            flex: 3; 
            padding: 10px; 
            max-height: 800px; 
            overflow-y: auto; 
            border-left: 1px solid #ccc;
        }
        .tweet {
            padding: 10px;
            border-bottom: 1px solid #eee;
            font-size: 14px;
        }
        .username {
            font-weight: bold;
            color: #1DA1F2;
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="plot" id="plot"></div>
        <div class="tweets" id="tweetDisplay"></div>
    </div>
    <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
    <script>
        const userPoints = """ + _json_for_script(json.dumps(user_points)) + """;
        const clusterPoints = """ + _json_for_script(json.dumps(cluster_points)) + """;
        
        const plot = """ + _json_for_script(fig.to_json()) + """;
        Plotly.newPlot('plot', plot.data, plot.layout);

        document.getElementById('plot').on('plotly_click', function(data) {
            const point = data.points[0];
            const idx = point.customdata;
            let tweets;
            let header;
            
            if (idx < clusterPoints.length) {
                tweets = clusterPoints[idx].tweets;
                header = `Natural Cluster (${clusterPoints[idx].size} tweets)`;
            } else {
                const userIdx = idx - clusterPoints.length;
                tweets = userPoints[userIdx].tweets;
                header = `User: ${userPoints[userIdx].username} (${userPoints[userIdx].size} tweets)`;
            }
            
            const display = document.getElementById('tweetDisplay');
            display.innerHTML = '';
            const title = document.createElement('h3');
            title.textContent = header;
            display.appendChild(title);
            for (let i = 0; i < tweets.length; i++) {
                const card = document.createElement('div');
                card.className = 'tweet';
                const body = document.createElement('div');
                body.className = 'text';
                body.textContent = tweets[i];
                card.appendChild(body);
                display.appendChild(card);
            }
        });
    </script>
</body>
</html>
"""

# Save to HTML
with open("cluster_comparison.html", "w", encoding="utf-8") as f:
    f.write(html_content)

print("\nStatistics:")
print(f"Number of natural clusters: {len(cluster_points)}")
print(f"Number of users: {len(user_points)}")
print(f"Average similarity: {np.mean([p['similarity'] for p in user_points]):.3f}")

In [None]:
# Peek at the first ten entries: cluster id -> set of (text, username) pairs.
list(cluster_tweets.items())[:10]