# Network Analysis

This notebook builds social networks, calculates network metrics, performs community detection, and analyzes structural differences between misinformation and legitimate news networks.


In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

# Put the parent of the current working directory on the import path so the
# project-local `src` package and `config` module below can be imported.
# NOTE(review): this presumably assumes the notebook is launched from a
# sub-folder of the project root — confirm.
sys.path.append(str(Path().resolve().parent))
from src import network_builder, data_preprocessing, visualization
import config

# Apply one matplotlib style sheet to every figure drawn in this notebook.
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8')


## Load and Prepare Data


In [None]:
# Load data (using the sample dataset for demonstration).
# In practice, load your actual dataset here.
df = data_preprocessing.create_sample_dataset(n_samples=1000)

# Network analysis needs user-to-user interactions. When the dataset does not
# carry them, simulate an interaction type and a target user for every row.
if 'interaction_type' not in df.columns:
    # Simulate retweet/reply/mention interactions, one per row.
    df['interaction_type'] = np.random.choice(['retweet', 'reply', 'mention'], size=len(df))
    # Draw exactly one target per row, with replacement, from the existing
    # user IDs. Sampling only a fraction of the rows would yield an array
    # shorter than the frame, and assigning it as a column raises ValueError.
    # `.to_numpy()` drops the sampled index so values are assigned by position
    # instead of being re-aligned back onto their original rows.
    df['target_user_id'] = df['user_id'].sample(n=len(df), replace=True).to_numpy()

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
# Last expression of the cell: the notebook renders the first rows.
df.head()


## Build Interaction Graph


In [None]:
# Turn the interaction records into a directed user-to-user graph.
# The column names are collected in one place so they are easy to adapt
# to a dataset that uses a different schema.
graph_options = {
    "user_column": "user_id",
    "interaction_column": "interaction_type",
    "target_column": "target_user_id",
    "timestamp_column": "timestamp",
    "directed": True,
}
G = network_builder.build_interaction_graph(df, **graph_options)

# Short summary of the resulting graph.
print("Graph created:")
for summary_label, summary_getter in (
    ("Nodes", G.number_of_nodes),
    ("Edges", G.number_of_edges),
    ("Is directed", G.is_directed),
):
    print(f"  {summary_label}: {summary_getter()}")


## Network Statistics


In [None]:
# Compute the graph-level statistics and print them as an aligned table:
# metric names are padded to 25 characters so the values line up.
stats = network_builder.calculate_network_statistics(G)

stats_rule = "=" * 50
print("Network Statistics:")
print(stats_rule)
for metric_name, metric_value in stats.items():
    print("{:25s}: {}".format(metric_name, metric_value))


## Centrality Measures


In [None]:
# Per-node centrality table, with every optional measure switched on.
centrality_options = {
    "include_betweenness": True,
    "include_closeness": True,
    "include_eigenvector": True,
}
centrality_df = network_builder.calculate_centrality_measures(G, **centrality_options)

section_rule = "=" * 80

# First ten rows of the table.
# NOTE(review): the heading says "by degree"; that is only accurate if
# calculate_centrality_measures returns rows sorted by degree — confirm.
print("Centrality Measures (Top 10 nodes by degree):")
print(section_rule)
print(centrality_df.head(10))

# Summary statistics of every centrality column.
print("\n\nCentrality Statistics:")
print(section_rule)
print(centrality_df.describe())


## Community Detection


In [None]:
# Detect communities using the Louvain algorithm.
# `communities` is used below as a mapping of node -> community ID.
communities = network_builder.detect_communities(G, algorithm="louvain")

print(f"Number of communities detected: {len(set(communities.values()))}")
print("\nCommunity size distribution:")
# Member count per community, ordered by community ID.
community_sizes = pd.Series(list(communities.values())).value_counts().sort_index()
print(community_sizes.head(10))

# Visualize community distribution.
# The chart is titled "Top 20", so plot the 20 largest communities. Taking
# head(20) of the ID-ordered series would show the 20 lowest community IDs
# regardless of their size.
largest_communities = community_sizes.nlargest(20)
fig, ax = plt.subplots(figsize=(10, 6))
largest_communities.plot(kind='bar', ax=ax, color='steelblue')
ax.set_xlabel('Community ID', fontsize=12)
ax.set_ylabel('Number of Nodes', fontsize=12)
ax.set_title('Community Size Distribution (Top 20)', fontsize=14, fontweight='bold')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Visualize Network


In [None]:
# Baseline colouring: one matplotlib cycle colour ("C0".."C9") per community,
# wrapping around after ten communities.
node_colors = {}
for node, comm_id in communities.items():
    node_colors[node] = f"C{comm_id % 10}"

# When labels are available they take precedence over the community colours:
# a user whose first recorded label equals 1 is drawn red, every other user
# (including users without any label) is drawn blue.
if 'label' in df.columns:
    user_labels = df.groupby('user_id')['label'].first().to_dict()
    node_colors = {}
    for node in G.nodes():
        node_colors[node] = 'red' if user_labels.get(node, 0) == 1 else 'blue'
    title = "Social Network - Misinformation Spreaders (Red) vs Legitimate (Blue)"
else:
    title = "Social Network with Communities"

# Decide what to draw. Graphs with more than 100 nodes are reduced to the
# subgraph induced by the first 100 nodes (in graph iteration order); nodes
# without an assigned colour fall back to gray.
if G.number_of_nodes() > 100:
    nodes_sample = list(G.nodes())[:100]
    G_viz = G.subgraph(nodes_sample)
    node_colors_viz = {}
    for node in nodes_sample:
        node_colors_viz[node] = node_colors.get(node, 'gray')
    plot_graph = G_viz
    plot_colors = node_colors_viz
    plot_title = f"{title} (Sample of 100 nodes)"
else:
    plot_graph = G
    plot_colors = node_colors
    plot_title = title

# Single plotting call shared by both cases.
visualization.plot_network_graph(
    plot_graph,
    node_colors=plot_colors,
    title=plot_title,
    layout="spring"
)


## Information Cascades


In [None]:
# Cascade analysis is only possible when posts can be told apart,
# so bail out early if the dataset has no post identifier.
if 'post_id' not in df.columns:
    print("Post ID column not found. Skipping cascade analysis.")
else:
    cascade_options = {
        "user_column": "user_id",
        "timestamp_column": "timestamp",
        "post_id_column": "post_id",
    }
    cascades = network_builder.identify_information_cascades(df, **cascade_options)

    print(f"Number of cascades identified: {len(cascades)}")

    # One row per cascade, one column per metric read from the cascade records.
    cascade_metrics = pd.DataFrame({
        metric: [cascade[metric] for cascade in cascades.values()]
        for metric in ('depth', 'breadth', 'propagation_speed')
    })

    print("\nCascade Metrics:")
    print(cascade_metrics.describe())

    # Draw the first cascade (in iteration order) as an example, if any exist.
    if len(cascades) > 0:
        sample_cascade_id = next(iter(cascades.keys()))
        visualization.plot_information_cascade(cascades[sample_cascade_id])


## Export Network


In [None]:
# Export the network for later use.
# The path is relative to the current working directory.
output_path = Path("../data/networks/social_network.graphml")
# Create the target folder first so the export does not fail on a fresh
# checkout where ../data/networks does not exist yet. This is a no-op when the
# folder is already present.
output_path.parent.mkdir(parents=True, exist_ok=True)
network_builder.export_network(G, output_path, format="graphml")
print(f"Network exported to {output_path}")
