In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import loadmat
import networkx as nx
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

In [None]:
# --- Global notebook settings ---
# Matplotlib defaults shared by every figure, followed by the seaborn grid style
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})
sns.set_style("whitegrid")

# Directory the numbered .mat network files are read from
dataset_dir = '../dataset/'

# GLOBAL CONFIGURATION: True selects the weighted analysis, False the unweighted one
is_weighted = False
# Prefix used in the file names of the saved figures
file_prefix = 'unweighted' if not is_weighted else 'weighted'

In [None]:
print("Power Grid Network Analysis")
print("=" * 50)

# Load all networks and basic statistics.
# networks:      network id -> undirected NetworkX graph
# network_stats: one summary dict per successfully loaded network
networks = {}
network_stats = []

print("Loading networks...")
for i in range(1, 59):
    try:
        data = loadmat(os.path.join(dataset_dir, f'{i}.mat'))
        adj_matrix = data['A']
        # NOTE(review): .toarray() presumes 'A' is stored as a sparse matrix — confirm for every file
        adj_np = adj_matrix.toarray()
        assert (adj_np == adj_np.T).all(), "Adjacency matrix is not symmetric"
        assert np.diag(adj_np).sum() == 0, "Adjacency matrix has self-loops"

        # Convert sparse matrix to NetworkX graph
        if is_weighted and 'W' in data:
            # Use distance matrix as weights
            # NOTE(review): the checks above validate 'A' only; the edges here come from
            # the non-zero entries of 'W' — confirm that 'W' has the same sparsity pattern
            weight_matrix = data['W']
            G = nx.from_scipy_sparse_array(weight_matrix)
            print(f"Network {i}: Using weighted graph with distance matrix")
        else:
            # Use unweighted adjacency matrix
            G = nx.from_scipy_sparse_array(adj_matrix)
            if is_weighted:
                print(f"Network {i}: Warning - 'W' matrix not found, using unweighted")

        # Remove self-loops and ensure undirected
        G.remove_edges_from(nx.selfloop_edges(G))
        G = G.to_undirected()

        networks[i] = G

        # Collect basic statistics
        stats = {
            'network_id': i,
            'nodes': G.number_of_nodes(),
            'edges': G.number_of_edges(),
            'density': nx.density(G),
            'is_connected': nx.is_connected(G)
        }
        network_stats.append(stats)

        if i % 10 == 0:
            print(f"Loaded network {i}")

    except Exception as e:
        # Deliberate best effort: a file that is missing or fails validation is
        # reported and skipped so the remaining networks can still be analysed
        print(f"Error loading network {i}: {e}")

print(f"\nSuccessfully loaded {len(networks)} networks")

# Fail early with a clear message: with nothing loaded the statistics frame is
# empty and the summary calls below would raise an opaque pandas error instead
if not networks:
    raise RuntimeError(
        f"No networks could be loaded from '{dataset_dir}'; "
        "check the dataset path and the errors reported above"
    )

# Convert to DataFrame for easy analysis
stats_df = pd.DataFrame(network_stats)
print(f"\nNetwork Statistics Summary:")
print(stats_df.describe())

print(f"\nTop 5 largest networks by number of nodes:")
# largest_networks is reused later to pick the networks for the closeness analysis
largest_networks = stats_df.nlargest(5, 'nodes')
print(largest_networks[['network_id', 'nodes', 'edges']])

In [None]:
# =============================================================================
# TASK 1: Average Nearest Neighbor Degree (Knn) Analysis
# =============================================================================

# Section banner: blank line, 50-character rule, task title, closing rule
print("\n" + "=" * 50, "TASK 1: Average Nearest Neighbor Degree Analysis", "=" * 50, sep="\n")

def compute_knn(G):
    """
    Build the degree-resolved nearest-neighbour degree function Knn(k).

    Every node contributes its average neighbour degree (as reported by
    ``nx.average_neighbor_degree``) to the bucket of its own degree; each
    bucket is then reduced to its mean.

    Parameters:
        G: graph accepted by ``nx.average_neighbor_degree`` that also
           provides ``G.degree()``.

    Returns:
        dict {degree: avg_neighbor_degree} with one entry per degree value
        present in G.
    """
    node_degree = dict(G.degree())
    per_node_knn = nx.average_neighbor_degree(G)

    # Bucket the per-node values by the degree of the node they belong to
    buckets = defaultdict(list)
    for node in per_node_knn:
        buckets[node_degree[node]].append(per_node_knn[node])

    # Collapse each bucket to its mean
    return {k: np.mean(values) for k, values in buckets.items()}

# Compute Knn for all networks
all_knn_data = {}    # network id -> {degree: mean neighbour degree}
all_degrees = set()  # every degree value seen in any network

print("Computing Knn for all networks...")
for net_id, G in networks.items():
    knn_dict = compute_knn(G)
    all_knn_data[net_id] = knn_dict
    all_degrees.update(knn_dict.keys())

all_degrees = sorted(all_degrees)

# The figures below are written to ../results/; create the directory up front
# so savefig cannot fail because it is missing
os.makedirs('../results', exist_ok=True)

# Plot 1: All Knn curves together
plt.figure(figsize=(14, 8))
# One colour per network, sampled evenly from the tab20 colormap
colors = plt.cm.tab20(np.linspace(0, 1, len(networks)))

for i, (net_id, knn_dict) in enumerate(all_knn_data.items()):
    degrees = sorted(knn_dict.keys())
    knn_values = [knn_dict[d] for d in degrees]
    plt.plot(degrees, knn_values, alpha=0.9, color=colors[i], linewidth=1,
             label=f'Grid {net_id}')

plt.xlabel('Node Degree (k)')
plt.ylabel('Average Nearest Neighbor Degree Knn(k)')
# Report the number of networks actually loaded instead of a hard-coded 58
plt.title(f'Average Nearest Neighbor Degree for All {len(networks)} Power Grids')
# plt.xscale('log')
# plt.yscale('log')
plt.grid(True, alpha=0.9)
plt.savefig('../results/{}_knn_plot.pdf'.format(file_prefix), dpi=600)
# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', ncol=2)
# plt.tight_layout()
plt.show()

# Plot 2: Average Knn across all networks
print("Computing ensemble average...")

# Collect, for each degree, the Knn value of every network that has that degree
degree_knn_collections = defaultdict(list)
for net_id, knn_dict in all_knn_data.items():
    for degree, knn_val in knn_dict.items():
        degree_knn_collections[degree].append(knn_val)

# Calculate mean and std for each degree
ensemble_degrees = []
ensemble_mean = []
ensemble_std = []

for degree in sorted(degree_knn_collections.keys()):
    if len(degree_knn_collections[degree]) >= 3:  # Only include degrees with at least 3 data points
        ensemble_degrees.append(degree)
        ensemble_mean.append(np.mean(degree_knn_collections[degree]))
        ensemble_std.append(np.std(degree_knn_collections[degree]))

ensemble_degrees = np.array(ensemble_degrees)
ensemble_mean = np.array(ensemble_mean)
ensemble_std = np.array(ensemble_std)

plt.figure(figsize=(12, 8))
plt.errorbar(ensemble_degrees, ensemble_mean, yerr=ensemble_std,
             fmt='o-', capsize=5, capthick=2, linewidth=2, markersize=6,
             color='red', ecolor='darkred', alpha=0.8)
# Shaded band: one standard deviation around the ensemble mean
plt.fill_between(ensemble_degrees, ensemble_mean - ensemble_std,
                 ensemble_mean + ensemble_std, alpha=0.2, color='red')

plt.xlabel('Node Degree (k)')
plt.ylabel('Ensemble Average Knn(k)')
plt.title(f'Ensemble Average of Nearest Neighbor Degree Across {len(networks)} Power Grids')
# plt.xscale('log')
# plt.yscale('log')
plt.grid(True, alpha=0.8)
# plt.tight_layout()
plt.savefig('../results/{}_knn_avg_plot.pdf'.format(file_prefix), dpi=600)
plt.show()

print(f"Infrastructure Ensemble Trends Discussion:")
print(f"- Total degree range observed: {min(all_degrees)} to {max(all_degrees)}")
print(f"- Number of networks contributing to ensemble: {len(networks)}")
print(f"- Degree values with sufficient data (≥3 networks): {len(ensemble_degrees)}")

# Analyze trend
# log10 is only defined for positive values: a degree of 0 or a zero mean Knn
# would become -inf and either break the fit or yield a NaN exponent that is
# then reported as "neutral mixing". Such points are left out of the fit.
fit_mask = (ensemble_degrees > 0) & (ensemble_mean > 0)
if np.count_nonzero(fit_mask) > 5:
    # Fit power law: Knn(k) ~ k^γ
    log_k = np.log10(ensemble_degrees[fit_mask])
    log_knn = np.log10(ensemble_mean[fit_mask])

    # Linear fit in log-log space; the slope is the exponent γ
    coeffs = np.polyfit(log_k, log_knn, 1)
    gamma = coeffs[0]

    print(f"- Power law exponent γ ≈ {gamma:.3f}")
    if gamma < -0.1:
        print("- Networks show disassortative mixing (high-degree nodes connect to low-degree nodes)")
    elif gamma > 0.1:
        print("- Networks show assortative mixing (high-degree nodes connect to high-degree nodes)")
    else:
        print("- Networks show neutral mixing patterns")

In [None]:
# =============================================================================
# TASK 2: Normalized Betweenness Centrality Analysis
# =============================================================================

print("\n" + "="*50)
print("TASK 2: Normalized Betweenness Centrality Analysis")
print("="*50)

# Create the output directory before the expensive computation, so a missing
# ../results/ cannot make savefig fail after all the work has been done
os.makedirs('../results', exist_ok=True)

# Normalized betweenness of every node, pooled over all networks
all_betweenness = []

# In weighted mode shortest paths use the 'weight' edge attribute;
# weight=None is the library default (unweighted shortest paths)
bc_weight = 'weight' if is_weighted else None

print("Computing betweenness centrality for all networks...")
for net_id, G in networks.items():
    # Compute normalized betweenness centrality
    bc = nx.betweenness_centrality(G, normalized=True, weight=bc_weight)
    all_betweenness.extend(bc.values())

    if net_id % 10 == 0:
        print(f"Processed network {net_id}")

all_betweenness = np.array(all_betweenness)

# The statistics and percentile below are undefined for an empty sample
if all_betweenness.size == 0:
    raise RuntimeError("No betweenness values were computed; `networks` is empty or contains no nodes")

print(f"\nBetweenness Centrality Statistics:")
print(f"Total nodes across all networks: {len(all_betweenness)}")
print(f"Mean betweenness centrality: {np.mean(all_betweenness):.6f}")
print(f"Std betweenness centrality: {np.std(all_betweenness):.6f}")
print(f"Max betweenness centrality: {np.max(all_betweenness):.6f}")
print(f"Min betweenness centrality: {np.min(all_betweenness):.6f}")

# Plot histogram of betweenness centrality (two stacked panels)
plt.figure(figsize=(10, 12))

# Subplot 1: Linear scale histogram
plt.subplot(2, 1, 1)
plt.hist(all_betweenness, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
plt.xlabel('Normalized Betweenness Centrality')
plt.ylabel('Frequency')
plt.title('Distribution of Betweenness Centrality (Linear Scale)')
plt.grid(True, alpha=0.3)

# Subplot 2: strictly positive values only, with a logarithmic frequency axis
# (for better visualization of the tail)
plt.subplot(2, 1, 2)
plt.hist(all_betweenness[all_betweenness > 0], bins=50, alpha=0.7, color='lightcoral', edgecolor='black')
plt.xlabel('Normalized Betweenness Centrality')
plt.ylabel('Frequency')
plt.title('Distribution of Betweenness Centrality (Log Scale)')
plt.yscale('log')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../results/{}_betweenness_distribution.pdf'.format(file_prefix), dpi=600)
plt.show()

print(f"\nBetweenness Centrality Discussion:")
print(f"- Most nodes have very low betweenness centrality (close to 0)")
print(f"- Few nodes have high betweenness centrality, indicating they are critical bridges")
print(f"- Distribution is highly right-skewed, typical of scale-free networks")

# Identify high-centrality nodes: those strictly above the 99th percentile
high_centrality_threshold = np.percentile(all_betweenness, 99)
n_high_centrality = int(np.sum(all_betweenness > high_centrality_threshold))
print(f"- 99th percentile betweenness centrality: {high_centrality_threshold:.6f}")
print(f"- {n_high_centrality} nodes ({100*n_high_centrality/len(all_betweenness):.2f}%) have very high centrality")


In [None]:
# =============================================================================
# TASK 3: Closeness Centrality for Largest Networks and Correlation Analysis
# =============================================================================

print("\n" + "="*50)
print("TASK 3: Closeness Centrality Analysis for Largest Networks")
print("="*50)

# Create the output directory before the expensive computation, so a missing
# ../results/ cannot make savefig fail after all the work has been done
os.makedirs('../results', exist_ok=True)

# Get the 5 largest networks (selected by node count in the loading cell)
largest_5_ids = largest_networks['network_id'].values
print(f"Analyzing closeness centrality for networks: {largest_5_ids}")

closeness_data = {}       # network id -> list of closeness values
betweenness_data = {}     # network id -> list of betweenness values (same node order)
correlation_results = []  # one record per network

# In weighted mode both measures use the 'weight' edge attribute as path length;
# None is the library default (unweighted shortest paths)
weight_key = 'weight' if is_weighted else None

print("\nComputing closeness and betweenness centralities...")
for net_id in largest_5_ids:
    G = networks[net_id]
    print(f"Processing network {net_id} ({G.number_of_nodes()} nodes, {G.number_of_edges()} edges)")

    # Compute centralities
    closeness_centrality = nx.closeness_centrality(G, distance=weight_key)
    betweenness_centrality = nx.betweenness_centrality(G, normalized=True, weight=weight_key)
    if is_weighted:
        print(f"  Using weighted centrality measures")
    else:
        print(f"  Using unweighted centrality measures")

    # Store data
    closeness_data[net_id] = list(closeness_centrality.values())
    betweenness_data[net_id] = list(betweenness_centrality.values())

    # Pearson correlation between the two measures within this network
    cc_values = np.array(closeness_data[net_id])
    bc_values = np.array(betweenness_data[net_id])

    correlation = np.corrcoef(cc_values, bc_values)[0, 1]

    correlation_results.append({
        'network_id': net_id,
        'nodes': G.number_of_nodes(),
        'correlation': correlation
    })

# Convert to DataFrame
correlation_df = pd.DataFrame(correlation_results)
print(f"\nCorrelation Results:")
print(correlation_df)

# Plot comprehensive analysis (3x2 grid; the last panel is left empty)
fig, axes = plt.subplots(3, 2, figsize=(16, 18))

# Plot 1 & 2: Distribution comparisons, pooled over the selected networks
all_closeness = []
all_betweenness_largest = []

for net_id in largest_5_ids:
    all_closeness.extend(closeness_data[net_id])
    all_betweenness_largest.extend(betweenness_data[net_id])

axes[0, 0].hist(all_closeness, bins=30, alpha=0.7, color='purple', edgecolor='black')
axes[0, 0].set_xlabel('Closeness Centrality')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Distribution of Closeness Centrality\n(5 Largest Networks)')
axes[0, 0].grid(True, alpha=0.3)

axes[0, 1].hist(all_betweenness_largest, bins=30, alpha=0.7, color='orange', edgecolor='black')
axes[0, 1].set_xlabel('Betweenness Centrality')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Distribution of Betweenness Centrality\n(5 Largest Networks)')
axes[0, 1].grid(True, alpha=0.3)

# Plot 3: Correlation scatter plot (combined)
axes[1, 0].scatter(all_closeness, all_betweenness_largest, alpha=0.6, s=20)
axes[1, 0].set_xlabel('Closeness Centrality')
axes[1, 0].set_ylabel('Betweenness Centrality')
axes[1, 0].set_title('Closeness vs Betweenness Centrality\n(All 5 Networks Combined)')
axes[1, 0].grid(True, alpha=0.3)

# Add correlation coefficient to plot
overall_corr = np.corrcoef(all_closeness, all_betweenness_largest)[0, 1]
axes[1, 0].text(0.05, 0.95, f'r = {overall_corr:.3f}', transform=axes[1, 0].transAxes,
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
                verticalalignment='top', fontsize=12)

# Plot 4: Individual network correlations
colors_net = ['red', 'blue', 'green', 'purple', 'orange']
for i, net_id in enumerate(largest_5_ids):
    axes[1, 1].scatter(closeness_data[net_id], betweenness_data[net_id],
                       alpha=0.6, s=15, color=colors_net[i], label=f'Net {net_id}')

axes[1, 1].set_xlabel('Closeness Centrality')
axes[1, 1].set_ylabel('Betweenness Centrality')
axes[1, 1].set_title('Closeness vs Betweenness by Network')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# Plot 5: Correlation coefficients bar chart, same colour per network as in plot 4
axes[2, 0].bar(range(len(correlation_df)), correlation_df['correlation'],
               color=colors_net[:len(correlation_df)])
axes[2, 0].set_xlabel('Network ID')
axes[2, 0].set_ylabel('Correlation Coefficient')
axes[2, 0].set_title('Correlation between Closeness and Betweenness\nby Network')
axes[2, 0].set_xticks(range(len(correlation_df)))
axes[2, 0].set_xticklabels([f'Net {nid}' for nid in correlation_df['network_id']])
axes[2, 0].grid(True, alpha=0.3)
axes[2, 0].axhline(y=0, color='black', linestyle='-', linewidth=0.5)

# The sixth panel is intentionally unused
axes[2, 1].axis("off")

plt.tight_layout()
plt.savefig('../results/{}_closeness_betweenness_analysis.pdf'.format(file_prefix), dpi=600)
plt.show()

# Summary statistics
print(f"\nCloseness Centrality Summary (5 largest networks):")
print(f"Mean closeness centrality: {np.mean(all_closeness):.6f}")
print(f"Std closeness centrality: {np.std(all_closeness):.6f}")
print(f"Max closeness centrality: {np.max(all_closeness):.6f}")
print(f"Min closeness centrality: {np.min(all_closeness):.6f}")

print(f"\nCorrelation Analysis Summary:")
print(f"Overall correlation (all 5 networks): {overall_corr:.3f}")
print(f"Mean correlation across networks: {correlation_df['correlation'].mean():.3f}")
print(f"Std correlation across networks: {correlation_df['correlation'].std():.3f}")

# Classify the pooled correlation by sign and magnitude. Negative and undefined
# (NaN) values get their own messages instead of being reported as "no correlation".
print(f"\nCorrelation Discussion:")
if np.isnan(overall_corr):
    print("- Correlation is undefined (e.g. one of the centrality measures is constant)")
elif overall_corr > 0.7:
    print("- Strong positive correlation between closeness and betweenness centrality")
elif overall_corr > 0.3:
    print("- Moderate positive correlation between closeness and betweenness centrality")
elif overall_corr > 0:
    print("- Weak positive correlation between closeness and betweenness centrality")
elif overall_corr < -0.7:
    print("- Strong negative correlation between closeness and betweenness centrality")
elif overall_corr < -0.3:
    print("- Moderate negative correlation between closeness and betweenness centrality")
elif overall_corr < 0:
    print("- Weak negative correlation between closeness and betweenness centrality")
else:
    print("- Little to no correlation between closeness and betweenness centrality")

print("- Nodes with high closeness centrality tend to be close to all other nodes")
print("- Nodes with high betweenness centrality lie on many shortest paths")
print("- In power grids, both measures often identify critical infrastructure nodes")

print(f"\nAnalysis Complete!")
print(f"Summary: Analyzed {len(networks)} power grid networks with comprehensive centrality analysis.")