In [None]:
import subprocess, shlex, time, re, socket, os, json, glob
import ipaddress
import threading
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from urllib.parse import urlparse

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
# Traceroute invocation string.
# NOTE(review): not referenced by the analysis code in this section; presumably
# consumed by the data-collection part of the notebook - verify.
TRACEROUTE_CMD = "traceroute -q 1 -w 5 -m 64"
# Folder that load_all_results() scans for traceroute_*.json files by default.
RESULTS_FOLDER = "traceroute_results"
# Folder the unified-graph figure is saved into.
VISUALIZATIONS_FOLDER = "visualizations"
# Default source label used by build_individual_graph() when the caller passes none.
SOURCE_LOCATION = "Ryon_111"

# Create both folders up front; exist_ok makes re-running this cell harmless.
os.makedirs(RESULTS_FOLDER, exist_ok=True)
os.makedirs(VISUALIZATIONS_FOLDER, exist_ok=True)

# hop_and_time_summary() also uses this value as the time charged to a
# destination that has no recorded timing.
TIMEOUT_PER_TRACEROUTE = 180  # seconds
MAX_THREADS = 15          # maximum number of parallel threads

# Target hosts.
# NOTE(review): the analysis functions in this section take their destinations
# from the keys of each loaded result's route_map, not from this list - verify
# where this list is consumed.
destinations = [
	"aut.ac.ir",
	"www.wsj.com",
	"www.coolmathgames.com",
	"asce.rice.edu",
	"www.pokemon.co.jp",
	"www.nytimes.com",
	"www.iitb.ac.in",
	"umich.edu",
	"www.taobao.com",
	"www.universalorlando.com",
	"fried.rice.edu",
	"portal.ehawaii.gov",
	"yu-gi-oh.jp"
]

### Part 2: Load all results from the different source IPs, then analyze and visualize them

In [None]:
def load_all_results(folder=RESULTS_FOLDER):
	"""Load every saved traceroute result file found in ``folder``.

	Files matching ``traceroute_*.json`` are read in sorted filename order, so
	repeated runs process the results in the same order regardless of how the
	operating system lists the directory.

	Parameters
	----------
	folder : str
		Directory that holds the JSON result files.

	Returns
	-------
	list
		One parsed JSON object (dict) per file that could be read. Empty when
		the folder does not exist or contains no readable result file.
	"""
	if not os.path.exists(folder):
		print(f"Results folder {folder} not found!")
		return []

	results = []
	# glob returns matches in arbitrary order; sort to keep runs reproducible.
	for filepath in sorted(glob.glob(os.path.join(folder, "traceroute_*.json"))):
		try:
			with open(filepath, 'r') as f:
				data = json.load(f)
		except (OSError, ValueError) as e:
			# ValueError covers json.JSONDecodeError and UnicodeDecodeError.
			print(f"Error loading {filepath}: {e}")
			continue

		if not isinstance(data, dict):
			# Downstream code indexes results by key, so anything else is unusable.
			print(f"Error loading {filepath}: expected a JSON object, got {type(data).__name__}")
			continue

		results.append(data)
		# .get() keeps a missing 'source_ip' from being reported as a load error
		# for a file that was in fact loaded and appended.
		print(f"Loaded: {os.path.basename(filepath)} (source: {data.get('source_ip', 'unknown')})")

	print(f"Loaded {len(results)} result files")
	return results

def filter_none_hops(hops):
	"""Return only the hops that carry some data, in their original order.

	A hop is kept when its host or IP is truthy, or when its ``rtt_ms`` is
	anything other than None; hops with none of these are dropped.
	"""
	def has_data(entry):
		# An RTT of 0 still counts as data, hence the explicit None test.
		return bool(entry.get('host') or entry.get('ip')) or entry.get('rtt_ms') is not None

	return [entry for entry in hops if has_data(entry)]

def filter_hop_map(hop_map):
	"""Run filter_none_hops over every route and drop routes left with no hops."""
	cleaned = ((dest, filter_none_hops(route)) for dest, route in hop_map.items())
	return {dest: kept for dest, kept in cleaned if kept}

In [None]:
def node_id(h):
	"""Return the graph key for hop ``h``: its IP, else its host, else ``hop<N>_unknown``."""
	return h.get('ip') or h.get('host') or f"hop{h['hop']}_unknown"

def node_label(h):
	"""Return the text shown for hop ``h``.

	Host and IP on two lines when both are present, otherwise whichever one
	exists, otherwise ``"*"``.
	"""
	host = h.get('host')
	ip = h.get('ip')
	if host and ip:
		return f"{host}\n{ip}"
	return ip or host or "*"

def classify_node_individual(nid, route_map, source_ip):
	"""Classify a node for the graph visualization.

	Categories are tested in this order and the first match wins:

	1. ``'source'``      - ``nid`` equals ``source_ip``.
	2. ``'destination'`` - ``nid`` is a key of ``route_map`` or a substring of one.
	3. ``'multi_route'`` - ``nid`` occurs in more than one route of ``route_map``.
	4. ``'unknown'``     - ``nid`` is ``"*"`` or a ``hop<N>_unknown`` placeholder.
	5. ``'ip_only'``     - ``nid`` parses as an IPv4 or IPv6 address.
	6. ``'hostname'``    - anything else.

	Parameters
	----------
	nid : str
		Node identifier as produced by node_id().
	route_map : dict
		Maps each destination name to its list of hop dicts.
	source_ip : str
		IP address of the machine the traceroutes were run from.

	Returns
	-------
	str
		One of the six category names listed above.
	"""
	dest_names = set(route_map.keys())

	if nid == source_ip:
		return 'source'

	# NOTE(review): the substring test also matches partial names (e.g. a hop
	# named "rice.edu" against destination "asce.rice.edu") - confirm intended.
	if nid in dest_names or any(nid in dest for dest in dest_names):
		return 'destination'

	# Number of routes that contain this node at least once.
	route_count = sum(
		1 for hops in route_map.values()
		if any(node_id(hop) == nid for hop in hops)
	)
	if route_count > 1:
		return 'multi_route'

	# Placeholders contain no dots, so they have to be recognised before the
	# hostname fallback; same placeholder test as build_individual_graph().
	if nid == "*" or (nid.startswith("hop") and "unknown" in nid):
		return 'unknown'

	# Parse rather than count dots: hostnames such as "www.iitb.ac.in" have
	# three dots too, and IPv6 addresses have none.
	try:
		ipaddress.ip_address(nid)
	except ValueError:
		return 'hostname'
	return 'ip_only'

def build_individual_graph(G, route_map, source_ip, route_status, source_location=SOURCE_LOCATION):
	"""Add one source's routes to ``G`` in place.

	The source is a node named after ``source_location``. Every key of
	``route_map`` becomes a node of type 'destination', whether or not it was
	reached. The hops of each route are chained together starting at the
	source node. When ``route_status`` marks a destination as reached, the
	final hop is left out (to avoid duplicating the destination) and the chain
	is closed on the destination node instead. Hops whose id is a
	``hop...unknown`` placeholder are skipped.

	Returns None; the result is the mutation of ``G``.
	"""
	origin = f"{source_location}"
	G.add_node(origin, label=origin, node_type='source')

	for dest, hops in route_map.items():
		reached = route_status.get(dest, False)

		# The destination node exists even when the route never got there.
		G.add_node(dest, label=dest, node_type='destination')

		# For a reached destination the last hop is not added as its own node.
		walk = hops[:-1] if reached else hops

		tail = origin
		for hop in walk:
			nid = node_id(hop)

			# Placeholder ids stand for hops without host or IP; leave them out.
			if nid.startswith("hop") and "unknown" in nid:
				continue

			if nid not in G:
				kind = classify_node_individual(nid, route_map, source_ip)
				G.add_node(nid, label=nid, node_type=kind)

			# Unweighted edge to the previous node; never a self-loop.
			if tail != nid:
				G.add_edge(tail, nid)
			tail = nid

		# Close the chain on the destination only when it was actually reached.
		if reached and hops and tail != dest:
			G.add_edge(tail, dest)

def build_unified_graph(all_results):
	"""Build a single undirected, unweighted graph from all results.

	Each result contributes its filtered routes through
	build_individual_graph(). Afterwards every node that is neither a source
	nor a destination and that lies on more than one route - counted across
	all results, once per route - is typed ``'multi_route'``.

	Parameters
	----------
	all_results : list of dict
		Loaded result objects; each must provide 'source_ip',
		'source_location', 'route_map' and 'statuses'.

	Returns
	-------
	nx.Graph
		The merged graph with a ``node_type`` attribute on every node.

	Raises
	------
	ValueError
		If a result has no route left after filtering.
	"""
	G = nx.Graph()
	route_counts = Counter()

	for result in all_results:
		source_ip = result['source_ip']
		source_location = result['source_location']
		route_status = result['statuses']

		filtered_route_map = filter_hop_map(result['route_map'])
		if not filtered_route_map:
			raise ValueError("No valid routes after filtering for source: {}_{}".format(source_ip, source_location))

		# Count each node once per route it appears on. Using a set per route
		# keeps a node that repeats inside one route from being counted twice,
		# and counting unconditionally includes the first route a node is seen on.
		for hops in filtered_route_map.values():
			route_counts.update({node_id(hop) for hop in hops})

		build_individual_graph(G, filtered_route_map, source_ip, route_status, source_location=source_location)

	# Reclassify nodes shared by several routes, including routes that come
	# from different sources, which the per-source classification cannot see.
	for node in G.nodes():
		if G.nodes[node].get('node_type') not in ('source', 'destination') and route_counts[node] > 1:
			G.nodes[node]['node_type'] = 'multi_route'

	return G

def get_node_visual_props(node_type):
	"""Return the marker ``{'size', 'color'}`` for a node type.

	Types that are not listed fall back to size 200 in gray.
	"""
	palette = {
		'source': (600, 'red'),
		'destination': (600, 'blue'),
		'multi_route': (150, 'orange'),
		'hostname': (200, 'green'),
		'ip_only': (80, 'gray'),
		'unknown': (100, 'lightgray'),
	}
	size, color = palette.get(node_type, (200, 'gray'))
	return {'size': size, 'color': color}

def graph_descriptors(G: nx.Graph):
	"""Compute summary descriptors of ``G``.

	Returns a dict with the node, link and connected-component counts, density,
	average degree, an estimated meshedness clamped to [0, 1], global
	efficiency, average clustering, the cyclomatic number ``E - V + C``, the
	closeness and betweenness centrality values as lists, and the number of
	cycle-basis cycles of each length from 3 to 8.
	"""
	n_nodes = G.number_of_nodes()
	n_edges = G.number_of_edges()
	n_components = sum(1 for _ in nx.connected_components(G))
	cyclomatic = n_edges - n_nodes + n_components

	# These three are only evaluated for graphs with at least two nodes.
	if n_nodes > 1:
		density = nx.density(G)
		efficiency = nx.global_efficiency(G)
		clustering = nx.average_clustering(G)
	else:
		density = efficiency = clustering = 0.0

	avg_degree = (2 * n_edges / n_nodes) if n_nodes > 0 else 0.0

	closeness = list(nx.closeness_centrality(G).values())
	betweenness = list(nx.betweenness_centrality(G, normalized=True).values())

	# Tally cycle-basis cycles by length; only lengths 3..8 are reported.
	length_tally = Counter(len(cycle) for cycle in nx.cycle_basis(G))
	cycle_counts = {length: length_tally[length] for length in range(3, 9)}

	if n_nodes < 3:
		meshedness = 0.0
	else:
		# The denominator is floored at 1; the ratio is clamped to [0, 1].
		meshedness = max(0.0, min(1.0, cyclomatic / max(1, 2 * n_nodes - 5)))

	return {
		'nodes': n_nodes,
		'links': n_edges,
		'components': n_components,
		'density': density,
		'avg_degree': avg_degree,
		'meshedness_est': meshedness,
		'efficiency': efficiency,
		'avg_clustering': clustering,
		'cyclomatic_number': cyclomatic,
		'closeness_centrality': closeness,
		'betweenness_centrality': betweenness,
		'cycle_counts': cycle_counts
	}

def _plot_centrality_hist(ax, values, name, color):
	"""Draw the histogram panel for one centrality measure on ``ax``."""
	ax.hist(values, bins=30, alpha=0.7, color=color, edgecolor='black')
	ax.set_xlabel(f'{name} Centrality')
	ax.set_ylabel('Frequency')
	ax.set_title(f'Distribution of {name} Centrality')
	ax.grid(True, alpha=0.3)


def _print_centrality_summary(name, values):
	"""Print mean, standard deviation, maximum and minimum of one measure."""
	lower = name.lower()
	print(f"\n{name} Centrality Summary:")
	print(f"Mean {lower} centrality: {np.mean(values):.6f}")
	print(f"Std {lower} centrality: {np.std(values):.6f}")
	print(f"Max {lower} centrality: {np.max(values):.6f}")
	print(f"Min {lower} centrality: {np.min(values):.6f}")


def _pearson_or_nan(x, y):
	"""Return the Pearson correlation of ``x`` and ``y``, or NaN when undefined."""
	# With fewer than two points or a constant series the coefficient is
	# undefined; return NaN directly instead of letting numpy warn.
	if x.size < 2 or x.max() == x.min() or y.max() == y.min():
		return float('nan')
	return float(np.corrcoef(x, y)[0, 1])


def _correlation_comment(r):
	"""Return the discussion line that describes correlation coefficient ``r``."""
	pair = "between closeness and betweenness centrality"
	if np.isnan(r):
		return "- Correlation is undefined (fewer than two nodes or constant centrality values)"
	if r > 0.7:
		return f"- Strong positive correlation {pair}"
	if r > 0.3:
		return f"- Moderate positive correlation {pair}"
	if r > 0:
		return f"- Weak positive correlation {pair}"
	if r < -0.7:
		return f"- Strong negative correlation {pair}"
	if r < -0.3:
		return f"- Moderate negative correlation {pair}"
	if r < 0:
		return f"- Weak negative correlation {pair}"
	return f"- Little to no correlation {pair}"


def centrality_correlation_analysis(cc_values, bc_values, figsize=(12, 10), save_path='./closeness_betweenness_analysis.pdf'):
	"""Plot and summarize closeness vs. betweenness centrality.

	Draws a 2x2 figure (two histograms, one scatter plot annotated with the
	Pearson coefficient, one empty panel), saves it, shows it, and prints
	summary statistics plus a short interpretation of the correlation.

	Parameters
	----------
	cc_values, bc_values : sequence of float
		Closeness and betweenness centrality values, one entry per node and in
		the same node order.
	figsize : tuple
		Figure size passed to matplotlib.
	save_path : str or None
		Where the figure is written; a falsy value skips saving. The default
		is the path that was previously hard-coded.

	Returns
	-------
	None
	"""
	cc = np.asarray(cc_values, dtype=float)
	bc = np.asarray(bc_values, dtype=float)

	# Nothing to plot or summarize; min/max of an empty array would raise.
	if cc.size == 0 or bc.size == 0:
		print("No centrality values to analyze!")
		return

	fig, axes = plt.subplots(2, 2, figsize=figsize)

	_plot_centrality_hist(axes[0, 0], cc, 'Closeness', 'purple')
	_plot_centrality_hist(axes[0, 1], bc, 'Betweenness', 'orange')

	# Scatter plot of the two measures against each other.
	scatter_ax = axes[1, 0]
	scatter_ax.scatter(cc, bc, alpha=0.6, s=20)
	scatter_ax.set_xlabel('Closeness Centrality')
	scatter_ax.set_ylabel('Betweenness Centrality')
	scatter_ax.set_title('Closeness vs Betweenness Centrality')
	scatter_ax.grid(True, alpha=0.3)

	overall_corr = _pearson_or_nan(cc, bc)
	scatter_ax.text(0.05, 0.95, f'r = {overall_corr:.3f}', transform=scatter_ax.transAxes,
					bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
					verticalalignment='top', fontsize=12)

	axes[1, 1].axis("off")

	fig.tight_layout()
	if save_path:
		fig.savefig(save_path, dpi=600)
	plt.show()

	_print_centrality_summary('Closeness', cc)
	_print_centrality_summary('Betweenness', bc)

	print("\nCorrelation Analysis Summary:")
	print(f"Overall correlation: {overall_corr:.3f}")

	print("\nCorrelation Discussion:")
	print(_correlation_comment(overall_corr))

	print("- Nodes with high closeness centrality tend to be close to all other nodes")
	print("- Nodes with high betweenness centrality lie on many shortest paths")
	# Reworded: the previous text referred to power grids, not to this analysis.
	print("- In routing topologies, both measures often identify critical transit nodes")

	print("\nAnalysis Complete!")


In [None]:
def visualize_graph(G, figsize=(16,12), save_path=None, is_show=True):
	"""Draw ``G`` with sources at the bottom and destinations at the top.

	Nodes are drawn one group per ``node_type`` (missing types count as
	'unknown'), each with the size and color from get_node_visual_props().

	Parameters
	----------
	G : nx.Graph
		Graph whose nodes carry ``node_type`` and ``label`` attributes.
	figsize : tuple
		Figure size passed to matplotlib.
	save_path : str or None
		If given, the figure is written there; a missing parent folder is
		created first.
	is_show : bool
		Show the figure when True. When False the figure is closed after
		saving, so it is neither kept open nor picked up for display later.

	Returns
	-------
	None
	"""
	if G.number_of_nodes() == 0:
		print("No nodes to visualize!")
		return

	fig, ax = plt.subplots(figsize=figsize)

	# Custom layout: sources at the bottom, destinations at the top.
	pos = create_hierarchical_layout(G)

	# Group nodes by type, keeping the order in which types first appear.
	nodes_by_type = defaultdict(list)
	for node, data in G.nodes(data=True):
		nodes_by_type[data.get('node_type', 'unknown')].append(node)
	source_num = len(nodes_by_type.get('source', []))
	dest_num = len(nodes_by_type.get('destination', []))

	for node_type, nodes in nodes_by_type.items():
		props = get_node_visual_props(node_type)
		nx.draw_networkx_nodes(G, pos, nodelist=nodes,
							 node_size=props['size'],
							 node_color=props['color'],
							 alpha=0.8,
							 label=f"{node_type} ({len(nodes)})",
							 ax=ax)

	nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.6, edge_color='gray', ax=ax)

	labels = nx.get_node_attributes(G, 'label')
	nx.draw_networkx_labels(G, pos, labels=labels, font_size=8, ax=ax)

	ax.axis('off')
	ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', labelspacing=2.5)
	ax.set_title(f'Network Topology\n'
				f'{source_num} sources, {dest_num} destinations, | '
				f'{G.number_of_nodes()} nodes, {G.number_of_edges()} edges',
				fontsize=14, pad=20)
	fig.tight_layout()

	if save_path:
		out_dir = os.path.dirname(save_path)
		if out_dir:
			os.makedirs(out_dir, exist_ok=True)
		fig.savefig(save_path, dpi=600)

	if is_show:
		plt.show()
	else:
		# Release the figure; otherwise it stays registered with pyplot.
		plt.close(fig)

def _centered_positions(count, spacing):
	"""Return ``count`` x-coordinates that are ``spacing`` apart and centred on 0."""
	if count == 1:
		return [0]
	total_width = (count - 1) * spacing
	return [-total_width/2 + i * spacing for i in range(count)]


def create_hierarchical_layout(G):
	"""
	Create a hierarchical layout with sources at the bottom and destinations at the top.

	Every node that is neither a source nor a destination is placed on the
	level given by its rounded average hop distance from the sources that can
	reach it. Sources are then placed on the row y = 0 and destinations on one
	row above everything else.

	Parameters:
	-----------
	G : nx.Graph
		NetworkX graph whose nodes carry a ``node_type`` attribute
		('source', 'destination', or anything else for intermediate nodes)

	Returns:
	--------
	dict
		Position dictionary with (x, y) coordinates for each node
	"""
	y_spacing = 2.0
	x_spacing = 2.0
	source_x_spacing = 2.5
	dest_x_spacing = 3.3
	inf = float('inf')

	# Identify sources and destinations; lists keep graph order for placement,
	# sets give constant-time membership tests.
	source_nodes = []
	destination_nodes = []
	for node, data in G.nodes(data=True):
		node_type = data.get('node_type', 'unknown')
		if node_type == 'source':
			source_nodes.append(node)
		elif node_type == 'destination':
			destination_nodes.append(node)
	source_set = set(source_nodes)
	destination_set = set(destination_nodes)

	# One breadth-first search per source instead of one shortest-path query
	# per (node, source) pair; nodes a source cannot reach are absent from its dict.
	hops_from_source = [nx.single_source_shortest_path_length(G, src) for src in source_nodes]

	# Group nodes by hierarchy level, in graph order.
	levels = defaultdict(list)
	for node in G.nodes():
		if node in destination_set:
			# Destinations are forced onto the top level regardless of distance.
			level = inf
		elif node in source_set:
			level = 0
		else:
			distances = [reach[node] for reach in hops_from_source if node in reach]
			if distances:
				# Average over the sources that reach the node, rounded to a level.
				level = int(round(sum(distances) / len(distances)))
			else:
				# Disconnected node - goes onto the top level.
				level = inf
		levels[level].append(node)

	sorted_levels = sorted(level for level in levels if level != inf)
	if inf in levels:
		sorted_levels.append(inf)

	# Position nodes level by level, centred horizontally.
	pos = {}
	for level in sorted_levels:
		nodes = levels[level]
		if level == inf:
			# NOTE(review): this height is derived from the number of levels,
			# not from the highest finite level; with gaps between levels it
			# can be lower than that level - confirm this is acceptable.
			y_pos = (len(sorted_levels) - 1) * y_spacing
		else:
			y_pos = level * y_spacing
		for x, node in zip(_centered_positions(len(nodes), x_spacing), nodes):
			pos[node] = (x, y_pos)

	# Sources always sit on the bottom row with their own spacing.
	for x, src in zip(_centered_positions(len(source_nodes), source_x_spacing), source_nodes):
		pos[src] = (x, 0)

	# Destinations are moved one row above the highest position assigned so far.
	if destination_nodes:
		max_y = max(y for _, y in pos.values()) if pos else 0
		top_y = max_y + y_spacing
		for x, dest in zip(_centered_positions(len(destination_nodes), dest_x_spacing), destination_nodes):
			pos[dest] = (x, top_y)

	return pos

In [None]:
def hop_and_time_summary(all_results):
	"""Print per-destination averages over every loaded result.

	For each destination this accumulates the raw hop count, the hop count
	after filter_hop_map(), the traceroute time (TIMEOUT_PER_TRACEROUTE when
	no timing is recorded) and the number of successful runs, then divides by
	the number of results that contain the destination. The table is printed
	sorted by average hop count, highest first. Returns None.
	"""
	sources = set()
	dest_names = set()
	summary = {}

	for result in all_results:
		sources.add(result['source_location'])
		timings = result['timings']
		route_map = result['route_map']
		statuses = result['statuses']
		kept_routes = filter_hop_map(route_map)

		for dest, hop_list in route_map.items():
			dest_names.add(dest)
			row = summary.setdefault(
				dest,
				{"hop_count": 0, "filtered_hop_count": 0, "time": 0, "success": 0, "count": 0},
			)
			row["hop_count"] += len(hop_list)
			row["filtered_hop_count"] += len(kept_routes.get(dest, []))
			if statuses.get(dest, False):
				row["success"] += 1
			row["time"] += timings.get(dest, TIMEOUT_PER_TRACEROUTE)
			row["count"] += 1

	# Turn the running totals into per-run averages.
	for row in summary.values():
		runs = row["count"]
		for key in ("hop_count", "filtered_hop_count", "time"):
			row[key] /= runs
		row["success_rate"] = row["success"] / runs

	print(f"\nSummary across {len(sources)} sources and {len(dest_names)} destinations:")
	print(f"Total unique sources: {len(sources)}" + " (" + ", ".join(sorted(sources)) + ")")
	print(f"Total unique destinations: {len(dest_names)}")
	print(f"Total traceroute results analyzed: {len(all_results)}")

	# One row per destination, highest average hop count first.
	table = pd.DataFrame.from_dict(summary, orient='index').sort_values(by="hop_count", ascending=False)
	pd.set_option('display.max_columns', None)
	print(table)

In [None]:
"""Load all results and create individual visualizations"""
# Read every saved result file; stop here when there is nothing to analyze.
all_results = load_all_results()
if len(all_results) == 0:
	print("No results found to analyze!")
	raise ValueError("No results found")

# Per-destination hop/time overview across all sources.
hop_and_time_summary(all_results)

In [None]:
# Per-source report: list the filtered routes of every loaded result.
for i, result in enumerate(all_results, 1):
	source_ip = result['source_ip']
	source_location = result['source_location']
	timestamp = result['timestamp']
	route_map = result['route_map']
	route_status = result['statuses']

	print(f"\n{'='*60}")
	print(f"RESULT {i}/{len(all_results)}: Source IP {source_ip} ({timestamp})")
	print(f"{'='*60}")

	# Drop hops that carry no host, IP or RTT before reporting.
	print("Filtering None hops from routes...")
	filtered_route_map = filter_hop_map(route_map)

	if not filtered_route_map:
		print("No valid routes after filtering!")
		raise ValueError("No valid routes after filtering")

	print(f"Valid routes after filtering: {len(filtered_route_map)}")
	print("\nHop counts per destination:")
	timings = result.get('timings', {})
	for dest, hops in filtered_route_map.items():
		timing = timings.get(dest, 0)
		print(f"  {dest:35s}  hops: {len(hops):2d}  time: {timing:.3f}s")

# Merge the routes of all sources into one graph.
print("Building graph...")
G = build_unified_graph(all_results)

# Calculate metrics
metrics = graph_descriptors(G)
print("\nNetwork metrics for unified graph:")
for k, v in metrics.items():
	if isinstance(v, float):
		print(f"  {k:18s}: {v:.4f}")
	elif isinstance(v, list):
		# The per-node centrality lists hold one value per node; print only
		# their size here, their statistics are reported by the analysis below.
		print(f"  {k:18s}: {len(v)} values (summarized below)")
	else:
		print(f"  {k:18s}: {v}")

# Centrality correlation analysis
cc_values = metrics['closeness_centrality']
bc_values = metrics['betweenness_centrality']
centrality_correlation_analysis(cc_values, bc_values, figsize=(12, 10))

# Node type analysis
node_types = Counter()
for node, data in G.nodes(data=True):
	node_types[data.get('node_type', 'unknown')] += 1

print("\nNode type distribution:")
for node_type, count in node_types.most_common():
	print(f"  {node_type:12s}: {count}")

# Visualize the unified graph
print("\nVisualizing unified graph ...")
visualize_graph(G, figsize=(20, 12),
	save_path=f"{VISUALIZATIONS_FOLDER}/graph_unified.pdf", is_show=True)

print(f"\n{'='*60}")
print(f"ANALYSIS COMPLETE - Processed {len(all_results)} source(s)")
print(f"{'='*60}")