In [None]:
import numpy as np
import networkx as nx
import requests
import gzip
import shutil
import os

# Where the dataset is fetched from; both local file names are derived from it
url = "https://snap.stanford.edu/data/web-Stanford.txt.gz"
# Local archive name: the last path component of the URL
filename = url.rsplit("/", 1)[-1]
# Decompressed file name: the archive name with its trailing ".gz" dropped
extracted_file = os.path.splitext(filename)[0]

# Download dataset if not already downloaded.
# The archive is streamed into a temporary ".part" file and only renamed to
# `filename` once the transfer has finished, so an interrupted download is
# never mistaken for a complete one by the existence check on a later run.
if not os.path.exists(filename):
    print("Downloading dataset...")
    partial_download = filename + ".part"
    # stream=True avoids loading the whole archive into memory; the context
    # manager releases the connection even if the copy fails. The timeout
    # stops an unresponsive server from hanging the run indefinitely.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the dataset
        response.raise_for_status()
        with open(partial_download, "wb") as f:
            shutil.copyfileobj(response.raw, f)
    # Publish the finished file under its final name in one step
    os.replace(partial_download, filename)
    print("Download complete.")

# Extract file if not already extracted.
# Decompress into a temporary ".part" file first and rename it afterwards, so
# a failed or interrupted extraction does not leave a truncated
# `extracted_file` that later runs would accept as complete.
if not os.path.exists(extracted_file):
    print("Extracting dataset...")
    partial_extract = extracted_file + ".part"
    with gzip.open(filename, "rb") as f_in, open(partial_extract, "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)
    # Publish the fully decompressed file under its final name
    os.replace(partial_extract, extracted_file)
    print("Extraction complete.")

# Load dataset into NetworkX graph.
# Each data line holds two whitespace-separated integer node ids,
# "<source> <destination>", which become one directed edge.
G = nx.DiGraph()
with open(extracted_file, "r") as file:
    for line in file:
        fields = line.split()
        # Skip blank lines (e.g. a trailing newline) and "#" comment lines;
        # a blank line would otherwise crash the two-value unpack below.
        if not fields or fields[0].startswith("#"):
            continue
        src, dest = map(int, fields)  # Read edge endpoints
        G.add_edge(src, dest)

print(f"Graph loaded with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# Score every node with NetworkX's PageRank (alpha is the damping parameter)
pagerank_scores = nx.pagerank(G, alpha=0.85)

# Order node ids from highest to lowest score, then keep the first ten as
# (node, score) pairs
by_score_desc = sorted(pagerank_scores, key=pagerank_scores.get, reverse=True)
top_pages = [(page, pagerank_scores[page]) for page in by_score_desc[:10]]

# Report the leaders, one page per line, with scores shown to six decimals
print("\nTop 10 pages by PageRank:")
for node, rank in top_pages:
    print(f"Page {node}: {rank:.6f}")


Downloading dataset...
Download complete.
Graph loaded with 281903 nodes and 2312497 edges.

Top 10 pages by PageRank:
Page 89073: 0.010006
Page 226411: 0.009682
Page 241454: 0.008900
Page 134832: 0.005894
Page 69358: 0.003399
Page 67756: 0.003208
Page 105607: 0.002925
Page 225872: 0.002873
Page 234704: 0.002871
Page 186750: 0.002846
