# SDSC3001 - Assignment 2

![Assignment_2](./SDSC3001%20-%20Assignment%202.png)

# Question 3

In [1]:
def load_graph(file_path):
    edges = []
    with open(file_path, "r") as file:
        for line in file:
            if line.startswith("#"):
                continue  # Skip comment lines
            parts = line.strip().split()
            if len(parts) == 2:
                from_node, to_node = map(int, parts)
                edges.append((from_node, to_node))
    return edges

In [2]:
file_path = "com-dblp.txt"
graph_edges = load_graph(file_path)

### Question 3.0

In [3]:
import numpy as np


def power_iteration_pagerank(graph_edges, alpha=0.15, tolerance=1e-9):
    # Get unique nodes in the graph to determine the number of nodes (n)
    nodes = set()
    for edge in graph_edges:
        nodes.update(edge)
    nodes = list(nodes)
    n = len(nodes)

    # Initialize the PageRank vector
    pi = np.full(n, 1 / n)
    prev_pi = np.zeros(n)

    # Create a mapping from node ID to index and vice versa
    node_to_index = {node: index for index, node in enumerate(nodes)}
    index_to_node = {index: node for index, node in enumerate(nodes)}

    # Create an adjacency list from the graph edges
    adjacency_list = {node: [] for node in nodes}
    for from_node, to_node in graph_edges:
        adjacency_list[from_node].append(to_node)

    # Perform power iterations
    iteration_count = 0
    while np.max(np.abs(pi - prev_pi)) > tolerance:
        prev_pi = pi.copy()
        pi = np.zeros(n)

        for node, index in node_to_index.items():
            outbound_neighbors = adjacency_list[node]
            if len(outbound_neighbors) == 0:
                continue

            for neighbor in outbound_neighbors:
                neighbor_index = node_to_index[neighbor]
                pi[neighbor_index] += prev_pi[index] / len(outbound_neighbors)

        pi = (1 - alpha) * pi + alpha / n
        iteration_count += 1

    print(f"Converged in {iteration_count} iterations.")

    # Convert the PageRank vector back to a dictionary with node IDs as keys
    pagerank_dict = {index_to_node[i]: rank for i, rank in enumerate(pi)}

    return pagerank_dict


In [4]:
power_iter_pagerank = power_iteration_pagerank(graph_edges, alpha=0.15)

Converged in 17 iterations.


### Question 3.1

Random walk from Assignment 1

```python
import random


def simulate_random_walk(graph_edges, num_steps, seed=42):
    random.seed(seed)

    neighbors = {}
    for from_node, to_node in graph_edges:
        if from_node not in neighbors:
            neighbors[from_node] = []
        if to_node not in neighbors:
            neighbors[to_node] = []
        neighbors[from_node].append(to_node)
        neighbors[to_node].append(from_node)

    current_node = random.choice(list(neighbors.keys()))
    visit_counts = {node: 0 for node in neighbors.keys()}

    for _ in range(num_steps):
        visit_counts[current_node] += 1
        current_node = random.choice(neighbors[current_node])

    return visit_counts
```

In [5]:
import random
from collections import defaultdict


def monte_carlo_pagerank(graph_edges, alpha, num_walks):
    # Get unique nodes in the graph to determine the number of nodes (n)
    nodes = set()
    for edge in graph_edges:
        nodes.update(edge)
    nodes = list(nodes)
    
    # Create a mapping from node ID to index and vice versa
    node_to_index = {node: index for index, node in enumerate(nodes)}
    
    # Create an adjacency list from the graph edges
    adjacency_list = {node: [] for node in nodes}
    for from_node, to_node in graph_edges:
        adjacency_list[from_node].append(to_node)

    f_v = defaultdict(int)
    
    for _ in range(num_walks):
        start_node = random.choice(nodes)
        current_node = start_node
        
        while True:
            if random.random() < alpha:
                # Stop the random walk with probability alpha
                break
            
            neighbors = adjacency_list[current_node]
            if not neighbors:
                break  # stop if there are no neighbors to move to
            
            # Move to a random neighbor with probability 1-alpha
            current_node = random.choice(neighbors)
        
        f_v[current_node] += 1
    
    # Estimate PageRank values using f_v counts
    est_pagerank = np.zeros(len(nodes))
    for node, count in f_v.items():
        index = node_to_index[node]
        est_pagerank[index] = count / num_walks
    
    return est_pagerank

In [6]:
def compute_difference(power_iteration_pi, monte_carlo_est):
    difference = np.sum(np.abs(power_iteration_pi - monte_carlo_est))
    return difference

In [7]:
nodes = list(power_iter_pagerank.keys())
power_iter_pi = np.array([power_iter_pagerank[node] for node in nodes])


n = len(nodes)
results = {}

for M in [2 * n, 4 * n, 6 * n, 8 * n, 10 * n]:
    monte_carlo_est = monte_carlo_pagerank(graph_edges, alpha=0.15, num_walks=M)
    difference = compute_difference(power_iter_pi, monte_carlo_est)
    results[M] = difference
    print(f'M = {M}, Difference = {difference}')

M = 634160, Difference = 0.9007271828211119
M = 1268320, Difference = 0.8402100561294962
M = 1902480, Difference = 0.8173469649443026
M = 2536640, Difference = 0.8060844092001701
M = 3170800, Difference = 0.7968972425552688


### Question 3.2

In [8]:
def monte_carlo_pagerank_new(graph_edges, alpha, num_walks):
    # Get unique nodes in the graph to determine the number of nodes (n)
    nodes = set()
    for edge in graph_edges:
        nodes.update(edge)
    nodes = list(nodes)
    
    # Number of nodes in the graph
    n = len(nodes)
    
    # Create a mapping from node ID to index and vice versa
    node_to_index = {node: index for index, node in enumerate(nodes)}

    # Create an adjacency list from the graph edges
    adjacency_list = {node: [] for node in nodes}
    for from_node, to_node in graph_edges:
        adjacency_list[from_node].append(to_node)

    # Initialize count of each node appearances
    s_v = defaultdict(int)
    
    # Perform Monte Carlo walks
    for _ in range(num_walks):
        start_node = random.choice(nodes)
        current_node = start_node
        
        while True:
            # Increment count for the current node
            s_v[current_node] += 1

            if random.random() < alpha:
                # Stop the random walk with probability alpha
                break
            
            neighbors = adjacency_list.get(current_node, [])
            if not neighbors:
                break  # Stop if there are no neighbors to move to
            
            # Move to a random neighbor with probability 1-alpha
            current_node = random.choice(neighbors)
    
    # Calculate the refined PageRank estimates
    est_pagerank = np.zeros(n)
    for node in nodes:
        index = node_to_index[node]
        est_pagerank[index] = (alpha * s_v[node]) / num_walks
    
    return est_pagerank, nodes

In [9]:
def compute_difference_new(edges, alpha, n, m_values):
    differences = {}
    for m_multiplier in m_values:
        num_walks = m_multiplier * n
        est_pagerank, nodes = monte_carlo_pagerank_new(edges, alpha, num_walks)

        diff_sum = 0
        for i in range(len(est_pagerank)):
            true_pagerank = 1 / len(nodes)
            diff_sum += abs(est_pagerank[i] - true_pagerank)
        
        differences[m_multiplier] = diff_sum
        print(f"M = {num_walks}, Sum of differences: {diff_sum:.4f}")
    
    return differences

In [10]:
alpha = 0.15
nodes_count = len(set(edge[0] for edge in graph_edges).union(set(edge[1] for edge in graph_edges)))

m_values = [2, 4, 6, 8, 10]

difference_sums = compute_difference_new(graph_edges, alpha, nodes_count, m_values)

M = 634160, Sum of differences: 0.7303
M = 1268320, Sum of differences: 0.7292
M = 1902480, Sum of differences: 0.7284
M = 2536640, Sum of differences: 0.7285
M = 3170800, Sum of differences: 0.7285


### Question 3.3

Thus, $\frac{\alpha s_v}{M}$ is an unbiased estimator for $\pi_v$ because the expected value of our PageRank estimate equals the true PageRank $\pi_v$. The randomness of the starting node and the walk leads to a fair distribution of visits.