In [None]:
# Always run this code.
%config InteractiveShell.ast_node_interactivity="none"
import sys
if 'google.colab' in sys.modules:
  !pip install --force-reinstall git+https://github.com/jamcoders/jamcoders-public-2025.git --quiet
from jamcoders.base_utils import *
from jamcoders.week4.labw4d3a import *

import matplotlib.pyplot as plt
import numpy as np
import random

!pip install networkx matplotlib --quiet
!pip install pydot --quiet

# Week 4, Day 3A: Co-Occurrence Graphs

## You're a wizard, Harry!

Just run through these cells.

In the function `get_harry_potter_data`, we split *Harry Potter and the Sorcerer's Stone* into sentences for you. Here are the first 10 sentences!

In [None]:
# Load the book, already split into sentences.
sentences = get_harry_potter_data()

# Preview the first 10 sentences, each prefixed with its index.
for i in range(10):
  print(i, sentences[i])

How many sentences are in the Harry Potter dataset?

In [None]:
print(len(sentences))

Here's a list of Harry Potter characters called `characters`.

In [None]:
characters = [
    'Harry',
    'Ron',
    'Hermione',
    'Mr Dursley',
    'Mrs Dursley',
    'Dumbledore',
    'Hagrid',
    'Snape',
    'McGonagall',
    'Voldemort',
    'Dudley',
    'Draco',
    'Quirrell',
    'Hedwig',
    'Lily',
    'James',
    'Filch',
    'Neville',
    'Crabbe',
    'Goyle',
]

## Co-Occurrence Functions

Create a function called `has_both_unique` that checks if two unique characters, `char1` and `char2`, are in the same `sentence`.

**NOTE**: this should be false if the first character is the same as the second character!


In [None]:
def has_both_unique(sentence, char1, char2):
  """
  Check whether two *different* character names both appear in the sentence.

  Parameters:
      sentence (str): The sentence to search within.
      char1 (str): The first character's name.
      char2 (str): The second character's name.

  Returns:
      bool: True if char1 and char2 are different names and both appear in
      the sentence. False if either name is missing, or if char1 and char2
      are the same name (even when that name is in the sentence).
  """
  # YOUR CODE HERE

assert_equal(got=has_both_unique('Do Hanna and Lydia look alike?', 'Hanna', 'Lydia'), want=True)
assert_equal(got=has_both_unique('Seriously, do Hanna and Lydia look alike?', 'Lydia', 'Hanna'), want=True)
assert_equal(got=has_both_unique('Xavier will give Joy five eggs.', 'Xavier', 'Joy'), want=True)
assert_equal(got=has_both_unique('Xavier will give Joy five eggs.', 'Xavier', 'Sam'), want=False)
assert_equal(got=has_both_unique('Xavier will give Joy five eggs.', 'Santa Claus', 'Joy'), want=False)
assert_equal(got=has_both_unique('Xavier will give Joy five eggs.', 'Santa Claus', 'Winnie the Pooh'), want=False)
assert_equal(got=has_both_unique('Bruno likes to run', 'Bruno', 'Bruno'), want=False)

Write a function called `create_key` that returns a tuple with the `char1` and `char2` in alphabetical order.

Remember, a tuple called `fruit` with `"apple"` and `"banana"` can be initialized like so: `fruit = ("apple", "banana")`.

*Hint: how do string comparisons with `>`, `<`, etc. work?*

In [None]:
print("a" < "b")
print("Ron" < "Harry")

In [None]:
def create_key(char1, char2):
  """
  Return a tuple containing char1 and char2 in alphabetical order.

  Parameters:
      char1 (str): The first character's name.
      char2 (str): The second character's name.

  Returns:
      tuple: A 2-tuple holding both names, with the name that compares
      smaller (using `<` on strings) first. For example,
      create_key("Ron", "Harry") gives ("Harry", "Ron").

  Notes:
      This ensures consistent key ordering for use in dictionaries,
      such as for co-occurrence counting: the same pair always maps to
      the same key no matter which order the names are passed in.
  """
  # YOUR CODE HERE

assert_equal(got=create_key("Harry", "Ron"), want=("Harry", "Ron"))
assert_equal(got=create_key("Ron", "Harry"), want=("Harry", "Ron"))

Create a function called `add_cooccurrence` that updates a dictionary called `data` if `char1` and `char2` co-occur in the `sentence`.

* This function uses `has_both_unique` to determine if two characters co-occur.
* When this happens, use `create_key` to update the dictionary.

**NOTE**: the function should modify `data` directly. It doesn't need to return `data`.

In [None]:
def add_cooccurrence(sentence, char1, char2, data):
    """
    Update the co-occurrence dictionary if both characters appear in the sentence.

    Parameters:
        sentence (str): The sentence to check for character mentions.
        char1 (str): The first character's name.
        char2 (str): The second character's name.
        data (dict): A dictionary where keys are (char1, char2) tuples and values are co-occurrence counts.
            It is modified in place.

    Returns:
        None. The result is recorded by updating `data` directly.

    Notes:
        - The order of char1 and char2 in the key is alphabetically sorted.
        - The function increments the count for the pair if both names appear in the sentence;
          a pair seen for the first time starts at 1.
        - Nothing changes when the two names do not co-occur (including when char1 == char2).
    """
    # YOUR CODE HERE


data = {}

add_cooccurrence('Bruno likes to run.', 'Bruno', 'Bruno', data)
assert_equal(got=data, want={})

add_cooccurrence('Do Hanna and Lydia look alike?', 'Hanna', 'Lydia', data)
assert_equal(got=data, want={('Hanna', 'Lydia'): 1})

add_cooccurrence('Seriously, do Hanna and Lydia look alike?', 'Lydia', 'Hanna', data)
assert_equal(got=data, want={('Hanna', 'Lydia'): 2})

add_cooccurrence('Xavier will give Joy five eggs.', 'Xavier', 'Sam', data)
assert_equal(got=data, want={('Hanna', 'Lydia'): 2})

add_cooccurrence('Xavier will give Joy five eggs.', 'Xavier', 'Joy', data)
assert_equal(got=data, want={('Hanna', 'Lydia'): 2, ('Joy', 'Xavier'): 1})


Create a function called `count_cooccurrence` that counts the number of times two characters appear together in a list of sentences, then returns the counts in a dictionary.

When you're done, the dictionary should follow the formatting below:

```
{
  ("Orr", "Zaria"): 2,
  ("Hanna", "Lydia"): 5,
  ("Bruno", "Frank"): 1,
}
```

*Here's some pseudocode:*

* Initialize a dictionary for storing the co-occurrence counts.
* Loop through the sentences.
  * Outer loop through characters.
    * Inner loop through characters. **Do not repeat the same pair!**
      * Call `add_cooccurrence` with the correct inputs.
* Return the dictionary.

*Hint: if you check for Harry and Ron, do you still need to check for Ron and Harry?*


In [None]:
def count_cooccurrence(sentences, characters):
  """
  Computes the number of co-occurrences of characters in a list of sentences.

    Args:
        sentences (list(str)): the list of sentences to search.
        characters (list(str)): the list of characters to search for.

    Returns (dict((str, str): int)): The mapping of character-pairs to the number of
        sentences in which they co-occur. Each pair appears under a single key
        (names in alphabetical order), and pairs that never co-occur are absent.
  """

  # YOUR CODE HERE


assert_equal(got=count_cooccurrence([], []), want={})
assert_equal(got=count_cooccurrence(['Bruno likes to run.'], ['Bruno', 'Hanna', 'Lydia']), want={})
assert_equal(got=count_cooccurrence(['Bruno likes to run.', 'Do Hanna and Lydia look alike?', 'Seriously, do Hanna and Lydia look alike?'], ['Bruno', 'Hanna', 'Lydia']), want={('Hanna', 'Lydia'): 2})
assert_equal(got=count_cooccurrence(['A B C D E F G', 'A B B A'], ['A', 'B', 'C', 'X', 'Y', 'Z']), want={('A', 'B'): 2, ('B', 'C'): 1, ('A', 'C'): 1})
assert_equal(got=count_cooccurrence([], []), want={})

Call the function `count_cooccurrence` on the Harry Potter sentences and characters.

In [None]:
cooccurrence_data = count_cooccurrence(sentences, characters)

Visualize the co-occurrences!

In [None]:
visualize_bar_chart(cooccurrence_data)

Who is Harry's bestie?

In [None]:
# YOUR ANSWER HERE

## Graphs

Now we can store the information in a graph! How would you encode the characters and the co-occurrences?

In [None]:
# DO NOT CHANGE THE CODE BELOW

import networkx as nx
import matplotlib.pyplot as plt

def graph_cooccurrence(data):
  """
  Builds a weighted networkx graph from a co-occurrence dictionary.

    Args:
        data (dict((str, str): int)): The mapping of character-pairs to the number of times they co-occur.

    Returns (networkx.Graph): A graph with one edge per key of `data`; each edge
        stores its count in the `weight` attribute.
  """

  graph = nx.Graph()
  # Every key is a pair of node labels; add_edge creates any missing nodes.
  for (first, second), count in data.items():
      graph.add_edge(first, second, weight=count)
  return graph

G = graph_cooccurrence(cooccurrence_data)


Here's some information about the graph.

In [None]:
# This should be 20
print(G.number_of_nodes())

In [None]:
# This should be 90
print(G.number_of_edges())

In [None]:
print(G['Hedwig'])

In [None]:
print(len(G['Harry']))

**Think about it**: If there are $20$ nodes and Harry has $19$ neighbors, what does this mean? Does this match your expectations?

In [None]:
# Write down any thoughts

Let's explore displaying our graph to learn about our characters. We've written some code for the visualization given the co-occurrence graph `G` and integer `k`. Pay attention to this mysterious parameter `k`...

In [None]:
visualize_cooccurence(G, k=1)

In [None]:
visualize_cooccurence(G, k=5)

In [None]:
visualize_cooccurence(G, k=9)

What do you observe when `k` increases? What quality of the graph changes? Just write down your thoughts, and ask a TA if you have any questions!

In [None]:
# YOUR COMMENT HERE

## BFS

In [None]:
# DO NOT CHANGE THE CODE BELOW

def init_q(lst=None):
    """Constructs a new queue, optionally pre-filled with initial elements.

    Arguments:
        lst (Optional[list]): Initial elements of the queue. When omitted
            (or None), the new queue starts out empty.
    Returns (Queue): The new queue. When `lst` is given, the queue is a
        shallow copy of it, so the caller's list is left untouched.
    Effects: None.
    """
    # Slice-copy so later queue operations never mutate the argument.
    return [] if lst is None else lst[:]


def enqueue_q(queue, elem):
    """Adds an element to the rear of the queue.

    Arguments:
        queue (Queue): The queue to which the element should be added.
        elem (Any): The element to be added to the queue.
    Returns: None.
    Effects: Modifies `queue` by adding the new element.
    """
    # The rear of the queue is the end of the underlying sequence.
    queue.append(elem)


def dequeue_q(queue):
    """Removes the element from the front of the queue and returns it.

    queue must not be empty.

    Arguments:
        queue (Queue): The queue from which the front element should be removed.
    Returns (any): The front element in the queue.
    Raises: IndexError if a list-backed queue is empty.
    Effects: The front element is removed from the queue.
    """
    # The front of the queue is index 0. On a Python list, pop(0) shifts every
    # remaining element, so each dequeue costs time proportional to the queue length.
    return queue.pop(0)


def peek_q(queue):
    """Returns the element at the front of the queue, without removing it.

    queue must not be empty.

    Arguments:
        queue (Queue): The queue from which the front element should be returned.
    Returns (any): The front element in the queue.
    Raises: IndexError if a list-backed queue is empty.
    Effects: None
    """
    # The front of the queue is index 0 (the same element dequeue_q would remove).
    return queue[0]


def is_empty_q(queue):
    """Determines whether or not the queue is empty.

    Arguments:
        queue (Queue): The queue to be checked.
    Returns (bool): True if the queue holds no elements, False otherwise.
    Effects: None
    """
    return len(queue) == 0

Previously, we wrote code to check if two nodes are connected in a graph using BFS. Now, we want to see the minimum path length between two characters using `path_length_bfs`.

While you may have completed this before in w4d1b, **parts of the code will be different now that the nodes are not numerical.**

Here are some helpful examples of graph utilities you can use for `G`.

In [None]:
# print the type of G
print("Type of G:", type(G))
print()

# print all the nodes in G, and how many there are
print("Nodes in G:", G.nodes)
print("Number of nodes in G:", len(G))
print()

# check whether a node is in G
print('Is Harry in co-occurrence graph G?', 'Harry' in G)
print('Is Orr in co-occurrence graph G?', 'Orr' in G)
print()

# get the neighbors of a node in G
print("Neighbors of Hedwig in co-occurrence graph G:", G['Hedwig'])

**READ THE PSEUDOCODE BELOW CAREFULLY**.

* Initialize a dictionary for `visited`. *What are the types of the keys and values? What default values make sense?*
* Initialize a dictionary for `distances`. *What are the types of the keys and values? What default values make sense?*
* If the `start_vertex` or `end_vertex` is not in `G`, return $-1$.
* Initialize `queue` with the `start_vertex` and update `visited`.
* While there are still items in the `queue`:
  * remove ```current_vertex``` from the `queue`
  * loop over all `current_vertex`'s neighbors:
    * if the `neighbor` has not yet been visited:
      * add it to the `queue`
      * update `visited`
      * update `distances` for the `neighbor` based on the distance between `start_vertex` and `current_vertex`
* Return the distance to the `end_vertex`


In [None]:
def path_length_bfs(G, start_vertex, end_vertex):
    """ returns the minimum path length between start_vertex and end_vertex.
    Inputs:
        G: The co-occurrence graph
            type: networkx.Graph
        start_vertex: A node label (for example, a character's name)
            type: str
        end_vertex: Another node label
            type: str
    Returns:
        The number of edges on a shortest path from start_vertex to
        end_vertex, or -1 if start_vertex or end_vertex is not in G.
            type: int
    """

    # YOUR CODE HERE

In [None]:
assert_equal(got=path_length_bfs(G, 'Harry', 'Ron'), want=1)
assert_equal(got=path_length_bfs(G, 'Hedwig', 'Voldemort'), want=2)
assert_equal(got=path_length_bfs(G, 'Harry', 'Orr'), want=-1)
assert_equal(got=path_length_bfs(G, 'Sam', 'Dumbledore'), want=-1)
assert_equal(got=path_length_bfs(G, 'Sam', 'Orr'), want=-1)

Run the code below to see the two characters that are the furthest away from each other!

In [None]:
def get_max_degrees(G, characters):
  """
  Scan every pair of characters and report the pair that is farthest apart.

  Parameters:
      G (networkx.Graph): The graph where nodes are characters and edges represent connections.
      characters (list): A list of character names (node labels) to consider.

  Returns:
      tuple: A tuple (max_distance, (char1, char2)) where:
          - max_distance (int): The largest value path_length_bfs returned for any pair.
          - (char1, char2) (tuple): The first pair (in list order) that reached that value.
      If no pair has a distance greater than 0, the result is (0, None).
  """
  longest = 0
  farthest_pair = None
  for i, char1 in enumerate(characters):
    # Only look at names after position i so each pair is checked once.
    for char2 in characters[i + 1:]:
      if char1 == char2:
        continue  # a duplicated name in the list is not a real pair
      d = path_length_bfs(G, char1, char2)
      # Strictly greater: ties keep the earlier pair, and -1 results never win.
      if d > longest:
        longest, farthest_pair = d, (char1, char2)
  return longest, farthest_pair

# Store the result under a name other than `max`: assigning to `max` at notebook
# level would hide Python's built-in max() in every cell that runs afterwards.
max_distance, chars = get_max_degrees(G, characters)
print(max_distance, chars)

What is the runtime of this algorithm? *Hint: what is $N$?*

In [None]:
# YOUR ANSWER HERE

Check your answer with a TA!

## Congratulations on completing this lab! Here's some extra exploration :)

## Side Characters

Harry, Ron, and Hermione were eaten by a hippogriff 💔 Only the characters below are left.

In [None]:
side_characters = [
    'Dumbledore',
    'Hagrid',
    'Snape',
    'Mr Dursley',
    'Mrs Dursley',
    'McGonagall',
    'Voldemort',
    'Dudley',
    'Draco',
    'Quirrell',
    'Hedwig',
    'Lily',
    'James',
    'Filch',
    'Neville',
    'Crabbe',
    'Goyle',
]

Here are the functions you have access to:


```python
def count_cooccurrence(sentences, characters):
  """
  Computes the number of co-occurrences of characters in a list of sentences.

    Args:
        sentences (list(str)): the list of sentences to search.
        characters (list(str)): the list of characters to search for.

    Returns (dict((str, str): int)): The mapping of character-pairs to the number of times they co-occur.
  """

def graph_cooccurrence(data):
  """
  Returns the co-occurrences in graph form.

    Args:
        data (dict((str, str): int)): The mapping of character-pairs to the number of times they co-occur.
    
    Returns (networkx.Graph): The graph of co-occurrences.
  """

def visualize_cooccurence(graph, k):
  """
  Visualize the graph.

      Args:
          graph (networkx.Graph): The graph of co-occurrences.
          k (int): mystery parameter!
  """

def get_max_degrees(G, characters):
  """
  Find the pair of characters in the graph with the maximum shortest path length between them.

      Args:
          G (networkx.Graph): The graph where nodes are characters and edges represent connections.
          characters (list): A list of character names (node labels) to consider.

      Returns:
          tuple: A tuple (max_distance, (char1, char2)) where:
              - max_distance (int): The longest shortest-path length between any two characters.
              - (char1, char2) (tuple): The character pair with this maximum path length.
  """

```

With these functions, try visualizing the graph of side characters!

In [None]:
# YOUR CODE HERE

# Try setting k=1 at first

In [None]:
# YOUR CODE HERE

# Try setting k=3 now

In [None]:
# YOUR CODE HERE

# Try getting the maximum path length between side characters!

## Let's make it unweighted!



First, we'll visualize the bar chart again.

In [None]:
visualize_bar_chart(cooccurrence_data)

There are several different levels of co-occurrence. What if we want only close friends, or people who know each other? Instead of a **weighted** graph, we'd want an **unweighted** graph. We can keep only the edges whose co-occurrence count is greater than a pre-determined `threshold`.

What would be a good cutoff? Try changing the `threshold` for the number of co-occurrences based on the bar chart above.

In [None]:
threshold = ... # YOUR CODE HERE

def prune_data(data, threshold):
  """
  Turns co-occurrence counts into 0/1 flags using a cutoff.

    Args:
        data (dict): The mapping of edges (character-pairs) to their co-occurrence counts.
        threshold: The cutoff; only counts strictly greater than it are flagged 1.

    Returns (dict): A new dictionary with the same keys as `data`, in the same order.
        Each value is 1 if the original count is greater than `threshold`, otherwise 0.
        No key is dropped: edges at or below the threshold stay in the result with value 0.
  """
  return {
      edge: (1 if count > threshold else 0)
      for edge, count in data.items()
  }

pruned_data = prune_data(cooccurrence_data, threshold)
G_pruned = graph_cooccurrence(pruned_data)
visualize_cooccurence(G_pruned, k=3, weighted=False)

## Congrats on finishing the notebook! You got this 😀