In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from bs4 import BeautifulSoup as bs
import json


In [None]:
# Start page of the crawl; the driver cell passes it to get_href both as the page
# to fetch and as the prefix that is prepended to site-relative links.
website = "https://pypi.org"

In [None]:
class Tree:
  """Node of a simple n-ary tree that keeps a back-reference to its parent.

  Invariant maintained by add_child/remove_child: a node appears in the
  ``children`` list of exactly the node stored in its ``parent`` attribute.
  """

  def __init__(self, value):
    """Create a detached node holding ``value`` with no parent and no children."""
    self.value = value
    # Child nodes in insertion order.
    self.children = []
    # Owning node, or None while this node is a root / detached.
    self.parent = None

  def add_child(self, child):
    """Attach ``child`` under this node as its last child.

    If ``child`` is already attached somewhere (including to this node) it is
    detached first, so it never ends up listed under two parents or twice
    under the same parent.
    """
    previous_parent = child.parent
    if previous_parent is not None:
      previous_parent.remove_child(child)
    child.parent = self
    self.children.append(child)

  def remove_child(self, child):
    """Detach ``child`` from this node; does nothing if it is not a child."""
    if child in self.children:
      child.parent = None
      self.children.remove(child)

  def __repr__(self):
    return f'TreeNode({self.value})'

  def __str__(self, level=0):
    """Return one line per node (pre-order), each prefixed with its depth.

    ``level`` is the depth of this node and is only meant to be supplied by
    the recursive calls.
    """
    ret = " " + str(level) + repr(self) + "\n"
    for child in self.children:
      ret += child.__str__(level + 1)
    return ret

# Get the list of all packages


In [None]:
def build_graph(node, graph, pos=None, level=0, x=0, width=1., vert_gap = 0.2):
  """Add the subtree rooted at ``node`` to ``graph`` and compute a layered layout.

  Args:
    node: object with ``value`` and ``children`` attributes (e.g. a Tree).
    graph: graph object exposing ``add_node`` and ``add_edge``; it is mutated.
    pos: dict to fill with ``value -> (x, y)``; a new dict is created when
      omitted. A dict passed in by the caller is updated in place.
    level: y coordinate assigned to ``node``.
    x: x coordinate assigned to ``node``.
    width: horizontal span available to this node's children.
    vert_gap: vertical distance between a node and its children.

  Returns:
    The ``pos`` dict. Nodes are keyed by ``value``, so nodes sharing a value
    share one graph node and one position (the last one written wins).
  """
  # A `{}` default would be created once and shared by every call, leaking
  # positions from previously drawn trees into later ones.
  if pos is None:
    pos = {}
  # Register the node explicitly so a tree without edges still has a node.
  graph.add_node(node.value)
  pos[node.value] = (x, level)
  child_count = len(node.children)
  for i, child in enumerate(node.children):
    slot_width = width / child_count
    # Centre each child inside its own equal-width slot below the parent.
    child_x = x - width/2. + (i+0.5)*slot_width
    graph.add_edge(node.value, child.value)
    # Give the child exactly its slot so sibling subtrees cannot overlap, and
    # forward vert_gap so a custom spacing applies at every depth.
    build_graph(child, graph, pos=pos, level=level-vert_gap, x=child_x,
                width=slot_width, vert_gap=vert_gap)
  return pos

def draw_tree(root):
  """Plot the tree rooted at ``root`` as a labelled diagram without arrowheads."""
  tree_graph = nx.DiGraph()
  # build_graph fills the graph with edges and hands back the node coordinates.
  layout = build_graph(root, tree_graph)
  nx.draw(tree_graph, layout, arrows=False, with_labels=True)
  plt.show()

In [None]:
def get_href(root, website,idx=0,max_depth=2,verbose=False):
  """Recursively crawl ``website`` and return a Tree of the site-relative links.

  Args:
    root: URL prefix prepended to every site-relative link ("/path").
    website: URL of the page to fetch at this step.
    idx: current recursion depth (0 for the initial call).
    max_depth: pages at depth ``>= max_depth`` are not fetched.
    verbose: when true, print each URL before it is fetched.

  Returns:
    A Tree whose value is ``website`` and whose children are the trees of the
    links found on that page, or None when the depth limit is reached or the
    page could not be fetched. A link that was found but not expanded (depth
    limit or failed fetch) is still kept as a childless node.
  """
  if idx >= max_depth:
    print(f"Max depth reached for {website}")
    return None
  if verbose:
    print(f"Getting hrefs from {website}")
  try:
    # Without a timeout a single stalled server would hang the whole crawl.
    r = requests.get(
      website,
      headers={"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"},
      timeout=10,
    )
  except requests.RequestException as exc:
    # Connection errors and timeouts are reported like HTTP failures instead of
    # aborting the crawl with a traceback.
    print(f"Failed to get {website}. Error: {exc}")
    return None
  if not r.ok:
    print(f"Failed to get {website}. Status code: {r.status_code}")
    return None
  soup = bs(r.text,"html.parser")
  raw_links = [anchor.get("href") for anchor in soup.find_all("a")]
  # Keep only site-relative paths: skip missing/empty hrefs, the bare "/" link,
  # in-page anchors, and protocol-relative "//host/..." URLs (those point at
  # another host and must not be prefixed with root).
  relative_links = [
    link for link in raw_links
    if link
    and link.startswith("/")
    and not link.startswith("//")
    and '#' not in link
    and len(link) != 1
  ]
  # Drop repeated links on the same page (order preserved) so each one is
  # fetched and attached only once.
  href_links = [root + link for link in dict.fromkeys(relative_links)]
  tree_node = Tree(website)
  for link in href_links:
    # The recursive call already returns a Tree (or None); wrapping it in a new
    # Tree would store a Tree/None as the node value and lose the subtree.
    child = get_href(root, link, idx + 1, max_depth, verbose)
    if child is None:
      child = Tree(link)
    tree_node.add_child(child)
  return tree_node

In [None]:
# Crawl the site from its start page (depth limit 3, logging every fetch) and
# plot the link tree, unless the crawl returned nothing.
nodes = get_href(website, website, max_depth=3, verbose=True)
if nodes is not None:
  draw_tree(nodes)