In [1]:
from collections import OrderedDict
import requests
from bs4 import BeautifulSoup
def get_internal_links_with_title(url, limit=10, timeout=10):
    """Return the first internal article links found on a Wikipedia page.

    The page at ``url`` is downloaded and its main content container is
    scanned for anchors. An anchor is kept when its ``href``:

    * starts with ``/wiki/``,
    * contains no ``:`` (this drops namespaced targets such as ``File:``), and
    * does not contain the substring ``disambiguation``.

    Repeated ``(href, title)`` pairs are dropped, keeping the first occurrence,
    so the result preserves document order.

    Args:
        url: Address of the page to scan.
        limit: Maximum number of links to return (non-negative), or ``None``
            for every matching link. Defaults to 10.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of ``(absolute_url, title)`` tuples. The list is empty when the
        content container cannot be located. When an anchor has no ``title``
        attribute, the title is derived from the page name in its ``href``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import unquote

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # A class_ string containing a space is compared against the literal value
    # of the class attribute, so this lookup is brittle.
    main_text = soup.find("div", class_="mw-body-content mw-content-ltr")
    if main_text is None:
        # NOTE(review): falls back to the element id MediaWiki appears to use
        # for the article body -- confirm against the current page markup.
        main_text = soup.find("div", id="mw-content-text")
    if main_text is None:
        return []

    links_and_titles = []
    seen_pairs = set()
    for anchor in main_text.find_all('a', href=True):
        # Stop scanning as soon as enough links have been collected.
        if limit is not None and len(links_and_titles) >= limit:
            break

        href = anchor['href']
        if (not href.startswith('/wiki/')
                or ':' in href
                or 'disambiguation' in href):
            continue

        title = anchor.get('title')
        if not title:
            # No title attribute: rebuild a readable title from the page name
            # (fragment removed, percent-escapes decoded, underscores spaced).
            page_name = href[len('/wiki/'):].split('#', 1)[0]
            title = unquote(page_name).replace('_', ' ')

        pair = (href, title)
        if pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        links_and_titles.append(('https://en.wikipedia.org' + href, title))

    return links_and_titles


In [2]:
import requests
from bs4 import BeautifulSoup

def get_article_title(url, timeout=10):
    """Return the heading text of the page at ``url``.

    The text of the first ``<h1>`` element is used. If the page has no
    ``<h1>``, the text of its ``<title>`` element is used instead.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The heading text with surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has neither an ``<h1>`` nor a ``<title>``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    heading = soup.find("h1")
    if heading is None:
        heading = soup.find("title")
    if heading is None:
        raise ValueError("no <h1> or <title> element found at " + str(url))

    # Strip only the outer whitespace; get_text(strip=True) would also remove
    # the spaces between nested inline elements.
    return heading.get_text().strip()


In [3]:
from neo4j import GraphDatabase

def build_link_network(start_url, depth, uri=None, auth=None):
    """Crawl article links from ``start_url`` and store them in Neo4j.

    Every visited page becomes an ``Article`` node keyed by its URL (``name``)
    and each link becomes a ``LINKS_TO`` relationship carrying an
    ``order_position`` that is written only when the relationship is created.

    Args:
        start_url: URL of the page the crawl starts from.
        depth: Number of levels of pages to expand. Nothing is done when this
            is zero or negative.
        uri: Bolt URI of the database. When omitted, the ``NEO4J_URI``
            environment variable is used, falling back to
            ``bolt://localhost:7687``.
        auth: ``(user, password)`` tuple. When omitted, ``NEO4J_USER`` and
            ``NEO4J_PASSWORD`` are read from the environment, falling back to
            the values this function previously hard-coded.

    Raises:
        Any exception raised while fetching pages or running queries is
        propagated after the session and driver have been closed.
    """
    import os

    if depth <= 0:
        return

    if uri is None:
        uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
    if auth is None:
        # NOTE(review): the fallback password is kept only so existing callers
        # keep working; prefer supplying credentials via the environment.
        auth = (
            os.environ.get("NEO4J_USER", "neo4j"),
            os.environ.get("NEO4J_PASSWORD", "12345678"),
        )

    # Nodes are merged on the URL alone. Merging on {name, title} together
    # creates a second node for the same URL whenever the anchor title and the
    # page heading differ.
    page_query = "MERGE (a:Article {name: $name}) SET a.title = $title"
    link_query = (
        "MERGE (a:Article {name: $from_article}) "
        "MERGE (b:Article {name: $to_article}) "
        "ON CREATE SET b.title = $title "
        "MERGE (a)-[r:LINKS_TO]->(b) "
        "ON CREATE SET r.order_position = $order_position"
    )

    driver = GraphDatabase.driver(uri, auth=auth)
    try:
        session = driver.session()
        try:
            # Clear the database of any existing data (FOR TESTING)
            # session.run("MATCH (n) DETACH DELETE n")

            def traverse_links(url, remaining_depth, link_order):
                """Store ``url`` and its links, then expand the links."""
                links = get_internal_links_with_title(url)
                title = get_article_title(url)

                # The page heading is the authoritative title for this node.
                session.run(page_query, name=url, title=title)

                # Connect the current article to all of its links.
                for position, (link_url, link_title) in enumerate(
                        links, start=link_order + 1):
                    session.run(
                        link_query,
                        from_article=url,
                        to_article=link_url,
                        title=link_title,
                        order_position=position,
                    )

                if remaining_depth > 0:
                    # Children continue numbering after this page's links;
                    # this is the offset the previous code produced through
                    # its leftover loop index.
                    next_order = link_order + len(links)
                    for link_url, _ in links:
                        traverse_links(link_url, remaining_depth - 1, next_order)

            traverse_links(start_url, depth - 1, 0)
        finally:
            session.close()
    finally:
        driver.close()


In [5]:
# Populate the graph starting from the Neo4j article, with a crawl depth of 4.
START_URL = 'https://en.wikipedia.org/wiki/Neo4j'
CRAWL_DEPTH = 4
build_link_network(START_URL, CRAWL_DEPTH)