In [14]:
import json

import bs4
from bs4 import BeautifulSoup     
import requests

PARSER = "lxml"            # to use lxml (the most common), you'll need to install with .../pip install lxml

In [15]:
star_wars_url = "https://en.wikipedia.org/wiki/Star_Wars"
response = requests.get(star_wars_url)
data_from_url = response.text
soup = BeautifulSoup(data_from_url,PARSER)


In [16]:
queue = ["Star Wars"]

In [17]:
def get_wiki_graph_one_step(l):
    """
    Takes a list of wikipedia articles with no repeats. 
    Returns A dictionary where keys are items in original list. 
    Values are lists of references to other wikipedia articles
    """
    d = {}
    # s = set()

    for i in range(len(l)):
        title = l[i]
        wiki_url = "https://en.wikipedia.org/wiki/" + title

        # Request Wikipedia and Parse
        response = requests.get(wiki_url)
        data_from_url = response.text
        soup = BeautifulSoup(data_from_url,PARSER)

        print(f"({i}/{len(l)}): {title}")

        # Capture all referenced articles within article
        link_set = set()  # Use a set to ensure no repeats
        for link in soup.find_all('a'):
            s = link.get('href')
            if (s and s[:6] == "/wiki/"):
                ref = s[6:]

                # Make sure that title does not include ":" (means it is not normal wikipedia page)
                if (ref.find(":") == -1):
                    link_set.add(ref)
        d[title] = list(link_set)

    return d

    

In [18]:
def list_of_lists_to_set(lol):
    s = set()
    for l in lol:
        for item in l:
            s.add(item)

    return s

In [19]:
def get_wiki_graph(final_d = {}, starting_refs=[], num_steps=1):
    if starting_refs:
        ref_list = starting_refs
    else:
        ref_list = ["New_York_State_Route_373"]

    for i in range(num_steps):
        step_d = get_wiki_graph_one_step(ref_list)
        final_d.update(step_d)

        # Get all of the references that haven't been added to graph
        step_refs = list_of_lists_to_set(list(step_d.values()))
        existing_refs = set(final_d.keys())
        unseen_refs = step_refs.difference(existing_refs)

        ref_list = list(unseen_refs)

    return (final_d, step_refs, ref_list)

In [21]:
final_d, step_refs, ref_list = get_wiki_graph(starting_refs=["New_York_State_Route_373"], num_steps=3)

(0/1): New_York_State_Route_373
(0/59): Auto_trail
(1/59): Parkways_in_New_York
(2/59): Chesterfield,_New_York
(3/59): Port_Kent_and_Hopkinton_Turnpike
(4/59): Burlington,_Vermont
(5/59): Plattsburgh,_New_York
(6/59): ISBN_(identifier)
(7/59): County_Route_17_(Essex_County,_New_York)
(8/59): 52nd_New_York_State_Legislature
(9/59): Interstate_87_(New_York)
(10/59): Lake_Champlain
(11/59): New_York_State_Legislature
(12/59): Theodore_Roosevelt_International_Highway
(13/59): Baltimore,_Maryland
(14/59): Hopkinton,_New_York
(15/59): New_York_State_Route_372
(16/59): Lake_Champlain_Transportation_Company#Burlington-Port_Kent
(17/59): General_Drafting
(18/59): Lake_Champlain_Transportation_Company
(19/59): Port_Kent_(Amtrak_station)
(20/59): Albany,_New_York
(21/59): New_York_State_Route_374
(22/59): Ausable_Chasm,_New_York
(23/59): Amtrak
(24/59): Toll_gate
(25/59): Hamlet_(New_York)
(26/59): Google_Maps
(27/59): List_of_U.S._Routes_in_New_York
(28/59): Ausable_Chasm
(29/59): Toll_road
(30/

In [22]:
# create json object from dictionary
json = json.dumps(final_d)

# open file for writing, "w" 
f = open("wiki_graph.json","w")

# write json object to file
f.write(json)

# close file
f.close()


In [10]:
"""
d = {}
steps_remaining = 2

while steps_remaining > 0 and len(queue)>0:
    title = queue.pop(0)
    wiki_url = "https://en.wikipedia.org/wiki/" + title

    # Request Wikipedia and Parse
    response = requests.get(wiki_url)
    data_from_url = response.text
    soup = BeautifulSoup(data_from_url,PARSER)

    # Capture all referenced articles within article
    link_set = set()  # Use a set to ensure no repeats
    for link in soup.find_all('a'):
        s = link.get('href')
        if (s and s[:6] == "/wiki/"):
            ref = s[6:]
            link_set.add(ref)
            if (title not in d):
                queue.append(ref)

"""

'\nd = {}\nsteps_remaining = 2\n\nwhile steps_remaining > 0 and len(queue)>0:\n    title = queue.pop(0)\n    wiki_url = "https://en.wikipedia.org/wiki/" + title\n\n    # Request Wikipedia and Parse\n    response = requests.get(wiki_url)\n    data_from_url = response.text\n    soup = BeautifulSoup(data_from_url,PARSER)\n\n    # Capture all referenced articles within article\n    link_set = set()  # Use a set to ensure no repeats\n    for link in soup.find_all(\'a\'):\n        s = link.get(\'href\')\n        if (s and s[:6] == "/wiki/"):\n            ref = s[6:]\n            link_set.add(ref)\n            if (title not in d):\n                queue.append(ref)\n\n'