# Wikipedia Scraping

In this notebook I made a simple scraper for Wikipedia. The data from the scraper is used to plot a network graph in which the relation between different subjects is visualised.

This exercise served as practice with the BeautifulSoup library.


In [88]:
from urllib.parse import quote

import networkx as nx
import requests
from bs4 import BeautifulSoup
from pyvis.network import Network


In [89]:
def search_and_soup(search_string):
    '''Fetches the English Wikipedia page for a title and parses its HTML.

    Spaces in search_string are replaced with underscores and the result is
    percent-encoded before it is placed in the URL, so titles containing
    characters such as "?", "#" or "%" are requested as part of the path
    instead of being read as a query string or fragment.

    The HTTP status is deliberately not checked: whatever body the server
    returns (including an error page) is parsed, so a missing page does not
    abort a crawl over many titles.

    Args:
        search_string: page title to look up, e.g. "Web scraping".

    Returns:
        A BeautifulSoup object built from the response body with "html.parser".

    Raises:
        requests.exceptions.RequestException: on connection failure or when
            the server does not respond within the timeout.
    '''
    page_title = quote(search_string.replace(" ", "_"), safe="/:")
    # Without a timeout a single stalled connection blocks the notebook forever.
    response = requests.get(f"https://en.wikipedia.org/wiki/{page_title}", timeout=10)
    source = response.content
    soup = BeautifulSoup(source, "html.parser")

    return soup

def extract_soup(soup):
    '''Collects the title attribute of every titled link found inside a paragraph.

    Args:
        soup: parsed page exposing select(css_selector).

    Returns:
        List of title strings in the order select() returned the links;
        repeated titles are kept.
    '''
    titles = []
    # "p a[title]" matches <a> tags with a title attribute nested in a <p>
    for link in soup.select("p a[title]"):
        titles.append(link["title"])

    return titles

def create_graph(search_string, subjects, nx_graph = None):
    '''Links search_string to each subject in a graph.

    If no graph is given, a new nx.Graph is created and search_string is added
    as its root node with size=20. If nx_graph is given, nodes and edges are
    added to that graph in place.

    Subjects that are not yet in the graph are added with size=5. Subjects that
    already exist keep their current attributes, so a page that links back to
    the root does not shrink the root node to size 5.

    Args:
        search_string: node that every subject is connected to.
        subjects: iterable of node names to connect to search_string.
        nx_graph: optional existing graph to extend.

    Returns:
        The graph that was created or extended.
    '''
    if nx_graph is None:
        nx_graph = nx.Graph()
        nx_graph.add_node(search_string, size=20)

    for title in subjects:
        # add_node on an existing node updates its attributes, which would
        # overwrite a previously assigned size; only size brand-new nodes.
        if title not in nx_graph:
            nx_graph.add_node(title, size=5)
        nx_graph.add_edge(search_string, title, weight=5)

    return nx_graph

In [90]:
search_string = "Web scraping"

soup = search_and_soup(search_string)
subjects = extract_soup(soup)
# 100 is set as the maximum of first order relations to show.
# The cap must come after extraction; slicing already copes with shorter lists.
subjects = subjects[:100]
nx_graph = create_graph(search_string, subjects)

# 50 subjects to be explored on relations is set as maximum.
# A separate loop variable keeps search_string and soup pointing at the root page.
for subject in subjects[:50]:
    subject_soup = search_and_soup(subject)
    # 10 is set as the maximum of second order relations to show
    subjects2 = extract_soup(subject_soup)[:10]
    nx_graph = create_graph(subject, subjects2, nx_graph)
    

In [91]:
# physics configuration string later passed to nt.set_options
options = 'var options = {"physics": { "enabled": true, "barnesHut": {"springLength": 100},"minVelocity": 1,"solver": "forceAtlas2Based","timestep": 0.61}}'
# 1000px by 1000px network created in notebook mode
nt = Network(height="1000px", width="1000px", notebook=True)
# load the nodes and edges of the networkx graph into the pyvis network
nt.from_nx(nx_graph)
nt.set_options(options)
# render the network to nx.html
nt.show("nx.html")