In [11]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import io

In [3]:
# Check you're allowed to scrape the page.

# Split url between domain and a specific page.
query = 'Squash_(sport)'
url = 'https://en.wikipedia.org/wiki/' + query

# This grabs us the html of the entire page
page = requests.get(url)
soup = BeautifulSoup(page.content, features="html.parser")
print(soup)

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-enabled vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Squash (sport) - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-

In [4]:
# Check if the request was sucsessful. We want it to be 200, or at least start with a 2... anything else is a problem.

print(page.status_code)

200


In [5]:
# Initialise empty arrays for links

links = []

# Find only links on this page which are <a> anchor tags. We use 'try' and 'except' as some of the anchors may not have an 'href', which we can ignore, as these would otherwise could cause an error.

for a in soup.find_all("a"):
    try:
        links.append(a["href"])
    except:
        pass


# Loop to cycle though the array and print each link.
for link in links:
    print(link) # Many of the returned links are from outside wikipedia, but we only want internal links.

#bodyContent
/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
//en.wikipedia.org/wiki/Wikipedia:Contact_us
https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/w/index.php?title=Special:CreateAccount&returnto=Squash+%28sport%29
/w/index.php?title=Special:UserLogin&returnto=Squash+%28sport%29
/w/index.php?title=Special:CreateAccount&returnto=Squash+%28sport%29
/w/index.php?title=Special:UserLogin&returnto=Squash+%28sport%29
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
#
#History
#Equipment
#Racket
#Ball
#Court
#Manner_of_play
#Service
#Play
#Strategy_and_tactics
#Interference_and_obstruction
#Referee
#Scoring_system
#Point-a-Rally

In [None]:
###Up to here

In [6]:
# Filter the array to links starting with /wiki/ i.e internal links.

filtered = [link for link in links if link.startswith('/wiki/')]

for f in filtered:
    print(f) # Still cluttered with links not needed.

/wiki/Main_Page
/wiki/Wikipedia:Contents
/wiki/Portal:Current_events
/wiki/Special:Random
/wiki/Wikipedia:About
/wiki/Help:Contents
/wiki/Help:Introduction
/wiki/Wikipedia:Community_portal
/wiki/Special:RecentChanges
/wiki/Wikipedia:File_upload_wizard
/wiki/Main_Page
/wiki/Special:Search
/wiki/Help:Introduction
/wiki/Special:MyContributions
/wiki/Special:MyTalk
/wiki/Squash_(sport)
/wiki/Talk:Squash_(sport)
/wiki/Squash_(sport)
/wiki/Squash_(sport)
/wiki/Special:WhatLinksHere/Squash_(sport)
/wiki/Special:RecentChangesLinked/Squash_(sport)
/wiki/Wikipedia:File_Upload_Wizard
/wiki/Special:SpecialPages
/wiki/File:Semifinal_Squash_SM_2021.jpg
/wiki/Sports_governing_body
/wiki/World_Squash_Federation
/wiki/England
/wiki/United_Kingdom
/wiki/Contact_sport#Limited-contact
/wiki/Mixed-sex_sports
/wiki/List_of_racket_sports
/wiki/Racket_(sports_equipment)
/wiki/Goggles
/wiki/Olympic_Games
/wiki/2028_Summer_Olympics
/wiki/World_Games
/wiki/1997_World_Games
/wiki/2005_World_Games
/wiki/List_of_ra

In [13]:

ignores = ['png', 'jpg', 'jpeg', 'isbn', 'svg', 'identifier', \
           'File', 'Special', 'Template', 'Mailto', 'Portal', \
           'Help', 'Category', 'Talk', 'Wikipedia', 'Main_Page']

filtered = []

# Loop flags lines with links to wiki pages as valid.
for link in links:
    if link.startswith('/wiki/'):
        valid = True

# Completes the other entries as invalid
        for ignore in ignores:
            if ignore in link:
                valid = False
                break

# Any links still valid are added to 'filtered' array
        if valid:
            filtered.append(link)

for f in filtered:
    print(f)

/wiki/Squash_(sport)
/wiki/Squash_(sport)
/wiki/Squash_(sport)
/wiki/Sports_governing_body
/wiki/World_Squash_Federation
/wiki/England
/wiki/United_Kingdom
/wiki/Contact_sport#Limited-contact
/wiki/Mixed-sex_sports
/wiki/List_of_racket_sports
/wiki/Racket_(sports_equipment)
/wiki/Goggles
/wiki/Olympic_Games
/wiki/2028_Summer_Olympics
/wiki/World_Games
/wiki/1997_World_Games
/wiki/2005_World_Games
/wiki/List_of_racket_sports
/wiki/Ball_game
/wiki/World_Squash_Federation
/wiki/International_Olympic_Committee
/wiki/Olympic_Games
/wiki/2028_Summer_Olympics
/wiki/Professional_Squash_Association
/wiki/Rackets_(sport)
/wiki/Harrow_School
/wiki/Natural_rubber
/wiki/St._Paul%27s_School_(Concord,_New_Hampshire)
/wiki/Concord,_New_Hampshire
/wiki/U.S._Squash
/wiki/Fives
/wiki/RMS_Titanic
/wiki/First_class_facilities_of_the_RMS_Titanic
/wiki/Royal_Automobile_Club
/wiki/England_Squash
/wiki/Great_Britain
/wiki/Hardball_squash
/wiki/Squash_Doubles
/wiki/Squash_tennis
/wiki/Racquetball#United_Kingdom

In [12]:
# Get the response in html.
wikiurl="https://en.wikipedia.org/wiki/AFC_Wimbledon"

# Check request was sucsessful (code 200)
response=requests.get(wikiurl)
print(response.status_code)

# Parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')

# Objective is to find any element with the table tag, however, there are some of these we dont want from this page, so we specify only tables using the "wikitable" class.
tabledata=soup.find('table',{'class':"wikitable"})

# Read the table data
df=pd.read_html(io.StringIO(str(tabledata)))

# Convert list to pandas dataframe
df=pd.DataFrame(df[0])
print(df.head())

# Write the data to a .csv file
df.to_csv('team_info.csv', sep='\t', encoding='utf-8')

200


ImportError: Missing optional dependency 'lxml'.  Use pip or conda to install lxml.