# Web Scraping

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

## Apply for Twitter API Developer Account

#### Web Scraping an Article from Tagesschau

In [3]:
url = 'https://www.tagesschau.de/ausland/ischgl-corona-111.html'
requests.get(url)

<Response [200]>

In [6]:
html = requests.get(url).content
html
# the .content attribute (the raw response body as bytes) is what we will be using the most for web scraping
# output is the raw html from the website

b'<!DOCTYPE html>\n<html lang="de">\n<head>\n<meta http-equiv="X-UA-Compatible" content="IE=edge"/>\n<title>Corona-Hotspot Ischgl: Justiz ermittelt gegen vier Personen | tagesschau.de</title>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\n<meta http-equiv="pragma" content="no-cache"/>\n<meta http-equiv="cache-control" content="private"/>\n<meta name="viewport" content="width=device-width, user-scalable=yes, initial-scale=1.0, minimum-scale=1.0"/>\n<meta name="apple-mobile-web-app-capable" content="no"/>\n<meta name="description" content="Ischgl soll ma\xc3\x9fgeblich zur Virus-Verbreitung in Teilen Europas beigetragen haben. Was lief schief im Tiroler Skiort? 10.000 Seiten Material hat die Justiz zusammengetragen - gegen vier Personen wird ermittelt. <em>Von Clemens Verenkotte.</em>" />\n<meta name="keywords" content="Nachrichten, Inland, Ausland, Wirtschaft, Sport, Kultur Reportage, Bericht, News, Tagesthemen, Aktuell, Neu, Neuigkeiten, Hintergrund, Hintergrund

In [9]:
soup = BeautifulSoup(html, 'lxml') # BeautifulSoup takes 2 arguments: WHAT we are parsing (html), and WHICH parser to use (aka HOW) -- here 'lxml'.

# compared to the raw bytes output above, you can already see that the parsed website is more structured.
# this is a kind of preprocessing step that we need before we can search the document.

soup

<!DOCTYPE html>
<html lang="de">
<head>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<title>Corona-Hotspot Ischgl: Justiz ermittelt gegen vier Personen | tagesschau.de</title>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="no-cache" http-equiv="pragma"/>
<meta content="private" http-equiv="cache-control"/>
<meta content="width=device-width, user-scalable=yes, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>
<meta content="no" name="apple-mobile-web-app-capable"/>
<meta content="Ischgl soll maßgeblich zur Virus-Verbreitung in Teilen Europas beigetragen haben. Was lief schief im Tiroler Skiort? 10.000 Seiten Material hat die Justiz zusammengetragen - gegen vier Personen wird ermittelt. &lt;em&gt;Von Clemens Verenkotte.&lt;/em&gt;" name="description"/>
<meta content="Nachrichten, Inland, Ausland, Wirtschaft, Sport, Kultur Reportage, Bericht, News, Tagesthemen, Aktuell, Neu, Neuigkeiten, Hintergrund, Hintergrund, Information, Politik, I

In [10]:
soup.prettify()

'<!DOCTYPE html>\n<html lang="de">\n <head>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <title>\n   Corona-Hotspot Ischgl: Justiz ermittelt gegen vier Personen | tagesschau.de\n  </title>\n  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>\n  <meta content="no-cache" http-equiv="pragma"/>\n  <meta content="private" http-equiv="cache-control"/>\n  <meta content="width=device-width, user-scalable=yes, initial-scale=1.0, minimum-scale=1.0" name="viewport"/>\n  <meta content="no" name="apple-mobile-web-app-capable"/>\n  <meta content="Ischgl soll maßgeblich zur Virus-Verbreitung in Teilen Europas beigetragen haben. Was lief schief im Tiroler Skiort? 10.000 Seiten Material hat die Justiz zusammengetragen - gegen vier Personen wird ermittelt. &lt;em&gt;Von Clemens Verenkotte.&lt;/em&gt;" name="description"/>\n  <meta content="Nachrichten, Inland, Ausland, Wirtschaft, Sport, Kultur Reportage, Bericht, News, Tagesthemen, Aktuell, Neu, Neuigkeiten, Hintergru

In [11]:
soup.find_all('h1') 

# here, we will want to use either find (which, similar to re.search, returns only the first match) or find_all (which returns every match)
# BUT these tag searches are methods of the parsed soup: they will not work on the raw html bytes, only after you've parsed the content.

[<h1>
 <span class="dachzeile">Corona-Hotspot Ischgl</span>
 <span class="headline">Justiz ermittelt gegen vier Personen</span>
 </h1>]

In [12]:
soup.find_all('h2')

[<h2 class="subtitle small">Corona-Verordnung Stunden später an "schwarzen Brettern"</h2>,
 <h2 class="subtitle small">"Mehr konnten wir nicht tun"</h2>,
 <h2 class="subtitle small">10.000 Seiten Beweismaterial gesichtet</h2>,
 <h2 class="hidden">Überblick über die tagesschau.de-Seiten und weitere ARD Online-Angebote</h2>]

In [13]:
soup.find_all('p')

[<p>Detail Navigation:</p>,
 <p class="text"><span class="stand">Stand: 30.09.2020 13:30 Uhr</span></p>,
 <p class="text small"><strong>Ischgl soll maßgeblich zur Virus-Verbreitung in Teilen Europas beigetragen haben. Was lief schief im Tiroler Skiort? 10.000 Seiten Material hat die Justiz zusammengetragen - gegen vier Personen wird ermittelt.</strong></p>,
 <p class="autorenzeile small">Von Clemens Verenkotte, ARD-Studio Wien</p>,
 <p class="text small"> Die Staatsanwaltschaft Innsbruck hat gegen vier Beschuldigte rund um die Umsetzung der Quarantäne-Verordnungen im Paznauntal und der Verkehrsbeschränkungen in Ischgl Mitte März dieses Jahres Ermittlungsverfahren eingeleitet. Dies bestätigte der Sprecher der Staatsanwaltschaft Innsbruck, Hansjörg Mayr, gegenüber dem <em>ARD-Studio Wien</em>. Auskünfte dazu, um wen es sich bei den Beschuldigten handele, würden nicht erteilt.</p>,
 <p class="text small"> Damit richten sich rund sechs Monate nach der teilweise chaotisch abgelaufenen Abrei

In [14]:
soup.find_all(['h1', 'h2']) # need to pass this as a list if doing more than one at once


[<h1>
 <span class="dachzeile">Corona-Hotspot Ischgl</span>
 <span class="headline">Justiz ermittelt gegen vier Personen</span>
 </h1>,
 <h2 class="subtitle small">Corona-Verordnung Stunden später an "schwarzen Brettern"</h2>,
 <h2 class="subtitle small">"Mehr konnten wir nicht tun"</h2>,
 <h2 class="subtitle small">10.000 Seiten Beweismaterial gesichtet</h2>,
 <h2 class="hidden">Überblick über die tagesschau.de-Seiten und weitere ARD Online-Angebote</h2>]

In [15]:
# note: find_all returns a list-like ResultSet with multiple elements. We want to cleanly extract JUST the text.

In [16]:
headings2 = soup.find_all('h2')

In [17]:
[i for i in headings2] # use a list comprehension to extract the headings, then extract JUST the text in the next step. 

[<h2 class="subtitle small">Corona-Verordnung Stunden später an "schwarzen Brettern"</h2>,
 <h2 class="subtitle small">"Mehr konnten wir nicht tun"</h2>,
 <h2 class="subtitle small">10.000 Seiten Beweismaterial gesichtet</h2>,
 <h2 class="hidden">Überblick über die tagesschau.de-Seiten und weitere ARD Online-Angebote</h2>]

In [18]:
[i.text for i in headings2] # here, you are extracting just the text! 

# note: you could also sidestep making an extra variable:

# [i.text for i in soup.find_all('h2')]

['Corona-Verordnung Stunden später an "schwarzen Brettern"',
 '"Mehr konnten wir nicht tun"',
 '10.000 Seiten Beweismaterial gesichtet',
 'Überblick über die tagesschau.de-Seiten und weitere ARD Online-Angebote']

In [19]:
[i.text for i in soup.find_all(['h1', 'h2', 'p'])]

['Detail Navigation:',
 '\nCorona-Hotspot Ischgl\nJustiz ermittelt gegen vier Personen\n',
 'Stand: 30.09.2020 13:30 Uhr',
 'Ischgl soll maßgeblich zur Virus-Verbreitung in Teilen Europas beigetragen haben. Was lief schief im Tiroler Skiort? 10.000 Seiten Material hat die Justiz zusammengetragen - gegen vier Personen wird ermittelt.',
 'Von Clemens Verenkotte, ARD-Studio Wien',
 ' Die Staatsanwaltschaft Innsbruck hat gegen vier Beschuldigte rund um die Umsetzung der Quarantäne-Verordnungen im Paznauntal und der Verkehrsbeschränkungen in Ischgl Mitte März dieses Jahres Ermittlungsverfahren eingeleitet. Dies bestätigte der Sprecher der Staatsanwaltschaft Innsbruck, Hansjörg Mayr, gegenüber dem ARD-Studio Wien. Auskünfte dazu, um wen es sich bei den Beschuldigten handele, würden nicht erteilt.',
 'Corona-Verordnung Stunden später an "schwarzen Brettern"',
 ' Damit richten sich rund sechs Monate nach der teilweise chaotisch abgelaufenen Abreise von rund 10.000 Touristen aus Ischgl und dem 

In [23]:
# collect every <a> (link) tag of the article into a plain list
# (calling .text on each tag would give just the text displayed in the link)
list(soup.find_all('a'))

[<a class="hidden" id="seitenanfang"></a>,
 <a accesskey="1" href="#mainmenu" title="[ALT + 1]">Hauptnavigation</a>,
 <a accesskey="2" href="#goToContent" title="[ALT + 2]">Zum Inhalt</a>,
 <a accesskey="5" href="#goToSearch" title="[ALT + 5]">Zur Suche</a>,
 <a accesskey="6" href="#seitenanfang" title="[ALT + 6]">Zum Seitenanfang</a>,
 <a class="hidden" name="goToHead"></a>,
 <a class="hidden" href="#goToContent">Zum Inhalt</a>,
 <a href="#">ARD Navigation</a>,
 <a class="home" href="https://www.ard.de" rel="nofollow">ARD Home</a>,
 <a href="https://www.tagesschau.de">Nachrichten</a>,
 <a href="https://www.sportschau.de" rel="nofollow">Sport</a>,
 <a href="https://boerse.ard.de" rel="nofollow">Börse</a>,
 <a href="https://www.ard.de/ratgeber" rel="nofollow">Ratgeber</a>,
 <a href="https://www.ard.de/wissen" rel="nofollow">Wissen</a>,
 <a href="https://www.ard.de/kultur" rel="nofollow">Kultur</a>,
 <a href="https://www.ard.de/kinder" rel="nofollow">Kinder</a>,
 <a href="https://www.ard

In [30]:
# keep the text of every heading and paragraph in a variable for reuse
text = [element.text for element in soup.find_all(['h1', 'h2', 'p'])]
# print the list with a line break in between each element
print(*text, sep="\n")

Detail Navigation:

Corona-Hotspot Ischgl
Justiz ermittelt gegen vier Personen

Stand: 30.09.2020 13:30 Uhr
Ischgl soll maßgeblich zur Virus-Verbreitung in Teilen Europas beigetragen haben. Was lief schief im Tiroler Skiort? 10.000 Seiten Material hat die Justiz zusammengetragen - gegen vier Personen wird ermittelt.
Von Clemens Verenkotte, ARD-Studio Wien
 Die Staatsanwaltschaft Innsbruck hat gegen vier Beschuldigte rund um die Umsetzung der Quarantäne-Verordnungen im Paznauntal und der Verkehrsbeschränkungen in Ischgl Mitte März dieses Jahres Ermittlungsverfahren eingeleitet. Dies bestätigte der Sprecher der Staatsanwaltschaft Innsbruck, Hansjörg Mayr, gegenüber dem ARD-Studio Wien. Auskünfte dazu, um wen es sich bei den Beschuldigten handele, würden nicht erteilt.
Corona-Verordnung Stunden später an "schwarzen Brettern"
 Damit richten sich rund sechs Monate nach der teilweise chaotisch abgelaufenen Abreise von rund 10.000 Touristen aus Ischgl und dem Paznauntal am Freitag, den 13. Mä

#### Writing a function for getting the soup from a url

In [45]:
# RATHER than re-defining the same 3 steps, DEFINE A FUNCTION!
# input = url
# output = soup

def make_soup(url, parser='lxml'):
    """Download a web page and parse it into a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        parser: Name of the parser BeautifulSoup should use; defaults to
            'lxml', the parser used throughout this notebook.

    Returns:
        The BeautifulSoup object built from the raw bytes of the response.

    Note:
        The HTTP status code is not checked, so an error page is parsed
        like any other page.
    """
    # fetch the page exactly once; requesting it a second time would only
    # repeat the download and discard the first response
    response = requests.get(url)
    return BeautifulSoup(response.content, parser)

make_soup('https://en.wikipedia.org/wiki/List_of_police_dog_breeds')

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of police dog breeds - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"fe459afe-b8a4-4683-9e3b-8e5425e96dda","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_police_dog_breeds","wgTitle":"List of police dog breeds","wgCurRevisionId":980139453,"wgRevisionId":980139453,"wgArticleId":17333279,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Articles to be merged from July 2020","All articles

#### Webscraping a Wikipedia Article

In [37]:
url2 = 'https://en.wikipedia.org/wiki/List_of_police_dog_breeds'
requests.get(url2)

<Response [200]>

In [40]:
html2 = requests.get(url2).content
html2

b'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of police dog breeds - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"fe459afe-b8a4-4683-9e3b-8e5425e96dda","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_police_dog_breeds","wgTitle":"List of police dog breeds","wgCurRevisionId":980139453,"wgRevisionId":980139453,"wgArticleId":17333279,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Articles to be merged from July 2020","All a

In [47]:
soup2 = make_soup('https://en.wikipedia.org/wiki/List_of_police_dog_breeds') # do the same thing, but use the FUNCTION you just defined.
soup2.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   List of police dog breeds - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"fe459afe-b8a4-4683-9e3b-8e5425e96dda","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_police_dog_breeds","wgTitle":"List of police dog breeds","wgCurRevisionId":980139453,"wgRevisionId":980139453,"wgArticleId":17333279,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Articles to be merged fr

In [48]:
 help(soup2.find_all)

Help on method find_all in module bs4.element:

find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) method of bs4.BeautifulSoup instance
    Look in the children of this PageElement and find all
    PageElements that match the given criteria.
    
    All find_* methods take a common set of arguments. See the online
    documentation for detailed explanations.
    
    :param name: A filter on tag name.
    :param attrs: A dictionary of filters on attribute values.
    :param recursive: If this is True, find_all() will perform a
        recursive search of this PageElement's children. Otherwise,
        only the direct children will be considered.
    :param limit: Stop looking after finding this many results.
    :kwargs: A dictionary of filters on attribute values.
    :return: A ResultSet of PageElements.
    :rtype: bs4.element.ResultSet



In [50]:
# we want to extract the items of the list of all police dogs!
soup2.find_all('li')

[<li class="toclevel-1 tocsection-1"><a href="#All_police_dog_breeds_used_in_law_enforcement"><span class="tocnumber">1</span> <span class="toctext">All police dog breeds used in law enforcement</span></a></li>,
 <li class="toclevel-1 tocsection-2"><a href="#Illicit-substance_detection_dogs"><span class="tocnumber">2</span> <span class="toctext">Illicit-substance detection dogs</span></a></li>,
 <li class="toclevel-1 tocsection-3"><a href="#Tracking_dogs"><span class="tocnumber">3</span> <span class="toctext">Tracking dogs</span></a></li>,
 <li class="toclevel-1 tocsection-4"><a href="#Cadaver-sniffing_dogs"><span class="tocnumber">4</span> <span class="toctext">Cadaver-sniffing dogs</span></a></li>,
 <li class="toclevel-1 tocsection-5"><a href="#See_also"><span class="tocnumber">5</span> <span class="toctext">See also</span></a></li>,
 <li class="toclevel-1 tocsection-6"><a href="#References"><span class="tocnumber">6</span> <span class="toctext">References</span></a></li>,
 <li><a hr

In [51]:
# use list comp. to extract text of these items

[i.text for i in soup2.find_all('li')]

# this still has too much useless info! Start filtering with RegEx below:

['1 All police dog breeds used in law enforcement',
 '2 Illicit-substance detection dogs',
 '3 Tracking dogs',
 '4 Cadaver-sniffing dogs',
 '5 See also',
 '6 References',
 'Airedale Terrier',
 'Akita',
 'Belgian Malinois',
 'Belgian Sheepdog',
 'Staffordshire Bull Terrier',
 'Border Collie[1]',
 'Bouvier des Flandres',
 'Boxer',
 'Doberman Pinscher',
 'Dutch Shepherd',
 'German Shepherd',
 'Giant Schnauzer',
 'Indian pariah dog[2]',
 'Labrador Retriever',
 'Rottweiler',
 'Weimaraner',
 'Bloodhound',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'Indian pariah dog[2]',
 'Beagle',
 'Springer Spaniel',
 'German Shorthaired Pointer[7]',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'German Shorthaired Pointer[7]',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'Police dog',
 'List of fictional dogs',
 'List of dog breeds',
 'List of dog types',
 'Dogs in warfare',
 'List of dog fighting breeds',
 '^ Allsopp, Nigel (2012). K9 Cop

In [54]:
# add conditions to your list comp. to filter for more specific information
# start by only choosing text that starts with uppercase letter. 
# re.match() looks at just the BEGINNING of each string. 

[ i.text for i in soup2.find_all('li') if re.match('[A-Z]', i.text) ]

['Airedale Terrier',
 'Akita',
 'Belgian Malinois',
 'Belgian Sheepdog',
 'Staffordshire Bull Terrier',
 'Border Collie[1]',
 'Bouvier des Flandres',
 'Boxer',
 'Doberman Pinscher',
 'Dutch Shepherd',
 'German Shepherd',
 'Giant Schnauzer',
 'Indian pariah dog[2]',
 'Labrador Retriever',
 'Rottweiler',
 'Weimaraner',
 'Bloodhound',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'Indian pariah dog[2]',
 'Beagle',
 'Springer Spaniel',
 'German Shorthaired Pointer[7]',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'German Shorthaired Pointer[7]',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'Police dog',
 'List of fictional dogs',
 'List of dog breeds',
 'List of dog types',
 'Dogs in warfare',
 'List of dog fighting breeds',
 'Law enforcement-related lists',
 'Lists of breeds',
 'Police dog',
 'Articles with short description',
 'Short description is different from Wikidata',
 'Articles to be merged from July 2020',
 'All art

In [60]:
# build the filtered list ONCE, then subset it to get just the first chunk of elements

capitalized_items = [i.text for i in soup2.find_all('li') if re.match('[A-Z]', i.text)]

# keep only the first 17 entries ('Airedale Terrier' through 'Bloodhound' in the output below);
# later entries include see-also links and page categories (e.g. 'List of dog breeds').
# NOTE(review): 17 is hard-coded from inspecting the output, and breeds listed after
# 'Bloodhound' (e.g. 'Beagle') are not included -- confirm this cutoff is intended.
dogs = capitalized_items[0:17]

capitalized_items
# the full filtered list is displayed for comparison; `dogs` holds just the subset.

['Airedale Terrier',
 'Akita',
 'Belgian Malinois',
 'Belgian Sheepdog',
 'Staffordshire Bull Terrier',
 'Border Collie[1]',
 'Bouvier des Flandres',
 'Boxer',
 'Doberman Pinscher',
 'Dutch Shepherd',
 'German Shepherd',
 'Giant Schnauzer',
 'Indian pariah dog[2]',
 'Labrador Retriever',
 'Rottweiler',
 'Weimaraner',
 'Bloodhound',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'Indian pariah dog[2]',
 'Beagle',
 'Springer Spaniel',
 'German Shorthaired Pointer[7]',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'German Shorthaired Pointer[7]',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)[6]',
 'Police dog',
 'List of fictional dogs',
 'List of dog breeds',
 'List of dog types',
 'Dogs in warfare',
 'List of dog fighting breeds',
 'Law enforcement-related lists',
 'Lists of breeds',
 'Police dog',
 'Articles with short description',
 'Short description is different from Wikidata',
 'Articles to be merged from July 2020',
 'All art

In [56]:
[ i.text.replace('[', '').replace(']', '').strip() for i in soup2.find_all('li') if re.match('[A-Z]', i.text) ]

# replace() swaps each square bracket for an empty string, i.e. it deletes the brackets
# strip() removes leading AND trailing white space
# note: the footnote number itself is left behind (e.g. 'Border Collie1' in the output below)
# continue using data cleaning methods to clean this text

['Airedale Terrier',
 'Akita',
 'Belgian Malinois',
 'Belgian Sheepdog',
 'Staffordshire Bull Terrier',
 'Border Collie1',
 'Bouvier des Flandres',
 'Boxer',
 'Doberman Pinscher',
 'Dutch Shepherd',
 'German Shepherd',
 'Giant Schnauzer',
 'Indian pariah dog2',
 'Labrador Retriever',
 'Rottweiler',
 'Weimaraner',
 'Bloodhound',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)6',
 'Indian pariah dog2',
 'Beagle',
 'Springer Spaniel',
 'German Shorthaired Pointer7',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)6',
 'German Shorthaired Pointer7',
 'Collie (a.k.a. Rough Collie, Smooth Collie, Scotch Collie)6',
 'Police dog',
 'List of fictional dogs',
 'List of dog breeds',
 'List of dog types',
 'Dogs in warfare',
 'List of dog fighting breeds',
 'Law enforcement-related lists',
 'Lists of breeds',
 'Police dog',
 'Articles with short description',
 'Short description is different from Wikidata',
 'Articles to be merged from July 2020',
 'All articles to be merg

In [61]:
pd.DataFrame(dogs)

# we can even make a dataframe out of it! 
# here I will make a dataframe out of just the subset (created and set to 'dogs' above) to avoid the superfluous elements of the scraped list. 

Unnamed: 0,0
0,Airedale Terrier
1,Akita
2,Belgian Malinois
3,Belgian Sheepdog
4,Staffordshire Bull Terrier
5,Border Collie[1]
6,Bouvier des Flandres
7,Boxer
8,Doberman Pinscher
9,Dutch Shepherd


In [66]:
# find all the links

# soup2.find_all('a')

# note: there are more arguments that we probably want to specify.
# for example, if you inspect all the links, they specify a "class". Let's say we want only the "mw-jump-link" ones.
# help(soup2.find_all) (pass the method itself, without calling it) shows the arguments you can specify:
# 'name' filters on the tag name (here 'a'), and 'attrs' filters on attribute values (here we want "class").
# you specify your attributes as key/value pairs in a dictionary (this is also described in the help output!)

soup2.find_all('a', {'class': 'mw-jump-link'}) # we searched for the tag 'a' (links), and then filtered on the attribute 'class'


[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>]