# SCRAPING AND VIZUALIZING DATA FROM WIKIPEDIA

The aim of this notebook is to scrounge through wikipedia data on Nobel Prize winners. Find some information about nobel prize winners, experiment and end up vizualizing the data with Javascript

## PART ONE: DATA GATHERING AND WRANGLING

### SCRAPING AND CLEANING THE DATA

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
BASE_URL = 'http://en.wikipedia.org'
    # Wikipedia will reject our request unless we add a 'User-Agent' attribute to our http header.

HEADERS = {'User-Agent': 'Chrome/70.0.3538.110'}

def get_nobel_soup():
    """ Return a parsed tag tree of our Nobel prize page """
    
    # Make a request to the Nobel page, setting valid headers
    
    response = requests.get( BASE_URL + '/wiki/List_of_Nobel_laureates', headers=HEADERS)
    
    # Return the content of the response parsed by BeautifulSoup
    
    if response.status_code != 200:
        print ("Sorry, couldn't reach page for some reason.")
        return 0;
    
    return BeautifulSoup(response.content, "lxml")

In [3]:
soup = get_nobel_soup()

In [4]:
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of Nobel laureates - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_Nobel_laureates","wgTitle":"List of Nobel laureates","wgCurRevisionId":871884043,"wgRevisionId":871884043,"wgArticleId":1175987,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Wikipedia indefinitely semi-protected pages","Articles containing Swedish-language text","Articles containing Norwegian-language text","Articles with hCards","Commons category link is locally defined","Commons category link is on Wikidata using P373","Featured lists","Nobel laureates","Lists of N

In [5]:
soup.find('table')

<table class="wikitable sortable">
<tbody><tr>
<th>Year
</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
</th>
<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
</th></tr>
<tr>
<td align="center">1901
</td>
<td><span data-sort-value="Röntgen, Wilh

In [6]:
soup.find('table', {"class" : "wikitable sortable"})

<table class="wikitable sortable">
<tbody><tr>
<th>Year
</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
</th>
<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
</th>
<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
</th>
<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
</th></tr>
<tr>
<td align="center">1901
</td>
<td><span data-sort-value="Röntgen, Wilh

In [7]:
soup.select("table.sortable.wikitable")

[<table class="wikitable sortable">
 <tbody><tr>
 <th>Year
 </th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
 </th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
 </th>
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
 </th>
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
 </th>
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
 </th>
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
 </th></tr>
 <tr>
 <td align="center">1901
 </td>
 <td><span data-sort-

In [8]:
table = soup.select_one("table.sortable.wikitable")

table.select('th')

[<th>Year
 </th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>
 </th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>
 </th>,
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>
 </th>,
 <th>Year
 </th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel

In [9]:
table = soup.select_one('table')

def get_column_titles(table):   #a fucntion to get header rows of nobel winners
    #note table.find_all('th')  == table.select('th') in this instance
    #only one uses the DOM to while the other uses css selectors
    cols=[]
    for th in table.select_one('tr').select('th')[1:]:

        link = th.select_one('a') #select the first link in the header column
        #Store the category name and any Wikipedia link it has
        if link: #get the link text and the link href link from attrs. You can aslo get the title using attrs
            cols.append({'name': link.text,
                             'href': link.attrs['href']})
        else:
            cols.append({'name':th.text, 'href':None})
    return cols

In [10]:
print(get_column_titles(table))

[{'name': 'Physics', 'href': '/wiki/List_of_Nobel_laureates_in_Physics'}, {'name': 'Chemistry', 'href': '/wiki/List_of_Nobel_laureates_in_Chemistry'}, {'name': 'Physiologyor Medicine', 'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine'}, {'name': 'Literature', 'href': '/wiki/List_of_Nobel_laureates_in_Literature'}, {'name': 'Peace', 'href': '/wiki/List_of_Nobel_Peace_Prize_laureates'}, {'name': 'Economics', 'href': '/wiki/List_of_Nobel_laureates_in_Economics'}]


In [11]:
def get_nobel_table(table):
    
    cols = get_column_titles(table)
    winners=[]
    
    for row in table.select('tr')[1:-1]: #a fucntion to get all data
        year = int((row.select_one('td').text)[:4])  #write the year from the first column
        
        for i, td in enumerate(row.select('td')[1:]): #iterate through every other column except the first
            
            for winner in td.select('a'): #select all tags in a column. Note: Some columns have more than one name/tag
                href = winner.attrs['href']  #select the link name to href
                
                if not href.startswith('#endnote'): #if the ref link is not an endnote then add the prize winner data
                    winners.append({'year':year,   
                                    'category':cols[i]['name'],
                                    'name': winner.text,
                                    'link': winner.attrs['href']
                                   })   
    return winners

In [12]:
winners = get_nobel_table(table)
winners

[{'category': 'Physics',
  'link': '/wiki/Wilhelm_R%C3%B6ntgen',
  'name': 'Wilhelm Röntgen',
  'year': 1901},
 {'category': 'Chemistry',
  'link': '/wiki/Jacobus_Henricus_van_%27t_Hoff',
  'name': "Jacobus Henricus van 't Hoff",
  'year': 1901},
 {'category': 'Physiologyor Medicine',
  'link': '/wiki/Emil_Adolf_von_Behring',
  'name': 'Emil Adolf von Behring',
  'year': 1901},
 {'category': 'Literature',
  'link': '/wiki/Sully_Prudhomme',
  'name': 'Sully Prudhomme',
  'year': 1901},
 {'category': 'Peace',
  'link': '/wiki/Henry_Dunant',
  'name': 'Henry Dunant',
  'year': 1901},
 {'category': 'Peace',
  'link': '/wiki/Fr%C3%A9d%C3%A9ric_Passy',
  'name': 'Frédéric Passy',
  'year': 1901},
 {'category': 'Physics',
  'link': '/wiki/Hendrik_Lorentz',
  'name': 'Hendrik Lorentz',
  'year': 1902},
 {'category': 'Physics',
  'link': '/wiki/Pieter_Zeeman',
  'name': 'Pieter Zeeman',
  'year': 1902},
 {'category': 'Chemistry',
  'link': '/wiki/Hermann_Emil_Fischer',
  'name': 'Hermann Emil F

### GETTING PERSONAL INFORMATION FROM INDIVIDUAL WINNER PAGES

In [13]:
def get_winner_country(winner): 
    
    person_data = {'name': winner['name']}
    
    personal_link = BASE_URL + winner['link']
    data = requests.get(personal_link)
    user_content = BeautifulSoup(data.content,"lxml")
    
    for tr in user_content.select_one('.infobox').select('tr'):
        try:
            attribute = tr.select_one('th').text
            if attribute == 'Nationality':
                person_data[attribute] = tr.select_one('td').text
        except AttributeError:
            pass
    return person_data

In [14]:
wdata = []
for winner in winners[:20]:
    wdata.append(get_winner_country(winner))
missing_nationality = []
for w in wdata:
    if not w.get('Nationality'):
        missing_nationality.append(w)
        
missing_nationality

[{'name': 'Élie Ducommun'},
 {'name': 'Charles Albert Gobat'},
 {'name': 'Marie Curie'},
 {'name': 'Niels Ryberg Finsen'}]

In [15]:
wdata

[{'Nationality': 'German[1]', 'name': 'Wilhelm Röntgen'},
 {'Nationality': 'Dutch', 'name': "Jacobus Henricus van 't Hoff"},
 {'Nationality': 'German', 'name': 'Emil Adolf von Behring'},
 {'Nationality': 'French', 'name': 'Sully Prudhomme'},
 {'Nationality': 'Swiss', 'name': 'Henry Dunant'},
 {'Nationality': 'French', 'name': 'Frédéric Passy'},
 {'Nationality': 'Netherlands', 'name': 'Hendrik Lorentz'},
 {'Nationality': 'Netherlands', 'name': 'Pieter Zeeman'},
 {'Nationality': 'Germany', 'name': 'Hermann Emil Fischer'},
 {'Nationality': 'British', 'name': 'Ronald Ross'},
 {'Nationality': 'German', 'name': 'Theodor Mommsen'},
 {'name': 'Élie Ducommun'},
 {'name': 'Charles Albert Gobat'},
 {'Nationality': 'French', 'name': 'Henri Becquerel'},
 {'Nationality': 'French', 'name': 'Pierre Curie'},
 {'name': 'Marie Curie'},
 {'Nationality': 'Swedish', 'name': 'Svante Arrhenius'},
 {'name': 'Niels Ryberg Finsen'},
 {'Nationality': 'Norwegian', 'name': 'Bjørnstjerne Bjørnson'},
 {'Nationality':

## PART ONE B: REWORKING SCRAPED DATA WITH SCRAPY

In [16]:
#importing scrapy into jupyter

import scrapy
from scrapy.crawler import CrawlerProcess 
from scrapy.utils.project import get_project_settings

class QuotesSpider(scrapy.Spider):
    name = "nobel_win"
    start_urls = [
        'https://en.wikipedia.org/wiki/List_of_Nobel_laureates_by_country'
    ]
    

process = CrawlerProcess({
    'USER_AGENT': 'User-Agent: Chrome/70.0.3538.110'
})

process.crawl(QuotesSpider)
process.start() 

2018-12-06 11:07:19 [scrapy.utils.log] INFO: Scrapy 1.5.1 started (bot: scrapybot)
2018-12-06 11:07:19 [scrapy.utils.log] INFO: Versions: lxml 3.7.3.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.5.1, w3lib 1.19.0, Twisted 18.9.0, Python 3.6.1 |Anaconda custom (64-bit)| (default, May 11 2017, 13:25:24) [MSC v.1900 64 bit (AMD64)], pyOpenSSL 17.0.0 (OpenSSL 1.0.2p  14 Aug 2018), cryptography 1.8.1, Platform Windows-10-10.0.17134-SP0
2018-12-06 11:07:19 [scrapy.crawler] INFO: Overridden settings: {'USER_AGENT': 'User-Agent: Chrome/70.0.3538.110'}
2018-12-06 11:07:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2018-12-06 11:07:20 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.D

In [17]:
import scrapy
import re
#Defining the data to be scraped
class NWinnerItem(scrapy.Item):
    country = scrapy.Field()
    name = scrapy.Field()
    link_text = scrapy.Field()

In [19]:
class NWinnerSpider(scrapy.Spider):
    """"Scrapes the country and link text of the Nobel-winners."""
    
    name = 'winners_list'
    allowed_domains = ['en.wikipedia.org']
    start_urls = [
        "http://en.wikipedia.org ... of_Nobel_laureates_by_country"
    ]
    
    def parse(self, response):
        h3s = response.xpath('//h3')
        
        for h3 in h3s: #find the h3 headers with user countries
            country = h3.xpath('span[@class="mw-headline"]/text()')[0].extract()
            winners = h3.xpath('following-sibling::ol[1]') #find the ordered list which is a sibling of said headers
            
            for winner in winners.xpath('li'): #search through the ordered list of winners
                text = winner.xpath('descendant-or-self::text()').extract() #extract winner data
                yield NWinnerItem(
                country = country[0], name = text[0],
                link_text = ' '.join(text)
                )

SyntaxError: invalid syntax (<ipython-input-19-d6ca4fc8ff32>, line 5)

## PART TWO: ANALYZING THE DATA

## PART THREE: VIZUALIZNG THE DATA