# Web Scraping

This work is based on a demo by Pohan on youtube.com.  The HTML content is extracted with an ElementTree-style API (here via the `lxml` package).  A newer approach is to use the BeautifulSoup package.

Web scraping is one of the most common methods of collecting data.


## Prerequisites

In [91]:
from datetime import datetime
from lxml import html
import requests, numpy as np, pandas as pd
import matplotlib.pylab as plt

# Let pandas display up to 50 columns before truncating wide frames.
pd.set_option('display.max_columns', 50)

## Data and Data Source
Working with the Nobel Prize data from Wikipedia.

## Before Web Scrape
Read the Terms of Service of the website being scraped.

### Basic Rules
* check if there is an API -> use it -> it makes things easier
* don't scrape too much in a short time -> it slows down the servers and might get you banned from the website
* never scrape anything private
* check /robots.txt for allowed paths

## Fetch page and build HTML tree

In [92]:
def print_element(element):
    """Print a one-line preview of an HTML element.

    The line shows the element's tag, its attribute mapping, and the first
    200 characters of its text content with newlines replaced by spaces.
    """
    snippet = element.text_content()[:200]
    snippet = snippet.replace('\n', ' ')
    template = "<{} {}> {} ..."
    print(template.format(element.tag, element.attrib, snippet))

In [81]:
# Download the page and parse it into an element tree for further processing.
# The timeout stops the request from hanging indefinitely on a stalled
# connection, and raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be parsed as if it were the data.
page = requests.get('https://en.wikipedia.org/wiki/List_of_Nobel_laureates', timeout=30)
page.raise_for_status()
tree = html.fromstring(page.text)
# print_element(tree)

## Locate the table

### Find all tables

In [82]:
# Gather every <table> element in the parsed page so the candidates can be inspected.
table_query = '//table'
tables = tree.xpath(table_query)
# Uncomment to preview each candidate table:
# for table in tables:
#     print_element(table)

# The preview gives the information needed to pick the right table:
# <table {'class': 'wikitable sortable'}>   Year Physics Chemistry Physiology or Medicine Literature Peace Economics

When locating the table, watch out for client-side JavaScript alterations to the HTML code if a JavaScript-related class exists.

In [83]:
# Select the tables whose class attribute is exactly "wikitable sortable"
# and keep the first match.
matching_tables = tree.xpath('//table[@class="wikitable sortable"]')
table = matching_tables[0]
# print_element(table)

## Extract the subjects & years

In [84]:
# Direct <tr> children of the table: the first row is the header, the rest are data.
rows = table.xpath('tr')

# Subject names come from the header cells after the first ("Year") column;
# newlines inside a header are flattened to spaces.
header_cells = rows[0][1:]
subjects = [cell[0].text_content().replace('\n', ' ') for cell in header_cells]
# print(subjects)

# Year labels come from the first cell of every row between the header and the last row.
years = [row[0].text for row in rows[1:-1]]
# print(years)

## Extract Winner Data

### Testing for single data

In [85]:
# Dry run on the first data row only: walk its subject cells (skipping the
# leading year cell) and read each linked winner's name and URL.
first_data_row = table.xpath('tr')[1]
for col, cell in enumerate(first_data_row[1:]):
    subject = subjects[col]
    # print(subject)
    winner_links = cell.xpath('span[@class="vcard"]/span/a')
    for link in winner_links:
        winner_name = link.attrib['title']
        winner_url = link.attrib['href']
        # print(' - {}'.format(winner_name))

## Extract the complete table

In [86]:
# Flatten the whole table into four parallel lists with one entry per winner:
# the year, the subject, the winner's name and the winner's link target.
year_list = []
subject_list = []
name_list = []
url_list = []

# The row query does not depend on the year, so evaluate it once instead of
# re-running the XPath for every iteration of the outer loop.
rows = table.xpath('tr')
for y_index, year in enumerate(years):
    # Row 0 is the header, so the data for years[y_index] is in row
    # y_index + 1; the first cell of a row holds the year and is skipped.
    for index, item in enumerate(rows[y_index + 1][1:]):
        subject = subjects[index]
        # print(subject)
        for winner in item.xpath('span[@class="vcard"]/span/a'):
            winner_name = winner.attrib['title']
            winner_url = winner.attrib['href']
            # print(' - {}'.format(winner_name))
            year_list.append(year)
            subject_list.append(subject)
            name_list.append(winner_name)
            url_list.append(winner_url)

# print(year_list)
# print(subject_list)
# print(name_list)
# print(url_list)

## Post Processing in Pandas

In [87]:
# Build the data frame from the lists extracted from the Nobel winner table,
# one row per winner, then store the year as a 32-bit integer.
data_set = pd.DataFrame({
    'winner_name': name_list,
    'subject': subject_list,
    'year': year_list,
    'url': url_list,
})
data_set['year'] = data_set['year'].astype(np.int32)
# data_set.head(5)

### Looking at data

In [88]:
# Count the rows (winners) per year and order the result by year.
# Despite the `_df` suffix, the result of value_counts() is a Series.
per_year_counts = data_set['year'].value_counts()
years_df = per_year_counts.sort_index()
# print(years_df)

## Post Processing in Pandas

### Number of Prizes per year

In [94]:
# Line chart of the number of winners recorded for each year.
# plt.figure (lowercase) creates a figure and makes it current, so figsize
# applies to the plot below; plt.Figure only instantiated a Figure object
# that pyplot never used.
plt.figure(figsize=(15, 5))
plt.plot(years_df.index, years_df.values, linewidth=2, alpha=.6)
plt.grid()
plt.xlabel("Year")
plt.ylabel("Number of Prizes")
plt.show()

In [97]:
# Bar chart of the distribution of yearly counts: x is a per-year winner
# count and the bar height is the number of years with that count.
# The counts are computed once and reused for both the x positions and heights.
count_frequency = years_df.value_counts()
plt.bar(count_frequency.index, count_frequency)
# Pass a real boolean to hide the axes frame: the string 'off' is truthy,
# so plt.box(on='off') does not switch the frame off.
plt.box(False)
plt.grid()
plt.xlabel('Numberof Nobel Prizes / Year')
plt.ylabel(' ')
plt.show()

## By Subject

In [129]:
# Cumulative number of winners over time, one line per subject.
# plt.figure (lowercase) is required for figsize to take effect; plt.Figure
# only built a Figure object that pyplot never drew on.
plt.figure(figsize=(13, 5))
for subject in subjects:
    # Per-year counts for this subject, ordered by year, accumulated into a running total.
    df = data_set[data_set['subject'] == subject]['year'].value_counts().sort_index().cumsum()
    plt.plot(df.index, df, label=subject, linewidth=2, alpha=.6)

plt.grid()
plt.legend(loc='best')
plt.xlabel('year')
plt.ylabel('Cumulative Sum of Given Nobel Prizes')
plt.xticks(np.arange(1900, 2020, 10))

plt.show()

## The Effects of WWI and WWII

In [131]:
# Cumulative number of winners per subject for years before 1950, with the
# two world-war periods shaded.
# plt.figure (lowercase) is required for figsize to take effect; plt.Figure
# only built a Figure object that pyplot never drew on.
plt.figure(figsize=(13, 5))

# The year filter is the same for every subject, so build it once.
before_1950 = data_set['year'].astype(np.int32) < 1950
for subject in subjects:
    df = data_set[(data_set['subject'] == subject) & before_1950
                  ]['year'].value_counts().sort_index().cumsum()
    plt.plot(df.index, df, label=subject, linewidth=2, alpha=.6)

plt.grid()
plt.legend(loc='best')
plt.xlabel('Year')
plt.ylabel('Cumulative Sum of Given Nobel Prizes')
plt.xticks(np.arange(1900, 1950, 5))

gca = plt.gca()

# Shaded bands: x from 1914 spanning 4 years, and x from 1939 spanning 6 years.
gca.add_patch(plt.Rectangle((1914, 0), 4, 60, alpha=.3, color='orange'))
gca.add_patch(plt.Rectangle((1939, 0), (45 - 39), 60, alpha=.3, color='orange'))

# Pass the label positionally: the `s` keyword was renamed to `text` in
# matplotlib 3.3, and the positional form works with either name.
plt.annotate('WWI', xy=(1915, 55))
plt.annotate('WWII', xy=(1945, 55))

plt.show()