# Scraper

based on: https://gist.github.com/ajmendez/4536824

In [None]:
### imports
import urllib, json, pprint, re, datetime
import mwparserfromhell
import pandas as pd
import altair as alt

## Scraping Functions

In [None]:
### datetime match
def PatternMatch(inStr):
    """Convert a written-out date into an ISO ``YYYY-MM-DD`` string.

    The known ``strptime`` layouts are tried in order and the first one
    that parses ``inStr`` decides the result.

    Input:
      inStr : date text, e.g. "7 September 1805" or "September 7, 1805"
    Returns:
      the date formatted with "%Y-%m-%d", or None when no layout matches
    """
    for layout in ("%d/%m/%y", "%d %B %Y", "%B %d, %Y"):
        try:
            return datetime.datetime.strptime(inStr, layout).strftime("%Y-%m-%d")
        except ValueError:
            # this layout does not fit; move on to the next one
            continue
    return None

### testing: "7 September 1805" fits the "%d %B %Y" pattern -> "1805-09-07"
PatternMatch("7 September 1805")

In [None]:
### parsing the wiki date
def _parseDate(wikiDate):
  ''' Parse a mediawiki date value -- assumes year, month, day order
  Input:
    a mwparser parameter (anything with a .value) containing just the date
    to be parsed; a plain string of wiki markup is accepted as well
  Returns:
    datetime.date when the value is a template whose positional arguments
      1, 2, 3 are numbers, e.g. {{birth date|1879|3|14}}
    "YYYY-MM-DD" string (from PatternMatch) when the date is written out as
      text, either bare or as the first argument of a template
    None when none of the above could be parsed
  '''
  # Parameters carry their markup in .value; fall back to the object itself
  # so that a raw string can be passed in directly.
  template = mwparserfromhell.parse("%s"%getattr(wikiDate, 'value', wikiDate))

  try:
    dateTemplate = template.filter_templates()[0]
  except IndexError:
    # No {{...}} template in the value: treat it as plain text.
    print("Problem with date in template:\n",template)
    # trying alternate formats on the text before the first '('
    return PatternMatch(str(template).split('(')[0].strip())

  try:
    ymd = [int('%s'%dateTemplate.get(pos).value) for pos in (1,2,3)]
    return datetime.date(*ymd)
  except (ValueError, OverflowError):
    # A positional argument is missing or not a number, or the numbers do
    # not form a valid date -- fall through to the textual attempt below.
    pass

  # Some templates hold the whole date as text in their first argument,
  # e.g. {{d-da|November 15, 1928|September 25, 1843}}.
  try:
    firstArg = ('%s'%dateTemplate.get(1).value).strip()
  except ValueError:
    # the template has no positional arguments at all
    return None
  return PatternMatch(firstArg)

### testing
# _parseDate("death_date        = {{d-da|November 15, 1928|September 25, 1843}}".split('=')[-1].strip())

In [None]:
### parsing wikipage infobox
def _firstDate(template, prefix):
  '''Return the first parseable "<prefix> date" parameter of template, else None.'''
  # Infoboxes spell the key in several ways: birth_date, Birth date, birth-Date, ...
  keys = [x+y+z for x in [prefix, prefix.capitalize()] for y in ['_',' ','-'] for z in ["date","Date"]]
  for key in keys:
    try:
      found = _parseDate(template.get(key))
    except ValueError:
      # the template has no parameter with this spelling
      continue
    if found is not None:
      return found
  return None


def _parseInfobox(page):
  '''Parse out the nice mediawiki markdown to get birth and death
  Input:
    mediawiki unicode page string
  Returns:
    a dictionary with the keys 'name', 'birth_date' and 'death_date'
    (in that order), taken from the first template whose name contains
    "Infobox"/"infobox". 'name' is the stripped name (or birth_name)
    parameter, or None when neither is present. The dates are whatever
    _parseDate returns, or None when no date parameter could be parsed.
  Raises:
    ValueError('Missing InfoBox') when the page has no infobox template.
    Any exception is reported with print and re-raised.
  '''
  try:
    code = mwparserfromhell.parse(page)
    for template in code.filter_templates():
      templateName = str(template.name)
      if 'Infobox' not in templateName and 'infobox' not in templateName:
        continue

      # Found the right template -- attempting to extract data
      output = {'name': None}
      for nm in ['name','birth_name']:
        try:
          output['name'] = ("%s"%template.get(nm).value).strip()
          break
        except ValueError:
          # parameter not present; try the next candidate
          pass

      output['birth_date'] = _firstDate(template, 'birth')
      output['death_date'] = _firstDate(template, 'death')

      if output['death_date'] is None:
        print("none!!!")
        # Only show the raw value when the parameter exists: template.get
        # raises ValueError for a missing parameter (e.g. a living person),
        # which would otherwise abort the whole parse.
        if template.has('death_date'):
          print(template.get('death_date'))

      # ok we are done here
      return output

    raise ValueError('Missing InfoBox')

  except Exception:
    print("Failed to parse find infobox or something else")
    raise



In [None]:
def wikiAge(wikiTitle, function=None):
  ''' Fetch the markup of a wikipedia page and run a function on it
  Input:
    wikiTitle : Title of a wiki page for an individual with born and died date
                (it is percent-encoded here; an already encoded title is
                left as it is)
    function : a python function which operates on a mediawikipage (required)
  Output:
    Whatever function returns; with _parseInfobox this is a
    Person Dictionary with ['name', 'birth_date', 'death_date']
  Raises:
    TypeError when function is not callable (checked before any request).
    Errors while reading or processing the page are reported with print
    and re-raised.

  Example:
    person = wikiAge('Albert_Einstein', function=_parseInfobox)
    assert person['name'] == 'Albert Einstein'
    assert person['birth_date'] == datetime.date(1879, 3, 14) # '14 March 1879'
    assert person['death_date'] == datetime.date(1955, 4, 18) # '18 April 1955'
  '''
  # A bare `import urllib` does not guarantee that these submodules are loaded.
  import urllib.parse
  import urllib.request

  if not callable(function):
    raise TypeError("wikiAge needs a callable `function` to process the page")

  URLTEMPLATE = 'http://en.wikipedia.org/w/api.php?format=json&action=query&titles=%s&prop=revisions&rvprop=content'
  # Encode the title so accented names and characters such as '&' are valid
  # inside the query string; '%' stays untouched so that titles which are
  # already percent-encoded are not encoded twice.
  url = URLTEMPLATE%(urllib.parse.quote(wikiTitle, safe='%'))

  # Attempt to read page otherwise error out on all errors
  try:
    with urllib.request.urlopen(url) as response:
      pageJson = response.read()
  except Exception:
    print("Failed to Read page: %s"%url)
    raise

  # Now that we have some json Data
  try:
    # The data is three dictionaries deep; the single entry below
    # query -> pages is keyed by the page id, which we do not need.
    pages = json.loads(pageJson)['query']['pages']
    page = list(pages.values())[0]['revisions'][0]['*']
    # Page should now contain the mediawiki unicode markup text
    # runs function to try to grab what you want out of it
    return function(page)

  except Exception:
    print('Failed to process Page -- Probably means that the wiki page was missing something important')
    raise


## Names

In [None]:
### list of names
### TODO sort into useful sets
# Page titles to scrape, grouped by period label; the label ends up in the
# 'period' field of each scraped record.
nameList = {
    'older': [
        "Martin Luther",
        "James Ussher",
        "Gottfried Wilhelm Leibniz",
        # "René Descartes",
        "Thomas Burnet",  # bad format
        "Nicolas Steno",
        # "John Woodward (naturalist)",  # bad format
        "William Whiston",
    ],
    'middle': [
        "Immanuel Kant",
        "Pierre-Simon Laplace",
        "Georges-Louis Leclerc, Comte de Buffon",
        "Robert Hooke",
        # "Benoît de Maillet",
        "James Hutton",
        "Georges Cuvier",
        "Jean-Baptiste Lamarck",
        "Richard Kirwan",
        # "Sir James Hall, 4th Baronet",  # bad format
        "Abraham Gottlob Werner",
        "William Buckland",
        "George Julius Poulett Scrope",
    ],
    'newer': [
        "Louis Agassiz",
        "William Whewell",
        "Archibald Geikie",
        "Thomas Chrowder Chamberlin",
        "Julius von Mayer",
        "James Prescott Joule",
        "Rudolf Clausius",
        "Lord Kelvin",
        "Thomas Henry Huxley",
        "Samuel Wilberforce",
        "Charles Lyell",
        "Charles Darwin",
        "James Croll",
        "Albert Einstein",
    ],
}


In [None]:
### loop over names
data=[]
# one record per listed person, tagged with the period it is listed under
for period, names in nameList.items():
    for name in names:
        print(f"### {name}")
        # page titles use underscores where the list uses spaces
        person = wikiAge(name.replace(' ','_'), function=_parseInfobox)

        for key, value in person.items():
            print('Key:%s  Value: %s'%(key,value))

        # fall back to the listed name when the infobox supplied none
        if person['name'] is None:
            person['name']=name
        person['period']=period

        data.append(person)


In [None]:
### make table + add missing
df_data=pd.DataFrame(data)
# Death dates missing from the scraped data, added by hand.
manualDeathDates = {
    'Archibald Geikie': "1924-11-10",
    'Thomas Chrowder Chamberlin': "1928-11-15",
}
for personName, deathDate in manualDeathDates.items():
    # A single .loc call selects rows and column together. The chained form
    # df_data['death_date'][mask] = ... may assign into a temporary object
    # and leave df_data unchanged (SettingWithCopyWarning / Copy-on-Write).
    df_data.loc[df_data['name'] == personName, 'death_date'] = deathDate
# every column is converted to plain strings
df_data=df_data.astype(str)
df_data

In [None]:
### plotting
# One bar per name, running from birth date (x) to death date (x2) and
# coloured by period in the fixed order older -> middle -> newer.
encodings = dict(
    x=alt.X('birth_date:T', title="Date"),
    x2=alt.X2('death_date:T', title=None),
    y=alt.Y('name:N', title="Name"),
    color=alt.Color('period:N', sort=['older','middle','newer']),
    tooltip=['name:N','birth_date:T','death_date:T','period:N'],
)
alt.Chart(df_data).mark_bar().encode(**encodings).properties(title="Who's Who")

### old code

In [None]:
# Fetch a few individual pages and dump every field of the parsed record.
for title in ('Albert_Einstein', 'Galileo_Galilei', 'Mark_Zuckerberg'):
  person = wikiAge(title, function=_parseInfobox)
  for key, value in person.items():
    print('Key:%s  Value: %s'%(key,value))