# Scraper

based on: https://gist.github.com/ajmendez/4536824

In [10]:
### imports
import urllib, json, pprint, re, datetime
import mwparserfromhell
import pandas as pd


## Scraping Functions

In [3]:
### datetime match
def PatternMatch(inStr):
    """Turn a free-text date into a ``YYYY-MM-DD`` string.

    The text is tried against each supported layout in order and the first
    layout that parses wins:
      * ``%d/%m/%y``  -- e.g. 07/09/05
      * ``%d %B %Y``  -- e.g. 7 September 1805
      * ``%B %d, %Y`` -- e.g. September 7, 1805

    Input:
      inStr : the date text to interpret
    Returns:
      the date formatted with "%Y-%m-%d", or None when no layout matches
    """
    for layout in ("%d/%m/%y", "%d %B %Y", "%B %d, %Y"):
        try:
            return datetime.datetime.strptime(inStr, layout).strftime("%Y-%m-%d")
        except ValueError:
            # text does not fit this layout -- move on to the next one
            continue
    return None

### testing
PatternMatch("7 September 1805")

'1805-09-07'

In [4]:
### parsing the wiki date
def _parseDate(wikiDate):
  ''' Parse the date held in one infobox parameter
  Two shapes of value are understood:
    * a nested date template whose first three positional parameters are
      year, month, day
    * plain text such as "10 November 1483 (aged 62)", which is handed to
      PatternMatch after dropping everything from the first "(" onwards
  Input:
    a mwparser parameter object (anything with a .value) holding just the
    date to be parsed
  Returns:
    datetime.date when a nested template was parsed,
    a "YYYY-MM-DD" string (or None) from PatternMatch when the value is
    plain text,
    None when a nested template exists but its first three parameters are
    missing, non-numeric or not a valid date
  '''
  template = mwparserfromhell.parse("%s"%wikiDate.value)
  try:
    # IndexError here means the value contains no nested template at all
    dateTemplate = template.filter_templates()[0]
    d = [int('%s'%dateTemplate.get(pos).value) for pos in (1,2,3)]
    return datetime.date(*d)
  except IndexError:
    print("Problem with date in template:\n",template)
    # trying alternate formats: plain-text date, optionally followed by a
    # parenthesised remark that is cut off before matching
    d = PatternMatch(template.__str__().split('(')[0].strip())
    return d
  except (ValueError, TypeError):
    # Missing positional parameter, non-integer text, or an impossible
    # calendar date. Only these are treated as "no date"; anything else
    # (including KeyboardInterrupt) is allowed to propagate.
    return None

### testing
# _parseDate("death_date        = {{d-da|November 15, 1928|September 25, 1843}}".split('=')[-1].strip())

In [5]:
### parsing wikipage infobox
def _findInfoboxDate(template, event):
  '''Return the first parseable "<event> date" parameter of an infobox
  Input:
    template : the infobox template to search
    event : lower-case prefix of the parameter, e.g. "birth" or "death"
  Returns:
    whatever _parseDate yields for the first spelling variant that is
    present and parseable; None when no variant qualifies
  '''
  # spelling variants: birth_date, birth_Date, birth date, ..., Birth-Date
  variants = [x+y+z for x in [event, event.capitalize()] for y in ['_',' ','-'] for z in ["date","Date"]]
  for key in variants:
    try:
      parsed = _parseDate(template.get(key))
    except ValueError:
      # parameter not present under this spelling
      continue
    if parsed is not None:
      return parsed
  return None


def _parseInfobox(page):
  '''Parse out the nice mediawiki markdown to get birth and death
  Only the first template whose name contains "Infobox" or "infobox" is
  examined.
  Input:
    mediawiki unicode page string
  Returns:
    a dictionary with
      name       : stripped "name" (or else "birth_name") parameter, None
                   when neither is present
      birth_date : result of _parseDate for the birth date, or None
      death_date : result of _parseDate for the death date, or None
  Raises:
    ValueError('Missing InfoBox') when the page has no infobox template;
    any other parsing error is reported on stdout and re-raised
  '''
  try:
    code = mwparserfromhell.parse(page)
    for template in code.filter_templates():
      if 'Infobox' in template.name or 'infobox' in template.name:
        # Found the right template -- attempting to extract data
        output = {}
        for nm in ['name','birth_name']:
          try:
            output['name'] = "%s"%template.get(nm).value
            break
          except ValueError:
            output['name'] = None
        if output['name'] is not None:
          output['name'] = output['name'].strip()

        ### birth info.
        output['birth_date'] = _findInfoboxDate(template, 'birth')

        ### death info.
        output['death_date'] = _findInfoboxDate(template, 'death')
        if output['death_date'] is None:
          print("none!!!")
          # Diagnostics only. The parameter may be absent altogether, in
          # which case an unguarded template.get would raise ValueError
          # and abort the parse instead of returning death_date=None.
          if template.has('death_date'):
            print(template.get('death_date'))
            print(_parseDate(template.get('death_date')))

        # ok we are done here
        return output

    raise ValueError('Missing InfoBox')

  except Exception:
    print("Failed to parse find infobox or something else")
    raise



In [6]:
def wikiAge(wikiTitle, function=None):
  ''' Fetch the markup of a wikipedia page and run a function on it
  Input:
    wikiTitle : Title of a wiki page for an individual with born and died
                date. The title is percent-encoded before it is placed in
                the request URL, so non-ASCII titles can be passed as-is.
    function : a python function which operates on a mediawikipage; when
               None the raw mediawiki markup is returned instead
  Output:
    The return value of function -- with function=_parseInfobox a Person
    Dictionary with ['name', 'birth_date', 'death_date']
  Raises:
    Any error raised while fetching the page, decoding the JSON reply or
    running function is reported on stdout and re-raised unchanged.

  Example:
    person = wikiAge('Albert_Einstein', function=_parseInfobox)
    assert person['name'] == 'Albert Einstein'
    assert person['birth_date'] == datetime.date(1879, 3, 14) # '14 March 1879'
    assert person['death_date'] == datetime.date(1955, 4, 18) # '18 April 1955'
  '''
  # A bare `import urllib` does not load the submodules used below.
  import urllib.parse
  import urllib.request

  URLTEMPLATE = 'http://en.wikipedia.org/w/api.php?format=json&action=query&titles=%s&prop=revisions&rvprop=content'
  # '%' is kept literal so titles that a caller already percent-encoded are
  # not encoded a second time.
  url = URLTEMPLATE%(urllib.parse.quote("%s"%wikiTitle, safe='/%'))

  # Attempt to read page otherwise error out on all errors
  try:
    # read the whole body (not just its first line) and close the response
    with urllib.request.urlopen(url) as response:
      pageJson = response.read()
  except Exception:
    print("Failed to Read page: %s"%(url) )
    raise

  # Now that we have some json Data
  try:
    page = json.loads(pageJson)
    # The data is three dictionaries deep:
    # Ignoring the extra data
    page = page['query']['pages']
    pageid = list(page.keys())[0]
    page = page[pageid]['revisions'][0]['*']
    # Page should now contain the mediawiki unicode markup text
    if function is None:
      return page
    # runs function to try to grab what you want out of it
    return function(page)

  except Exception:
    print('Failed to process Page -- Probably means that the wiki page was missing something important')
    raise


## List of Names

In [55]:
### list of names
### TODO sort into useful sets
# Wikipedia page titles to scrape, grouped under the period label that the
# scraping loop stores with each record. Entries left commented out are
# skipped; the ones tagged "bad format" have dates the parser cannot read.
nameList = {
    'older': [
        "Martin Luther",
        "James Ussher",
        "Gottfried Wilhelm Leibniz",
        # "René Descartes",
        "Thomas Burnet",  # bad format
        "Nicolas Steno",
        # "John Woodward (naturalist)",  # bad format
        "William Whiston",
    ],
    'middle': [
        "Immanuel Kant",
        "Pierre-Simon Laplace",
        "Georges-Louis Leclerc, Comte de Buffon",
        "Robert Hooke",
        # "Benoît de Maillet",
        "James Hutton",
        "Georges Cuvier",
        "Jean-Baptiste Lamarck",
        "Richard Kirwan",
        # "Sir James Hall, 4th Baronet",  # bad format
        "Abraham Gottlob Werner",
        "William Buckland",
        "George Julius Poulett Scrope",
    ],
    'newer': [
        "Louis Agassiz",
        "William Whewell",
        "Archibald Geikie",
        "Thomas Chrowder Chamberlin",
        "Julius von Mayer",
        "James Prescott Joule",
        "Rudolf Clausius",
        "Lord Kelvin",
        "Thomas Henry Huxley",
        "Samuel Wilberforce",
        "Charles Lyell",
        "Charles Darwin",
        "James Croll",
        "Albert Einstein",
    ],
}


In [50]:
# Alternative name set: geologists only, all under a single 'geologists' key.
# NOTE(review): this assignment rebinds nameList, so when the notebook is run
# top to bottom it replaces the period-keyed dictionary defined in the cell
# above and the scraping loop only sees these names. The stored output was
# produced from the period-keyed list -- confirm which set is intended.
# Entries left commented out are skipped.
nameList = {
    'geologists': [
        "Ulisse Aldrovandi",
        "Georges-Louis Leclerc, Comte de Buffon",
        "James Hutton",
        "Abraham Gottlob Werner",
        "Georges Cuvier",
        "Mary Anning",
        "William Buckland",
        "Gideon Mantell",
        "William Smith (geologist)",
        "George Bellas Greenough",
        "Charles Lyell",
        "Louis Agassiz",
        "Giovanni Arduino (geologist)",
        "Johann Gottlob Lehmann (scientist)",
        "Alexander von Humboldt",
        "Christian Leopold von Buch",
        # "Jean Baptiste Julien d'Omalius d'Halloy",
        # "William Phillips (geologist)",
        "William Conybeare (geologist)",
        "Adam Sedgwick",
        "Roderick Murchison",
        # "Charles Cadworth",
        # "Friedrich August von Alberti"
    ],
}

## Get Data

In [56]:
### loop over names
# One record per scraped person; each record is the dictionary returned by
# _parseInfobox plus the 'period' key of the list the name came from.
data=[]
for period, names in nameList.items():
    for name in names:
        print(f"\n### {name}")
        # page titles use underscores where the display name has spaces
        person = wikiAge(name.replace(' ','_'), function=_parseInfobox)

        # echo every scraped field
        for key, value in person.items():
            print('Key:%s  Value: %s'%(key,value))

        # fall back to the list entry when the infobox carried no name
        if person['name'] is None:
            person['name']=name

        person['period']=period

        data.append(person)



### Martin Luther
Problem with date in template:
  10 November 1483

Key:name  Value: Martin Luther
Key:birth_date  Value: 1483-11-10
Key:death_date  Value: 1546-02-18

### James Ussher
Problem with date in template:
  4 January 1581

Key:name  Value: James Ussher
Key:birth_date  Value: 1581-01-04
Key:death_date  Value: 1656-03-21

### Gottfried Wilhelm Leibniz
Problem with date in template:
  1 July 1646

Key:name  Value: None
Key:birth_date  Value: 1646-07-01
Key:death_date  Value: 1716-11-14

### Thomas Burnet
Problem with date in template:
  ''c.'' 1635 

Problem with date in template:
  27 September 1715 (aged ''c.'' 80) 

Key:name  Value: Thomas Burnet
Key:birth_date  Value: None
Key:death_date  Value: 1715-09-27

### Nicolas Steno
Key:name  Value: Niels Steensen
Key:birth_date  Value: 1638-01-01
Key:death_date  Value: 1686-11-25

### William Whiston
Key:name  Value: William Whiston
Key:birth_date  Value: 1667-12-09
Key:death_date  Value: 1752-08-22

### Immanuel Kant
none!!!
 d

In [57]:
### make table + add missing
df_data=pd.DataFrame(data)
# Hand-entered death dates for two people.
# .loc writes into df_data itself; the chained form df_data[col][mask] = value
# assigns through an intermediate Series, which pandas warns about and which
# does not reach the frame when Copy-on-Write is active.
df_data.loc[df_data["name"] == 'Archibald Geikie', 'death_date'] = "1924-11-10"
df_data.loc[df_data["name"] == 'Thomas Chrowder Chamberlin', 'death_date'] = "1928-11-15"
# Store every column as text.
# NOTE(review): astype(str) also stringifies missing values -- confirm that is
# what the saved CSV should contain.
df_data = df_data.astype(str)
df_data

Unnamed: 0,name,birth_date,death_date,period
0,Martin Luther,1483-11-10,1546-02-18,older
1,James Ussher,1581-01-04,1656-03-21,older
2,Gottfried Wilhelm Leibniz,1646-07-01,1716-11-14,older
3,Thomas Burnet,,1715-09-27,older
4,Niels Steensen,1638-01-01,1686-11-25,older
5,William Whiston,1667-12-09,1752-08-22,older
6,Immanuel Kant,1724-04-22,,middle
7,Pierre-Simon Laplace,1749-03-23,1827-03-05,middle
8,"Georges-Louis Leclerc,<br>Comte de Buffon",1707-09-07,1788-04-16,middle
9,Robert Hooke,1635-07-18,,middle


### Save Data

In [58]:
### Save Data to ../data
saveName="agents"

# make sure the file name ends in .csv before writing next to the other data
if not saveName.endswith(".csv"):
    saveName += ".csv"
df_data.to_csv("../data/"+saveName, index=False)