In [None]:
from IPython.core.display import HTML

In [None]:
import requests # comes with Anaconda - great for interacting with websites and APIs
from lxml import html # comes with Anaconda - great for parsing HTML
import zipfile, io, os # standard lib
import pandas as pd # awesome library | comes with Anaconda
import re # part of standard lib
import nameparser # you'll have to pip install nameparser

## Problem 1 -- Analyzing wealthy families

{GO OVER WEB SCRAPING FIRST}

You found a great article online and they were nice enough to put it in a table on their website. How do you easily get the data?

For something like this, you could easily just copy and paste into Excel, right? Well, kind of. For simple tables and one-off tasks, sure, that makes the most sense. No need to complicate gathering the data; you want that to be as efficient as possible. But let's say that there are 100 pages of these names (I wish!), or that you want to revisit them on a quarterly basis. Then doing the same thing and cleaning the same data in the same way each time doesn't make much sense when a couple of functions in Python can do it for you.

Or imagine you want to get the names, do whatever processing you want, add further context, compare against your database, and then store the data. Copying and pasting into Excel is just one step of that; what if you didn't have to do that step at all?

So, how should we think about this problem? What should our first step be? I know many of you, if not all of you, aren't familiar with Python, that's why you're in this session, but use what you do know to think about this. 

https://i.imgur.com/WRuJV6r.png

### Thought Process (very granular)
1. Do we have permission via the TOS (or rather are there any provisions forbidding access?) or the robots.txt?
2. For Wikipedia we do, as long as we're kind and don't hit their servers with too many requests too fast (this is common: basically, don't be a jerk and respect their servers).
3. We're building a scraper, so we need something to scrape. We have to get the data somehow. We do this with the requests library and specifically we have to use the .get method. 
4. Once we have the web data, we have to parse it using the lxml.html library. (There are a couple of others, notably BeautifulSoup.) See below for how that works, and read up on how to parse HTML with Python.

  * This is done by using selectors to extract bits of information.
    * Think of it as traversing a tree (because you are) so parent --> child --> grandchild
    * In trees, there are nodes, those nodes can have parents, siblings, children, grandchildren, and so on.
    * XPath (`//div` selects all div elements)
    * CSS selectors (`div`)
5. We have to analyze the HTML for how to best write the selectors. Try to find common attributes in the HTML to extract the elements you want. That could be in class|id|attr name. If not, you'll have to find the lowest order parent that still leads to all of the data points you want to extract and start there.

In [None]:
def getData():
    """Scrape the Wikipedia "List of wealthiest families" table.

    Downloads the page, selects the third table under the element with
    id ``mw-content-text`` and turns every ``<tr>`` that has ``<td>`` cells
    into a flat list.

    Returns:
        A list of rows. Each row is a list of strings: first the text of each
        cell, followed by ``link text, absolute URL`` pairs for every ``<a>``
        that is a direct child of a cell without ``<span>`` children.
        An empty list is returned when the table cannot be located.
    """
    # Local import so this cell stays self-contained.
    from urllib.parse import urljoin

    page_url = r'https://en.wikipedia.org/wiki/List_of_wealthiest_families'
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(page_url, timeout=30)
    print(r)  # scaffolding: shows the HTTP status of the response
    body = html.fromstring(r.text)
    tbl = body.xpath(r'//*[@id="mw-content-text"]/table[3]')
    if not tbl:
        # The selector found nothing (error page or changed layout); report it
        # instead of failing with an opaque IndexError on tbl[0].
        print('Could not find the families table at', page_url)
        return []

    rows = []  # one entry per parsed table row
    for tr in tbl[0].xpath('tr'):
        row = []   # cell texts for this row
        urls = []  # alternating link text / link URL found in this row
        for td in tr.xpath('td'):
            spans = td.xpath('span')
            if len(spans) > 1:
                # The value is read from the second span of the cell.
                # Requiring two spans avoids an IndexError on single-span cells.
                text = spans[1].text
                if text is None:
                    # .text is None when the span starts with a child element
                    text = spans[1].text_content()
                row.append(text)
            else:
                row.append(td.text_content())
                for h in td.xpath('a'):
                    href = h.get('href')
                    if href is None:
                        # anchors without an href have nothing to follow
                        continue
                    label = h.text
                    if label is None:
                        label = h.text_content()
                    urls.append(label)
                    # urljoin resolves site-relative links against the page and
                    # leaves links that are already absolute untouched.
                    urls.append(urljoin(page_url, href))

        if row:  # header rows have no <td> cells and are skipped
            row += urls
            rows.append(row)

    # another direction to take this is to parse out the list of family members to do more research on... like maybe
    # check if they're in your database...

    return rows
rows = getData()

In [None]:
# Re-run the scrape and rebind `rows`; note that the cell defining getData()
# already performs this same call, so running both fetches the page twice.
rows = getData()

In [None]:
def check_NU(url, timeout=30):
    """Count mentions of Northwestern University or Kellogg on a web page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a single unresponsive page would stall the
            whole run.

    Returns:
        The number of case-insensitive occurrences of
        "Northwestern University" or "Kellogg School of Management" in the
        response text.
    """
    r = requests.get(url, timeout=timeout)
    hits = re.findall(r'Northwestern University|Kellogg School of Management', r.text, re.I)
    return len(hits)

In [None]:
# For every scraped row, print its first cell and then check each URL stored in
# the row for Northwestern / Kellogg mentions, printing the ones with hits.
for row in rows:
    print(row[0])
    for element in row:
        # Rows mix cell text and URLs, and a scraped value may not be a string
        # (for example None), so guard before calling startswith.
        if isinstance(element, str) and element.startswith('htt'):
            hits = check_NU(element)
            if hits > 0:
                print('\t[{}]'.format(hits), element)