## Scrape Wikipedia

In [1]:
#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# vim:fenc=utf-8

"""
Scrape Wikipedia pages and infoboxes for a list of organizations.

The helpers defined in this cell look up a page for each organization through
the ``wikipedia`` package and parse the page's infobox table with
``pandas.read_html``.
"""

import wikipedia
from lxml import html, etree
import pandas as pd
import pandas.io.sql as pd_sql
import numpy as np
import sqlite3
import re

# Turn on the wikipedia client's request rate limiting before any lookups run.
wikipedia.set_rate_limiting(True)

def find_page(org_row):
    """Look up the Wikipedia page for the top search hit of an organization.

    Parameters
    ----------
    org_row : mapping or pandas.Series
        Row describing one organization; only ``org_row['name']`` is read.

    Returns
    -------
    wikipedia.WikipediaPage or str
        The page of the first search hit, or one of the sentinel strings
        ``'DisambiguationError'``, ``'PageError'`` or ``'Error'`` when the
        lookup fails. ``'Error'`` covers every other failure, including a
        search that yields no hits at all.
    """
    try:
        hits = wikipedia.search(org_row['name'], results=1)
        if not hits:
            # Nothing matched the name, so there is no title to fetch.
            return 'Error'
        return wikipedia.page(hits[0])
    except wikipedia.exceptions.DisambiguationError:
        return 'DisambiguationError'
    except wikipedia.exceptions.PageError:
        return 'PageError'
    except Exception:
        # Deliberately broad so one bad lookup does not abort the whole scrape,
        # but KeyboardInterrupt/SystemExit still propagate so the run can be
        # stopped by the user.
        return 'Error'

def add_page(org_row):
    """Return the page for one organization, reusing an already stored one.

    Parameters
    ----------
    org_row : mapping or pandas.Series
        Row with a ``'name'`` entry and, optionally, a ``'page'`` entry holding
        the result of an earlier lookup.

    Returns
    -------
    object
        The stored ``org_row['page']`` when it is present and not null
        (sentinel error strings count as stored results and are not retried);
        otherwise whatever ``find_page(org_row)`` returns.
    """
    stored = org_row['page'] if 'page' in org_row else None
    # A 'page' column can exist while this row's cell is still empty
    # (None or NaN); NaN is the only float that is not equal to itself.
    is_missing = stored is None or (isinstance(stored, float) and stored != stored)
    if not is_missing:
        print('Skipping {}'.format(org_row['name']))
        return stored
    print('Looking for {}'.format(org_row['name']))
    return find_page(org_row)
    
def get_infobox(series):
    """Parse the infobox of a row's Wikipedia page into a key/value table.

    Parameters
    ----------
    series : mapping or pandas.Series
        Row whose ``'page'`` entry is expected to offer an ``html()`` method.

    Returns
    -------
    pandas.DataFrame or None
        The first table whose class attribute is ``'infobox vcard'``, with a
        single ``'value'`` column indexed by ``'key'``. ``None`` when the row
        has no usable page, no such table is found, or the table does not have
        exactly two columns.
    """
    from io import StringIO  # local import keeps the helper self-contained

    try:
        markup = series['page'].html()
        # Hand pandas a file-like object: passing literal HTML text straight to
        # read_html is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(markup), attrs={'class': 'infobox vcard'})
        infobox = tables[0]
        infobox.columns = ['key', 'value']
        return infobox.set_index('key')
    except Exception:
        # Best effort: any parsing or lookup failure means "no infobox".
        # KeyboardInterrupt is not an Exception, so the loop stays stoppable.
        return None

    
def get_row_attr(row, attr):
    """Read attribute ``attr`` from the object stored under ``row['page']``.

    Returns ``None`` when the row has no ``'page'`` entry or when reading the
    attribute raises ``KeyError`` or ``AttributeError``.
    """
    try:
        page = row['page']
        value = getattr(page, attr)
    except (AttributeError, KeyError):
        value = None
    return value

In [28]:
# Load the position counts, order organizations from most to fewest positions,
# and keep one row per organization name (the highest-count row, since
# drop_duplicates keeps the first occurrence after the sort).
orgs = (
    pd.read_csv('~/projects/group-positions/position_counts.csv')
    .sort_values('count', ascending=False)
    .drop_duplicates('name')
)
# orgs = orgs[1:10]
orgs.head()

Unnamed: 0,name,count,topcode
0,U.S. Chamber of Commerce,625,Other
3,National Association of Manufacturers,380,Other
8,AFL-CIO,361,Other
10,National Education Association,359,Education
12,Public Citizen,242,Other


In [29]:
# Look up a Wikipedia page for every organization (network calls for each row).
# NOTE: the 'page' column is only assigned once apply() has processed every
# row, so interrupting this cell leaves `orgs` without a 'page' column and
# none of the pages fetched so far are kept.
orgs['page'] = orgs.apply(add_page, axis=1)

Looking for U.S. Chamber of Commerce
Looking for National Association of Manufacturers
Looking for AFL-CIO
Looking for National Education Association
Looking for Public Citizen
Looking for National Taxpayers Union
Looking for Sierra Club
Looking for American Federation of State, County and Municipal Employees
Looking for American Civil Liberties Union
Looking for Americans for Tax Reform
Looking for Natural Resources Defense Council


KeyboardInterrupt: 

In [30]:
# Index the frame by organization name.
orgs = orgs.set_index(orgs.name)
# Only the last expression of a cell is rendered, so the head()/tail() calls
# below produce no visible output.
orgs.head()
orgs.tail()
# Requires the 'page' column created by the scraping cell.
orgs.page.head()

AttributeError: 'DataFrame' object has no attribute 'page'

In [None]:
# Copy each page's URL and summary into their own columns; get_row_attr
# returns None for rows whose 'page' does not provide the attribute.
# apply() forwards the extra keyword argument to get_row_attr for every row.
orgs['url'] = orgs.apply(get_row_attr, axis=1, attr='url')
orgs['summary'] = orgs.apply(get_row_attr, axis=1, attr='summary')

In [None]:
# Copy each page's links and references into their own columns; get_row_attr
# returns None for rows whose 'page' does not provide the attribute.
# apply() forwards the extra keyword argument to get_row_attr for every row.
orgs['links'] = orgs.apply(get_row_attr, axis=1, attr='links')
orgs['references'] = orgs.apply(get_row_attr, axis=1, attr='references')

## Explore

In [31]:
# Reload the scraped organizations from a pickle in the working directory
# instead of re-running the scrape; the commented-out line is what writes it.
# NOTE(review): unpickling can execute arbitrary code - only load pickles you
# created yourself.
#orgs.to_pickle('orgs.pkl')
orgs = pd.read_pickle('orgs.pkl')

In [32]:
# Spot-check the last rows. The top search hit is not always the organization
# itself (compare the 'name' and 'page' columns), so matches need manual review.
orgs.tail()

Unnamed: 0_level_0,name,count,topcode,page,url,summary
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Council on the Teaching of Foreign Languages,American Council on the Teaching of Foreign La...,1,Education,<WikipediaPage 'American Council on the Teachi...,https://en.wikipedia.org/wiki/American_Council...,The American Council on the Teaching of Foreig...
Parents for Public Schools,Parents for Public Schools,1,Education,<WikipediaPage 'Dick Molpus'>,https://en.wikipedia.org/wiki/Dick_Molpus,"Richard ""Dick"" Molpus (born September 7, 1949)..."
National Institute on Out-of-School Time,National Institute on Out-of-School Time,1,Education,<WikipediaPage 'After-school activity'>,https://en.wikipedia.org/wiki/After-school_act...,An after-school activity is any organized prog...
Los Angeles Education Partnership,Los Angeles Education Partnership,1,Education,<WikipediaPage 'East Los Angeles Renaissance A...,https://en.wikipedia.org/wiki/East_Los_Angeles...,"The East Los Angeles Renaissance Academy, offi..."
Zonta International,Zonta International,1,Women's Issues,<WikipediaPage 'Zonta International'>,https://en.wikipedia.org/wiki/Zonta_International,Zonta International is an international servic...


In [98]:
# Parse the infobox of every organization's page and stack the results into
# one long table indexed by (organization name, infobox field).
# `orgs` is indexed by name, so the iterrows() labels are organization names.
infobox_by_org = {}
for org_name, org_row in orgs.iterrows():
    infobox = get_infobox(org_row)
    if infobox is not None:  # get_infobox returns None when nothing was parsed
        infobox_by_org[org_name] = infobox
infoboxes = pd.concat(infobox_by_org).dropna()
infoboxes.index.names = ['name', 'field']
infoboxes.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,value
name,field,Unnamed: 2_level_1
AARP,Motto,"""Real Possibilities."""
AARP,Predecessor,National Retired Teachers Association
AARP,Formation,1958 (1958)
AARP,Tax ID no.,95-1985500[1]
AARP,Headquarters,"Washington, D.C., U.S."


In [100]:
# Sample the result for a glimpse: draw two organization names from the first
# index level and show all of their infobox rows. Label-based `.loc` is used
# because `.ix` is no longer available in current pandas.
sampled_names = np.random.choice(infoboxes.index.get_level_values(0), 2)
infoboxes.loc[sampled_names]

Unnamed: 0_level_0,Unnamed: 1_level_0,value
name,field,Unnamed: 2_level_1
Microsoft,Type,Public
Microsoft,Traded as,NASDAQ: MSFTDow Jones Industrial Average Compo...
Microsoft,Industry,Computer softwareComputer hardwareConsumer ele...
Microsoft,Founded,"April 4, 1975; 41 years ago (1975-04-04)Albuqu..."
Microsoft,Founders,Bill GatesPaul Allen
Microsoft,Headquarters,"Microsoft Redmond campus, Redmond, Washington,..."
Microsoft,Area served,Worldwide
Microsoft,Key people,John W. Thompson (Chairman) Satya Nadella (CEO...
Microsoft,Products,Windows Office Servers Skype Visual Studio...
Microsoft,Services,MSN Bing OneDrive MSDN Outlook.com TechNe...


In [176]:
# Which orgs have no parsed infobox (no Wikipedia page found, or a page
# without an infobox)?
no_wiki = set(orgs['name']) - set(infoboxes.index.get_level_values(0).unique())

# Which have the wrong page?
# Difficult to say without manual review

# Which of the pages gave location info?
fields = infoboxes.index.levels[1]
loc_fields = fields[fields.str.contains('address|location|headquarters|office(?!r)|coordinates', flags=re.I, na=False)]
# Keep the rows whose field label is one of the location fields and whose
# value is present, then report the distinct organization names (sorted).
row_names = infoboxes.index.get_level_values(0)
row_fields = infoboxes.index.get_level_values(1)
has_location = row_fields.isin(loc_fields) & infoboxes['value'].notnull().values
np.unique(row_names[has_location])

array(['AARP', 'AFL-CIO', 'American Academy of Pediatrics',
       'American Bankers Association', 'American Bar Association',
       'American Civil Liberties Union',
       'American Council of Engineering Companies',
       'American Farm Bureau Federation',
       'American Federation of Government Employees',
       'American Federation of State, County and Municipal Employees',
       'American Federation of Teachers',
       'American Hotel & Lodging Association',
       'American Immigration Lawyers Association', 'American Legion',
       'American Library Association', 'American Medical Association',
       'American Petroleum Institute',
       'American Public Health Association',
       'Americans for Financial Reform', 'Americans for Prosperity',
       'Americans for Tax Reform', 'Associated Builders & Contractors',
       'Center for American Progress', 'Common Cause',
       'Communications Workers of America',
       'Competitive Enterprise Institute', 'Consumer Action

In [177]:
# Which pages gave no location info? (TODO: not answered yet)

In [10]:
# Load the saved infobox table from ../data. The commented-out line writes a
# pickle to the working directory, which is a different path from the one read.
# NOTE(review): the loaded index levels display as ('name', 'key'), while the
# table built earlier in this notebook names them ('name', 'field') - confirm
# which is intended.
#infoboxes.to_pickle('infoboxes.pkl')
infoboxes = pd.read_pickle('../data/infoboxes.pkl')

In [11]:
# Preview the first rows rather than rendering the whole table.
infoboxes.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,value
name,key,Unnamed: 2_level_1
AARP,Motto,"""Real Possibilities."""
AARP,Predecessor,National Retired Teachers Association
AARP,Formation,1958 (1958)
AARP,Tax ID no.,95-1985500[1]
AARP,Headquarters,"Washington, D.C., U.S."
AARP,Membership,"37,000,000+"
AARP,Chief Executive Officer,Jo Ann Jenkins[1]
AARP,President,Jeannine English[1]
AARP,Board Chair,Carol Raphael[1]
AARP,Subsidiaries,AARP Foundation;AARP Institute;Legal Counsel f...


In [12]:
# Export the infobox table to CSV in ../data (the index levels are written as
# the leading columns).
infoboxes.to_csv('../data/infoboxes.csv')