In [3]:
import os
import json, time, urllib.parse
import requests
import pandas as pd
import numpy as np

### Set working directory

In [4]:
# Project root that every input path below is joined onto.
# Set the DATA512_WD environment variable to run the notebook from a
# different checkout; otherwise the original author's path is used.
WD = os.environ.get(
    'DATA512_WD',
    '/Users/johnmichael/Documents/DATA512/data-512-homework_2',
)

### Import files

In [9]:
# Resolve the three input files relative to the working directory first,
# then read each one into its own raw DataFrame.
cities_csv_path = os.path.join(WD, 'input/us_cities_by_state_SEPT.2023.csv')
population_xlsx_path = os.path.join(WD, 'input/NST-EST2022-POP.xlsx')
regions_xlsx_path = os.path.join(WD, 'input/US States by Region - US Census Bureau.xlsx')

us_cities = pd.read_csv(cities_csv_path)
# The first three rows of the population workbook are skipped on read.
us_pop = pd.read_excel(population_xlsx_path, skiprows=3)
us_regions = pd.read_excel(regions_xlsx_path)

### Clean input files

In [51]:
# Clean the regions file so that each row holds a unique
# region, division and state combination.
us_regions_clean = (
    us_regions
    .ffill()                                             # fill blank cells from the row above
    .dropna(subset='STATE')                              # drop rows that still have no STATE
    .drop_duplicates(subset='STATE', ignore_index=True)  # one row per STATE, fresh 0..n-1 index
)
# Lower-case the column labels.
us_regions_clean.columns = us_regions_clean.columns.str.lower()
us_regions_clean.head()

Unnamed: 0,region,division,state
0,Northeast,New England,Connecticut
1,Northeast,New England,Maine
2,Northeast,New England,Massachusetts
3,Northeast,New England,New Hampshire
4,Northeast,New England,Rhode Island


In [53]:
# Clean US population data so that each row
# contains a state and the 2022 population.
us_pop_clean = us_pop.copy()
us_pop_clean.columns = ['state', 'pop_2020_est', 'pop_2020', 'pop_2021', 'pop_2022']

# Keep only rows whose first column starts with a literal '.'; the regex is a
# raw string so that '\.' is not parsed as an (invalid) string escape.
is_dot_prefixed = us_pop_clean['state'].str.contains(r'^\.', na=False)

# Select rows and columns in a single .loc call and strip the leading '.'
# with .assign, so no value is ever written into a filtered slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
us_pop_clean = (
    us_pop_clean
    .loc[is_dot_prefixed, ['state', 'pop_2022']]
    .assign(state=lambda frame: frame['state'].str.slice(1))
    .reset_index(drop=True)
)
us_pop_clean.head()

Unnamed: 0,state,pop_2022
0,Alabama,5074296.0
1,Alaska,733583.0
2,Arizona,7359197.0
3,Arkansas,3045637.0
4,California,39029342.0


### Request page info

In [57]:
# Preview the first five rows of the city/article list.
us_cities.head(n=5)

Unnamed: 0,state,page_title,url
0,Alabama,"Abbeville, Alabama","https://en.wikipedia.org/wiki/Abbeville,_Alabama"
1,Alabama,"Adamsville, Alabama","https://en.wikipedia.org/wiki/Adamsville,_Alabama"
2,Alabama,"Addison, Alabama","https://en.wikipedia.org/wiki/Addison,_Alabama"
3,Alabama,"Akron, Alabama","https://en.wikipedia.org/wiki/Akron,_Alabama"
4,Alabama,"Alabaster, Alabama","https://en.wikipedia.org/wiki/Alabaster,_Alabama"


In [58]:
# All of the API-related code below is adapted from wp_page_info_example.ipynb
# and wp_ores_liftwing_example.ipynb. Both notebooks are in the repository.
# See the notebooks for full license information.

# The basic English Wikipedia API endpoint.
API_ENWIKIPEDIA_ENDPOINT = 'https://en.wikipedia.org/w/api.php'

# Throttling: we should always be nice to a free data resource.
# Assume roughly 2ms of latency on the API and network, and wait the
# remainder of a 1/100 s slot before each request.
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (1.0 / 100.0) - API_LATENCY_ASSUMED

# Automated requests should carry something unique to the person making
# them, including a contact email (a UW email is a good choice).
REQUEST_HEADERS = {'User-Agent': '<jmic94@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023'}

# English Wikipedia article titles we are requesting info for.
ARTICLE_TITLES = us_cities['page_title'].tolist()

# Additional page properties that can be returned (see the Info
# documentation for what can be included), e.g. 'talkid|url|watched|watchers'.
# Use the empty string when no extra properties are wanted.
PAGEINFO_EXTENDED_PROPERTIES = ''

# Basic parameters for a pageinfo request. 'titles' is left blank here and
# is meant to hold a single page title at a time.
PAGEINFO_PARAMS_TEMPLATE = dict(
    action='query',
    format='json',
    titles='',
    prop='info',
    inprop=PAGEINFO_EXTENDED_PROPERTIES,
)

In [60]:
def request_pageinfo_per_article(article_title = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS):
    """Request page info for a single article and return the decoded JSON.

    Parameters
    ----------
    article_title : str, optional
        Title to query. When omitted, the 'titles' entry already present in
        `request_template` is used instead.
    endpoint_url : str
        URL the GET request is sent to.
    request_template : dict
        Query parameters for the request. It is copied, never modified.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    dict or None
        The decoded JSON response, or None if the request or the JSON
        decoding raised (the exception is printed).

    Raises
    ------
    Exception
        If no title is given either as an argument or in the template.
    """
    # Work on a copy: the default template is a module-level dict shared by
    # every call, so writing the title into it would leak one call's title
    # into later calls that do not pass one.
    params = dict(request_template)
    if article_title:
        params['titles'] = article_title

    if not params['titles']:
        raise Exception('Must supply an article title to make a pageinfo request.')

    try:
        # Wait first, so the rate limit is respected even when an exception
        # occurs while the request is being processed.
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(endpoint_url, headers=headers, params=params)
        json_response = response.json()
    except Exception as e:
        # Deliberate best-effort: report the problem and signal it with None.
        print(e)
        json_response = None
    return json_response


In [61]:
# Try the request helper on one article title and pretty-print the result.
sample_title = ARTICLE_TITLES[3]
print(f"Getting page info data for: {sample_title}")
info = request_pageinfo_per_article(sample_title)
print(json.dumps(info, indent=4))

Getting page info data for: Akron, Alabama
{
    "batchcomplete": "",
    "query": {
        "pages": {
            "104726": {
                "pageid": 104726,
                "ns": 0,
                "title": "Akron, Alabama",
                "contentmodel": "wikitext",
                "pagelanguage": "en",
                "pagelanguagehtmlcode": "en",
                "pagelanguagedir": "ltr",
                "touched": "2023-10-10T22:35:37Z",
                "lastrevid": 1165909508,
                "length": 11710,
                "talkid": 281240,
                "fullurl": "https://en.wikipedia.org/wiki/Akron,_Alabama",
                "editurl": "https://en.wikipedia.org/w/index.php?title=Akron,_Alabama&action=edit",
                "canonicalurl": "https://en.wikipedia.org/wiki/Akron,_Alabama"
            }
        }
    }
}
