In [None]:
import json, time, urllib.parse
import requests
import pandas as pd

In [None]:
dinosaur_data = pd.read_csv('dinosaur_genera_cleaned.csv')
ARTICLE_TITLES = dinosaur_data['name']

In [None]:
#########
#
#    CONSTANTS
#

# The REST API 'pageviews' URL - this is the common URL/endpoint for all 'pageviews' API requests
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# This is a parameterized string that specifies what kind of pageviews request we are going to make
# In this case it will be a 'per-article' based request. The string is a format string so that we can
# replace each parameter with an appropriate value before making the request
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# The Pageviews API asks that we not exceed 100 requests per second, we add a small delay to each request
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making a request to the Wikimedia API they ask that you include a "unique ID" that will allow them to
# contact you if something happens - such as - your code exceeding request limits - or some other error happens
REQUEST_HEADERS = {
    'User-Agent': '<harrymn@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2022',
}


# This template is used to map parameter values into the API_REQUST_PER_ARTICLE_PARAMS portion of an API request. The dictionary has a
# field/key for each of the required parameters. In the example, below, we only vary the article name, so the majority of the fields
# can stay constant for each request. Of course, these values *could* be changed if necessary.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "desktop",      # this should be changed for the different access types
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015010100",
    "end":         "2022090100"    # this is likely the wrong end date
}


In [None]:
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None, 
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    # Make sure we have an article title
    if not article_title: return None
    
    # Titles are supposed to have spaces replaced with "_" and be URL encoded
    article_title_encoded = urllib.parse.quote(article_title.replace(' ','_'))
    request_template['article'] = article_title_encoded
    
    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [None]:
def outputToJson(base_path,accesstype,start,end,data):
    out_path = base_path + '_' + accesstype + '_' + start + '-' + end + '.json'
    with open(out_path, 'w') as f:
        json.dump(data, f)

In [None]:
ACCESS_TYPES = ['desktop', 'mobile-web', 'mobile-app']
for access_type in ACCESS_TYPES:
    ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE['access'] = access_type
    data = []
    for i in range(1, len(ARTICLE_TITLES)):
        views = request_pageviews_per_article(ARTICLE_TITLES[i])
        items_list = views['items']
        data += items_list
        print("logged:", ARTICLE_TITLES[i])
    outputToJson('dino_monthly',access_type,'start201501', 'end202209', data)
    print(access_type, "completed successfully")

In [None]:
df = pd.read_json('dino_monthly_desktop_start201501-end202209.json')
df

In [94]:
df.describe()

Unnamed: 0,views
count,86.0
mean,1619.44186
std,4272.53953
min,357.0
25%,464.0
50%,570.0
75%,742.75
max,34235.0
