In [1]:
# These are standard python modules
import json, time, urllib.parse

# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests
import pandas as pd
import os

import datetime as dt
import matplotlib.pyplot as plt

# Load the article list from the local CSV; its 'name' column is read further down
# to build the list of article titles that are requested from the API.
article = pd.read_csv(filepath_or_buffer='dinosaur.csv')

In [2]:
# Base URL shared by every REST API 'pageviews' request
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# Path template for a 'per-article' pageviews request. Each {placeholder} is filled in
# with str.format() right before the request is made.
API_REQUEST_PER_ARTICLE_PARAMS = '/'.join([
    'per-article',
    '{project}',
    '{access}',
    '{agent}',
    '{article}',
    '{granularity}',
    '{start}',
    '{end}',
])

# The Pageviews API asks for no more than 100 requests per second, so every request is
# preceded by a short pause: the per-request time budget minus the assumed round trip.
API_LATENCY_ASSUMED = 0.002       # roughly 2ms of API + network latency is assumed
API_THROTTLE_WAIT = 1.0 / 100.0 - API_LATENCY_ASSUMED

# Wikimedia asks API users to identify themselves with a "unique ID" so they can get in
# touch if something goes wrong (for example, if request limits are exceeded).
REQUEST_HEADERS = {'User-Agent': 'choubju1@uw.edu, University of Washington, MSDS DATA 512 - AUTUMN 2022'}

# English Wikipedia article titles to request, taken from the 'name' column of the CSV
ARTICLE_TITLES = [title for title in article['name']]

# Values substituted into API_REQUEST_PER_ARTICLE_PARAMS to build a request path. There is
# one entry per placeholder; only 'article' and 'access' vary from request to request,
# the remaining entries stay constant (but could be changed if necessary).
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = dict(
    project="en.wikipedia.org",
    access="",                  # filled in per request with the access type
    agent="user",
    article="",                 # filled in per request with the encoded title
    granularity="monthly",
    start="2015070100",
    end="2022093000",
)

In [3]:
def request_pageviews_per_article(article_title = None,
                                  access=None,
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request pageview data for a single article and return the decoded JSON.

    Parameters
    ----------
    article_title : str
        Article title; spaces are replaced with "_" and the result is percent-encoded.
    access : str, optional
        Value substituted for the ``{access}`` placeholder. When ``None`` the value
        already present in ``request_template`` is used.
    endpoint_url : str
        Base URL that the formatted parameter path is appended to.
    endpoint_params : str
        Format string containing the placeholders to fill from the template.
    request_template : dict
        Values for the placeholders in ``endpoint_params``. It is copied before use,
        so the caller's dict (including the module-level default) is never modified.
    headers : dict
        HTTP headers sent with the request.

    Returns
    -------
    The decoded JSON body of the response, or ``None`` when ``article_title`` is
    empty or when the request / JSON decoding raises (the exception is printed).
    """
    # Make sure we have an article title
    if not article_title:
        return None

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' makes quote() encode "/" as well; with the default safe='/' a title
    # containing a slash would be split into extra URL path segments.
    article_title_encoded = urllib.parse.quote(article_title.replace(' ', '_'), safe='')

    # Fill in a private copy so repeated calls do not leak state through the shared
    # template dict that is used as the default argument.
    request_params = dict(request_template)
    request_params['article'] = article_title_encoded
    if access is not None:
        request_params['access'] = access

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url + endpoint_params.format(**request_params)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and signal failure with None so that a
        # single bad request does not have to abort a long batch of requests.
        print(e)
        json_response = None
    return json_response

In [4]:
# Collect the monthly desktop pageview records of every article into one flat list
# and save that list as a JSON file.
filename = 'dino_monthly_desktop_201507-202209.json'
listObj = []
for i in ARTICLE_TITLES:
    views = request_pageviews_per_article(i,'desktop')
    # The request helper returns None when the request fails, and a response is not
    # guaranteed to carry an 'items' key (e.g. an API error payload). Skip such
    # articles instead of aborting the whole run.
    items = views.get('items') if isinstance(views, dict) else None
    if not items:
        print('No desktop pageview items returned for', repr(i), '- skipping')
        continue
    listObj.extend(items)

# Write the file once, after all articles have been collected, rather than
# re-serialising the whole (growing) list on every iteration.
with open(filename, 'w') as json_file:
    json.dump(listObj, json_file,
              indent=4,
              separators=(',',': '))

In [None]:
# Collect monthly mobile pageviews for every article. Mobile traffic is requested as
# two access types ("mobile-app" and "mobile-web"); their view counts are summed per
# month into a single record labelled access='mobile', and the result is saved as JSON.
filename = 'dino_monthly_mobile_201507-202209.json'
listObjMobile = []
for i in ARTICLE_TITLES:
    viewsApp = request_pageviews_per_article(i,"mobile-app")
    viewsWeb = request_pageviews_per_article(i,"mobile-web")

    # The request helper returns None when the request fails, and a response is not
    # guaranteed to carry an 'items' key (e.g. an API error payload).
    app_items = (viewsApp.get('items') or []) if isinstance(viewsApp, dict) else []
    web_items = (viewsWeb.get('items') or []) if isinstance(viewsWeb, dict) else []
    if not app_items and not web_items:
        print('No mobile pageview items returned for', repr(i), '- skipping')
        continue
    if not app_items or not web_items:
        # Keep what we have, but make the possibly incomplete total visible.
        print('Only one mobile access type returned items for', repr(i))

    # Merge on timestamp. Starting from the union of both series means a month that
    # exists in only one of them is still kept, and every merged record is
    # consistently labelled 'mobile'.
    mobile_by_month = {}
    for month in app_items + web_items:
        ts = month['timestamp']
        if ts in mobile_by_month:
            mobile_by_month[ts]['views'] += month['views']
        else:
            merged = dict(month)          # copy so the API response is not mutated
            merged['access'] = 'mobile'
            mobile_by_month[ts] = merged

    # Emit the months of this article in timestamp order
    listObjMobile.extend(mobile_by_month[ts] for ts in sorted(mobile_by_month))

# Write the file once, after all articles have been collected, rather than
# re-serialising the whole (growing) list on every iteration.
with open(filename, 'w') as json_file:
    json.dump(listObjMobile, json_file,
              indent=4,
              separators=(',',': '))

In [None]:
# Collect the monthly "all-access" pageview records of every article into one flat
# list and save that list as a JSON file.
filename = 'all_access_data.json'
listObjCumulative = []
for i in ARTICLE_TITLES:
    viewsCumulative = request_pageviews_per_article(i,"all-access")
    # The request helper returns None when the request fails, and a response is not
    # guaranteed to carry an 'items' key (e.g. an API error payload). Skip such
    # articles instead of aborting the whole run.
    items = viewsCumulative.get('items') if isinstance(viewsCumulative, dict) else None
    if not items:
        print('No all-access pageview items returned for', repr(i), '- skipping')
        continue
    listObjCumulative.extend(items)

# Write the file once, after all articles have been collected, rather than
# re-serialising the whole (growing) list on every iteration.
with open(filename, 'w') as json_file:
    json.dump(listObjCumulative, json_file,
              indent=4,
              separators=(',',': '))