# Page view count of wikipedia site 
The purpose of this notebook is to analyse the page view counts of the Wikipedia website. The notebook consists of three main parts: the first downloads data from the Wikimedia API, the second parses the data, and the final part plots it.

In [46]:
#Imports and initializations 
from collections import defaultdict
from datetime import datetime
import pandas as pd 
import requests 
import json 
import csv

# Download data 
Wikimedia provides two endpoints for downloading traffic data: a legacy endpoint and a newer endpoint. Both are defined below, followed by some utility functions needed for API access and file I/O.

In [3]:
# URL templates for the two Wikimedia REST API traffic endpoints. The
# {placeholders} are filled in later with str.format(**parameters).
_metrics_root = 'https://wikimedia.org/api/rest_v1/metrics'

# Legacy Pagecounts endpoint.
endpoint_legacy = _metrics_root + '/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'

# Pageviews endpoint.
endpoint_pageviews = _metrics_root + '/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

In [4]:
def api_call(endpoint, parameters, timeout=30):
    """Query a REST endpoint and return the decoded JSON body.

    Parameters
    ----------
    endpoint : str
        URL template containing ``{name}`` placeholders.
    parameters : dict
        Values substituted into the template with ``str.format``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    The parsed JSON response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing loudly here keeps
        an unusable ``None`` from being passed on to the caller.
    """
    # `headers` is the module-level dict of request headers defined in the
    # "Initialize parameters" cell; it must exist before this function is called.
    call = requests.get(endpoint.format(**parameters), headers=headers, timeout=timeout)
    call.raise_for_status()
    return call.json()
        

In [9]:
def save_data_as_json(data, filepath):
    """Serialize ``data`` as JSON text and write it to ``filepath``.

    Any existing file at ``filepath`` is overwritten.
    """
    with open(filepath, 'w') as handle:
        handle.write(json.dumps(data))

In [15]:
def read_json_file(filepath):
    """Read the JSON document stored at ``filepath`` and return the decoded object."""
    with open(filepath, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

# Initialize parameters
We initialize the query parameters and headers needed for the Wikimedia API. The parameters are updated as needed when the actual data is queried from the API.

In [5]:
# Default query parameters for the legacy Pagecounts endpoint. The keys match
# the {placeholders} in endpoint_legacy.
# see: https://wikimedia.org/api/rest_v1/#!/Legacy_data/get_metrics_legacy_pagecounts_aggregate_project_access_site_granularity_start_end
params_legacy = {"project" : "en.wikipedia.org",
                 "access-site" : "desktop-site",
                 "granularity" : "monthly",
                 # timestamps are in YYYYMMDDHH format
                 "start" : "2001010100",
                 # for end use 1st day of month following final month of data
                 "end" : "2018100100"
                    }

# Default query parameters for the Pageviews endpoint. The keys match the
# {placeholders} in endpoint_pageviews.
# see: https://wikimedia.org/api/rest_v1/#!/Pageviews_data/get_metrics_pageviews_aggregate_project_access_agent_granularity_start_end
params_pageviews = {"project" : "en.wikipedia.org",
                    "access" : "all-access",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2001010100",
                    # for end use 1st day of month following final month of data
                    # (1 October 2018, matching params_legacy)
                    "end" : '2018100100'
                        }

# Request headers sent with every API call.
# Customize these with your own information
headers = {
    'User-Agent': 'https://github.com/koonaparaju',
    'From': 'koonav@uw.edu'
}

# Legacy endpoint query 
The following code queries the legacy endpoint for monthly counts. The `access-site` parameter is set to each of two values in turn to collect all the data. The result of each query is stored in a separate JSON file.

In [36]:
# Download monthly counts from the legacy Pagecounts endpoint, one query per
# access site, and save each response to its own JSON file.
access_site_types = ['desktop-site', 'mobile-site']
# The timestamp of the first hour/day/month to include, in YYYYMMDDHH format.
start_year = '2007'
start_month = '12'
# For the end, use the 1st day of the month following the final month of data.
end_year = '2016'
end_month = '08'

# The date range is identical for every access site, so set it once up front.
params_legacy['start'] = '{}{}0100'.format(start_year, start_month)
params_legacy['end'] = '{}{}0100'.format(end_year, end_month)

for access_site in access_site_types:
    params_legacy['access-site'] = access_site
    resp = api_call(endpoint_legacy, params_legacy)
    # Guard against a missing response (None): stop here instead of writing a
    # JSON file that contains only `null` and breaks the parsing cells later.
    if resp is None:
        raise RuntimeError('Legacy pagecounts request failed for access-site={!r}'.format(access_site))
    save_data_as_json(resp, 'pagecounts_{}_{}{}_{}{}.json'.format(access_site,start_year,start_month,end_year,end_month))

# New endpoint query 
The following code queries the pageviews endpoint for monthly counts. The `access` parameter is set to each of three values in turn to collect all the data. The result of each query is stored in a separate JSON file. This code should be refactored into a utility function, since it repeats the legacy query above.

In [37]:
# Download monthly counts from the Pageviews endpoint, one query per access
# method, and save each response to its own JSON file.
access_site_types = ['desktop', 'mobile-app', 'mobile-web']
# The timestamp of the first hour/day/month to include, in YYYYMMDDHH format.
start_year = '2015'
start_month = '07'
# For the end, use the 1st day of the month following the final month of data.
end_year = '2020'
end_month = '09'

# The date range is identical for every access method, so set it once up front.
params_pageviews['start'] = '{}{}0100'.format(start_year, start_month)
params_pageviews['end'] = '{}{}0100'.format(end_year, end_month)

for access_site in access_site_types:
    params_pageviews['access'] = access_site
    resp = api_call(endpoint_pageviews, params_pageviews)
    # Guard against a missing response (None): stop here instead of writing a
    # JSON file that contains only `null` and breaks the parsing cells later.
    if resp is None:
        raise RuntimeError('Pageviews request failed for access={!r}'.format(access_site))
    save_data_as_json(resp, 'pageviews_{}_{}{}_{}{}.json'.format(access_site,start_year,start_month,end_year,end_month))

We aggregate the mobile app and mobile web data for pageviews below. This is an intermediate step.

In [38]:
# Add the mobile-app and mobile-web pageviews together, giving a single mobile
# total per timestamp.
pageviews_mobile_files = [
    'pageviews_mobile-app_201507_202009.json',
    'pageviews_mobile-web_201507_202009.json',
]
mobile_pageviews_view = defaultdict(int)
for pageviews_mobile_file in pageviews_mobile_files:
    items = read_json_file(pageviews_mobile_file)['items']
    for item in items:
        timestamp = item['timestamp']
        # defaultdict(int) supplies 0 the first time a timestamp is seen
        mobile_pageviews_view[timestamp] = mobile_pageviews_view[timestamp] + item["views"]

print(mobile_pageviews_view)
    

defaultdict(<class 'int'>, {'2015070100': 3288755294, '2015080100': 3302333038, '2015090100': 3170203333, '2015100100': 3268499132, '2015110100': 3236601070, '2015120100': 3376275307, '2016010100': 3717836846, '2016020100': 3334862272, '2016030100': 3386684191, '2016040100': 3258764002, '2016050100': 3395033236, '2016060100': 3354790945, '2016070100': 3496573762, '2016080100': 3515819303, '2016090100': 3393285781, '2016100100': 3509283891, '2016110100': 3591044925, '2016120100': 3776543855, '2017010100': 4231961542, '2017020100': 3711761399, '2017030100': 3903493989, '2017040100': 3639623119, '2017050100': 3686687720, '2017060100': 3519383193, '2017070100': 3725059253, '2017080100': 3621406302, '2017090100': 3531604369, '2017100100': 3673120542, '2017110100': 3670012061, '2017120100': 3830136304, '2018010100': 4259282371, '2018020100': 3725680728, '2018030100': 4049874257, '2018040100': 3927398111, '2018050100': 4089188522, '2018060100': 3977766741, '2018070100': 4266113752, '201808010

Merging all the remaining JSON files into an intermediate dictionary.

In [49]:
# Merge the remaining downloads into one dictionary keyed by timestamp:
#   final_data[timestamp]['<api>_<access>_views'] = monthly count
json_files = ['pageviews_desktop_201507_202009.json', 'pagecounts_mobile-site_200712_201608.json', 'pagecounts_desktop-site_200712_201608.json']
final_data = defaultdict(dict)
for json_file in json_files:
    # File names look like '<api>_<access>_<start>_<end>.json'.
    api = json_file.split('_')[0]
    access_type = 'desktop' if 'desktop' in json_file else 'mobile'
    column = "{}_{}_views".format(api, access_type)
    # Read the file for THIS iteration (json_file), not a loop variable left
    # over from an earlier cell.
    items = read_json_file(json_file)['items']
    for item in items:
        ts = item['timestamp']
        # Pageviews items carry the number under "views"; legacy Pagecounts
        # items carry it under "count".
        final_data[ts][column] = item['views'] if 'views' in item else item['count']

# Add the combined mobile (app + web) pageviews computed in the previous cell.
for k,v in mobile_pageviews_view.items():
    final_data[k]['pageviews_mobile_views'] = v
    

We create a list of dictionaries below to persist to a final csv file.

In [50]:
# Flatten final_data into one CSV row (a dict) per month, oldest month first.
# Per-source count columns, in the order they are written to the CSV.
view_columns = [
    'pageviews_desktop_views',
    'pagecounts_mobile_views',
    'pagecounts_desktop_views',
    'pageviews_mobile_views',
]
csv_objects = []
# Timestamps are fixed-width YYYYMMDDHH strings, so sorting them as text
# yields chronological order.
for ts in sorted(final_data):
    data_item = final_data[ts]
    # Build a fresh dict so final_data is left untouched and the cell can be
    # re-run safely. A month missing from a source counts as 0, so every row
    # has the same set of columns.
    row = {column: data_item.get(column, 0) for column in view_columns}
    row['year'] = ts[0:4]
    row['month'] = ts[4:6]
    row['pagecount_all_views'] = row['pagecounts_desktop_views'] + row['pagecounts_mobile_views']
    row['pageview_all_views'] = row['pageviews_desktop_views'] + row['pageviews_mobile_views']
    csv_objects.append(row)

We persist the final CSV file with the details needed.

In [47]:
# Write one CSV row per entry in csv_objects.
# Check for data before opening the file, so an existing CSV is not truncated
# when there is nothing to write.
if not csv_objects:
    raise ValueError('csv_objects is empty; nothing to write to the CSV file')

# Take the header from every row (first-seen order) rather than only the first
# one: DictWriter raises ValueError for a row key that is not in fieldnames.
fieldnames = list(dict.fromkeys(key for row in csv_objects for key in row))

with open('en-wikipedia_traffic_200712-202008.csv', 'w', newline='') as csvfile:
    # restval=0: a month with no data for a column is written as 0, matching
    # the 0 default used when the totals are computed.
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, restval=0)
    writer.writeheader()
    writer.writerows(csv_objects)