In [68]:
import json
import requests

In [69]:
# URL templates for the two Wikimedia REST API endpoints used below.
# The {placeholders} are filled in with str.format from a parameter dict.
endpoint_legacy = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
    '{project}/{access-site}/{granularity}/{start}/{end}'
)
endpoint_pageviews = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)


In [70]:
# HTTP headers attached to every API request made by api_call.
headers = {
    "User-Agent": "https://github.com/kevSweet",
    "From": "ksweet1@uw.edu",
}

def write_to_json(data, fn):
    """Serialize ``data`` as JSON into the file at path ``fn``.

    The file is opened in write mode, so existing content is replaced.
    Returns None.
    """
    with open(fn, mode="w") as sink:
        json.dump(data, fp=sink)
        
def api_call(endpoint, parameters, timeout=30):
    """Send a GET request to a templated endpoint and return the decoded JSON.

    Parameters
    ----------
    endpoint : str
        URL template whose ``{placeholders}`` are filled from ``parameters``.
    parameters : dict
        Values substituted into ``endpoint`` via ``str.format``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The parsed JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here gives a
        clear error instead of a later ``KeyError`` on the missing payload.
    """
    url = endpoint.format(**parameters)
    call = requests.get(url, headers=headers, timeout=timeout)
    call.raise_for_status()
    return call.json()

In [71]:
# Request parameters for the legacy pagecounts endpoint.
# API reference: https://wikimedia.org/api/rest_v1/
params_legacy = {
    "project": "en.wikipedia.org",
    "access-site": "desktop-site",
    "granularity": "monthly",
    "start": "2001010100",
    # for end use 1st day of month following final month of data
    "end": "2018100100",
}

# pagecounts dump: one request and one JSON file per access site
# (desktop first, then mobile). Each file name carries the first and last
# timestamps found in the response.
legacy_fns = {}
for site in ("desktop-site", "mobile-site"):
    params_legacy['access-site'] = site
    monthly_legacy = api_call(endpoint_legacy, params_legacy)
    first_date = monthly_legacy['items'][0]['timestamp']
    last_date = monthly_legacy['items'][-1]['timestamp']
    legacy_fns[site] = 'pagecounts_' + site + '_' + first_date + '-' + last_date + '.json'
    write_to_json(monthly_legacy, legacy_fns[site])

legacy_desktopfn = legacy_fns["desktop-site"]
legacy_mobilefn = legacy_fns["mobile-site"]

In [72]:

# Request parameters for the pageviews endpoint; "access" is switched per
# request in the loop below.
params_pageviews = {
    "project": "en.wikipedia.org",
    "access": "desktop",
    "agent": "user",
    "granularity": "monthly",
    "start": "2001010100",
    # for end use 1st day of month following final month of data
    "end": '2021110100',
}

# pageviews dump: one request and one JSON file per access type
# (desktop, mobile app, mobile web). The second element of each pair is the
# label used in the output file name.
pageviews_fns = {}
for access, label in (
    ("desktop", "desktop-site"),
    ("mobile-app", "mobile-app"),
    ("mobile-web", "mobile-web"),
):
    params_pageviews['access'] = access
    monthly_pageviews = api_call(endpoint_pageviews, params_pageviews)
    first_date = monthly_pageviews['items'][0]['timestamp']
    last_date = monthly_pageviews['items'][-1]['timestamp']
    pageviews_fns[access] = 'pageviews_' + label + '_' + first_date + '-' + last_date + '.json'
    write_to_json(monthly_pageviews, pageviews_fns[access])

pageviews_desktopfn = pageviews_fns["desktop"]
pageviews_mobileappfn = pageviews_fns["mobile-app"]
pageviews_mobilewebfn = pageviews_fns["mobile-web"]

# Step 2: Data Processing

In [129]:
import pandas as pd
import datetime

In [238]:
def convert_dates(items_dict):
    """Replace each record's ``timestamp`` with separate ``year``/``month`` fields.

    Parameters
    ----------
    items_dict : iterable of dict
        Records that each carry a ``'timestamp'`` string in ``YYYYMMDDHH``
        format. May be a plain list or a pandas Series of dicts.

    Returns
    -------
    pandas.DataFrame
        ``pd.DataFrame`` built from the converted records. For a list input
        this yields one column per record key; for a Series input it yields a
        single column (named after the Series) holding the converted dicts.

    Notes
    -----
    The input records are not modified: each one is copied without its
    ``timestamp`` key, so the function can be called repeatedly on the same
    data without raising ``KeyError``.
    """
    converted = []
    for item in items_dict:
        parsed = datetime.datetime.strptime(item['timestamp'], "%Y%m%d%H")
        # Copy everything except the timestamp, keeping the original key order.
        row = {key: value for key, value in item.items() if key != 'timestamp'}
        row['year'] = parsed.year
        # Zero-padded two-digit month string, e.g. "07".
        row['month'] = parsed.strftime("%m")
        converted.append(row)

    if isinstance(items_dict, pd.Series):
        # Keep the Series shape (index and name) so the resulting frame has
        # the same single-column layout as wrapping the input Series directly.
        converted = pd.Series(
            converted, index=items_dict.index, name=items_dict.name, dtype=object
        )
    return pd.DataFrame(converted)

def tabularize_data(items_dict, api):
    """Flatten API records into a table and drop the request-metadata columns.

    Parameters
    ----------
    items_dict : pandas.Series of dict
        The records to flatten. The result of ``convert_dates`` is indexed
        with ``['items']``, so the Series is expected to be named ``'items'``
        (as produced by selecting that column from the loaded JSON).
    api : str
        Either ``'pagecounts'`` or ``'pageviews'``; selects which metadata
        columns are removed.

    Returns
    -------
    pandas.DataFrame
        One row per record, without the metadata columns for ``api``.

    Raises
    ------
    ValueError
        If ``api`` is not one of the two supported names. The check runs
        before any conversion work so an invalid call fails immediately.
    """
    metadata_columns = {
        'pagecounts': ['project', 'access-site', 'granularity'],
        'pageviews': ['project', 'access', 'granularity', 'agent'],
    }
    if api not in metadata_columns:
        raise ValueError('please enter valid api (pagecounts or pageviews)')

    converted = convert_dates(items_dict)
    tabular = pd.json_normalize(converted['items'])
    return tabular.drop(columns=metadata_columns[api])

In [239]:
# Desktop Pagecounts: load the saved legacy dump and flatten its records.
pagecounts_desktop = pd.read_json(
    './pagecounts_desktop-site_2007120100-2016080100.json'
)
pagecounts_desktop_tabularized = tabularize_data(
    items_dict=pagecounts_desktop['items'],
    api='pagecounts',
)
pagecounts_desktop_tabularized

Unnamed: 0,count,year,month
0,2998331524,2007,12
1,4930902570,2008,01
2,4818393763,2008,02
3,4955405809,2008,03
4,5159162183,2008,04
...,...,...,...
100,5572235399,2016,04
101,5330532334,2016,05
102,4975092447,2016,06
103,5363966439,2016,07


In [240]:
# Mobile Pagecounts: load the saved legacy dump and flatten its records.
pagecounts_mobile = pd.read_json(
    './pagecounts_mobile-site_2014100100-2016080100.json'
)
pagecounts_mobile_tabular = tabularize_data(
    items_dict=pagecounts_mobile['items'],
    api='pagecounts',
)
pagecounts_mobile_tabular

Unnamed: 0,count,year,month
0,3091546685,2014,10
1,3027489668,2014,11
2,3278950021,2014,12
3,3485302091,2015,1
4,3091534479,2015,2
5,3330832588,2015,3
6,3222089917,2015,4
7,3334069483,2015,5
8,3038162463,2015,6
9,3254472695,2015,7


In [241]:
# Desktop Pageviews: load the saved pageviews dump and flatten its records.
# NOTE(review): the path is hard-coded, while the dump step derives the file
# name from the first/last timestamps in the API response. Confirm they match.
pageviews_desktop = pd.read_json(
    './pageviews_desktop-site_2015070100-2021090100.json'
)
pageviews_desktop_tabular = tabularize_data(
    items_dict=pageviews_desktop['items'],
    api='pageviews',
)
pageviews_desktop_tabular

Unnamed: 0,views,year,month
0,4376666686,2015,07
1,4332482183,2015,08
2,4485491704,2015,09
3,4477532755,2015,10
4,4287720220,2015,11
...,...,...,...
70,2824416177,2021,05
71,2505971366,2021,06
72,2765584368,2021,07
73,2763413934,2021,08


In [250]:
# Mobile Pageviews: combine the mobile-web and mobile-app dumps, then total
# the views per (year, month).
pageview_mobileweb = pd.read_json('./pageviews_mobile-web_2015070100-2021090100.json')
pageview_mobileapp = pd.read_json('./pageviews_mobile-app_2015070100-2021090100.json')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
pageview_mobile = pd.concat([pageview_mobileweb, pageview_mobileapp], ignore_index=True)

# Flatten the combined records (one row per source per month), then sum the
# two sources. The original grouped a name that is never assigned in this
# notebook, which raises NameError on a fresh kernel.
pageview_mobile_tabularized = tabularize_data(pageview_mobile['items'], 'pageviews')
pageview_mobile_tabularized = pd.DataFrame(
    pageview_mobile_tabularized.groupby(['year', 'month']).views.agg('sum')
)
# Flat year/month/views table for display and downstream use.
pageview_mobile_tabular = pageview_mobile_tabularized.reset_index()
pageview_mobile_tabular

Unnamed: 0,year,month,views
0,2015,07,3288755294
1,2015,08,3302333038
2,2015,09,3170203333
3,2015,10,3268499132
4,2015,11,3236601070
...,...,...,...
70,2021,05,4976579558
71,2021,06,4584510417
72,2021,07,4778909421
73,2021,08,4732194000


In [None]:
# CSV file name for the traffic data (presumably the combined output table;
# it is not used in the cells visible here, so confirm against later steps).
wikitraffic_fn = "en-wikipedia_traffic_200712-202108.csv"

# Step 3: Analysis