In [1]:
import json
import requests

In [2]:
endpoint_pagecount = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access-site}/{granularity}/{start}/{end}'

endpoint_pageviews = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

In [34]:
# Parameters for getting aggregated legacy view data(Page count all views) 
# see: https://wikimedia.org/api/rest_v1/#!/Legacy_data/get_metrics_legacy_pagecounts_aggregate_project_access_site_granularity_start_end

# Parameters for getting aggregated legacy view data(Page count desktop views) 
pagecount_desktop_views = {"project" : "en.wikipedia.org",
                 "access-site" : "desktop-site",
                 "granularity" : "monthly",
                 "start" : "2007120100",
                # for end use 1st day of month following final month of data
                 "end" : "2016073100"
                    }

# Parameters for getting aggregated legacy view data(Page count desktop views) 
pagecount_mobile_views = {"project" : "en.wikipedia.org",
                 "access-site" : "mobile-site",
                 "granularity" : "monthly",
                 "start" : "2007120100",
                # for end use 1st day of month following final month of data
                 "end" : "2016073100"
                    }

# Parameters for getting aggregated current standard pageview data(Desktop)
# see: https://wikimedia.org/api/rest_v1/#!/Pageviews_data/get_metrics_pageviews_aggregate_project_access_agent_granularity_start_end
pageview_desktop_views = {"project" : "en.wikipedia.org",
                    "access" : "desktop",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015010100",
                    # for end use 1st day of month following final month of data
                    "end" : '2021093000'
                        }

# Parameters for getting aggregated current standard pageview data(Mobile web)
pageview_mobile_web_views = {"project" : "en.wikipedia.org",
                    "access" : "mobile-web",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015010100",
                    # for end use 1st day of month following final month of data
                    "end" : '2021093000'
                        }

# Parameters for getting aggregated current standard pageview data(Mobile app)
pageview_mobile_app_views = {"project" : "en.wikipedia.org",
                    "access" : "mobile-app",
                    "agent" : "user",
                    "granularity" : "monthly",
                    "start" : "2015010100",
                    # for end use 1st day of month following final month of data
                    "end" : '2021093000'
                        }

# Customize these with your own information
headers = {
    'User-Agent': 'https://github.com/lanfuli',
    'From': 'lanfuli@uw.edu'
}

In [35]:
def api_call(endpoint,parameters):
    call = requests.get(endpoint.format(**parameters), headers=headers)
    response = call.json()
    
    return response

In [36]:
pagecounts_desktop_site = api_call(endpoint_pagecount, pagecount_desktop_views)
pagecounts_mobile_site = api_call(endpoint_pagecount, pagecount_mobile_views)
pageviews_desktop = api_call(endpoint_pageviews, pageview_desktop_views)
pageviews_mobile_web = api_call(endpoint_pageviews, pageview_mobile_web_views)
pageviews_mobile_app = api_call(endpoint_pageviews, pageview_mobile_app_views)

In [38]:
print(pageviews_mobile_app)

{'items': [{'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015070100', 'views': 109624146}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015080100', 'views': 109669149}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015090100', 'views': 96221684}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015100100', 'views': 94523777}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015110100', 'views': 94353925}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015120100', 'views': 99438956}, {'project': 'en.wikipedia', 'access': 'mobile-app', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016010100', 'views': 1064

In [7]:
def create_json(filename, date):
    # 1. Read file contents
    container = []
    with open(filename, "r") as file:
        container = json.load(file)
        container.update(date)
    # 2. Write json file
    with open(filename, "w") as file:
        json.dump(container, file)    

In [46]:
filename = ['pagecounts_desktop_site_200712_201607.json', 'pagecounts_mobile_site_200712_201607.json',
            'pageviews_desktop_201501_202109.json','pageviews_mobile_web_201501_202109.json',
            'pageviews_mobile_app_201501_202109.json']
data_file = [pagecounts_desktop_site, pagecounts_mobile_site, pageviews_desktop, pageviews_mobile_web,
            pageviews_mobile_app]
for file, data in zip(filename, data_file):
    create_json(file, data)

In [47]:
#Step 2: Data Processing

In [48]:
import csv
import pandas as pd
import numpy as np
from collections import Counter

In [49]:
csv_column = ['timestamp', 'access','access-site','Country','views']

In [50]:
csv_file = "en-wikipedia_traffic_200712-202108.csv"

In [74]:
# df = pd.read_json('pagecounts_desktop_site_200712_201607.json')
# df.to_csv('test.csv',index = False)
# df

Unnamed: 0,items
0,"{'project': 'en.wikipedia', 'access-site': 'de..."
1,"{'project': 'en.wikipedia', 'access-site': 'de..."
2,"{'project': 'en.wikipedia', 'access-site': 'de..."
3,"{'project': 'en.wikipedia', 'access-site': 'de..."
4,"{'project': 'en.wikipedia', 'access-site': 'de..."
...,...
99,"{'project': 'en.wikipedia', 'access-site': 'de..."
100,"{'project': 'en.wikipedia', 'access-site': 'de..."
101,"{'project': 'en.wikipedia', 'access-site': 'de..."
102,"{'project': 'en.wikipedia', 'access-site': 'de..."


In [75]:
def data_clean(raw_data):
    temp_dict = []
    for i in raw_data.values():
        for j in i:
            temp_dict.append(j)
    return temp_dict

In [76]:
# Remove the items from dict and covert to list
clean_pagecounts_desktop_site = data_clean(pagecounts_desktop_site)
clean_pagecounts_mobile_site = data_clean(pagecounts_mobile_site)
clean_pageviews_desktop = data_clean(pageviews_desktop)
clean_pageviews_mobile_web = data_clean(pageviews_mobile_web)
clean_pageviews_mobile_app = data_clean(pageviews_mobile_app)

In [116]:
# combine pageviews mobile web and mopageviews moile app
clean_pageviews_mobile_all = clean_pageviews_mobile_web + clean_pageviews_mobile_app

In [126]:
df = pd.DataFrame(clean_pageviews_mobile_all)
# df.dtypes
df['timestamp'] = df['timestamp'].str[0:6]
df['timestamp']

Unnamed: 0,project,access,agent,granularity,timestamp,views
0,en.wikipedia,mobile-web,user,monthly,201507,3179131148
1,en.wikipedia,mobile-web,user,monthly,201508,3192663889
2,en.wikipedia,mobile-web,user,monthly,201509,3073981649
3,en.wikipedia,mobile-web,user,monthly,201510,3173975355
4,en.wikipedia,mobile-web,user,monthly,201511,3142247145
...,...,...,...,...,...,...
145,en.wikipedia,mobile-app,user,monthly,202105,166485079
146,en.wikipedia,mobile-app,user,monthly,202106,150704624
147,en.wikipedia,mobile-app,user,monthly,202107,161461155
148,en.wikipedia,mobile-app,user,monthly,202108,161381193


In [128]:
# df = df.drop(['project', 'agent', 'granularity'], axis=1)
new_df = df.groupby(['timestamp']).sum()
new_df['timestamp'] = pd.to_datetime(new_df['timestamp'], format='%Y%m')
new_df

KeyError: 'timestamp'

In [70]:
# temp_dict_2 = {}
# for i in range(len(clean_pageviews_mobile_all)):
#     for j in clean_pageviews_mobile_all[i]:
#         if clean_pageviews_mobile_all[i]["timestamp"] not in temp_dict_2:
#             temp_dict_2[clean_pageviews_mobile_all[i]["timestamp"]] = clean_pageviews_mobile_all[i]["views"]
#         else:
#             temp_dict_2[clean_pageviews_mobile_all[i]["timestamp"]] = clean_pageviews_mobile_all[i]["views"] + temp_dict_2.get(clean_pageviews_mobile_all[i]["timestamp"])

# # print(temp_dict_2)
# df = pd.DataFrame.from_dict(temp_dict_2, orient='index', columns=['views'])
# df
