## file 1, version 1: collect data and print it out

In [1]:
import requests
import json
import time

In [2]:
# Identify ourselves to the Wikimedia API through the User-Agent header.
headers = {
    'User-Agent': 'data collection from makohill@uw.edu for teaching'
}

# Daily per-article pageviews for "University of Washington" on en.wikipedia.org.
# The article title is already percent-encoded in this hard-coded URL.
url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/University%20of%20Washington/daily/20010101/20230401"

# A timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(url, headers=headers, timeout=30)
if not response.status_code == 200:
    print("ERROR! status was not 200")
    # Stop on HTTP error statuses (4xx/5xx) instead of silently parsing an
    # error payload as if it were pageview data.
    response.raise_for_status()

data = response.json()


In [3]:
# Number of records stored under the "items" key of the parsed response.
len(data["items"])

2832

In [4]:
# Save the parsed response to disk as one line of JSON text followed by a newline.
with open("wp_uw_pageview_data.json", 'w') as f:
    data_string = json.dumps(data)
    f.write(data_string + "\n")

## file 2, version 1: this will now read the data back in

In [5]:
import json

In [6]:
# Read the whole saved file back in as one string; it is parsed separately below.
with open("wp_uw_pageview_data.json", 'r') as f:
    new_data_string = f.read()

In [7]:
# Turn the JSON text read from disk back into Python objects.
new_data = json.loads(new_data_string)

In [8]:
def parse_wikimedia_timestamp(old_date_string):
    """Reformat a compact timestamp string as a hyphen-separated date.

    The first eight characters of ``old_date_string`` are read as a
    four-character year, a two-character month and a two-character day and
    joined with hyphens. Anything after the eighth character is ignored.

    Args:
        old_date_string: Timestamp text such as ``"2015070100"``.

    Returns:
        The date portion formatted as ``"YYYY-MM-DD"``.
    """
    year = old_date_string[0:4]
    month = old_date_string[4:6]
    day_of_month = old_date_string[6:8]
    return "{}-{}-{}".format(year, month, day_of_month)

In [9]:
# Write one row per record in new_data["items"] as tab-separated values.
with open("views_by_day_uw.tsv", "w") as f:
    print("day\tviews", file=f)

    for day_dict in new_data["items"]:
        day = parse_wikimedia_timestamp(day_dict["timestamp"])
        views = day_dict["views"]
        # sep="\t" makes the tab the only separator. Passing "\t" as a
        # positional argument would surround it with print's default spaces
        # and leave stray whitespace in both columns.
        print(day, views, sep="\t", file=f)

## file 1, version 2: data collection code extended to multiple things

In [10]:
def get_pageview_data (page_title, start="20010101", end="20230401"):
    """Fetch daily pageview data for one English Wikipedia article.

    Args:
        page_title: The raw (not percent-encoded) article title, e.g.
            "University of Washington". It is escaped here before being
            placed in the URL path.
        start: First day of the requested range as a YYYYMMDD string.
        end: Last day of the requested range as a YYYYMMDD string.

    Returns:
        The decoded JSON body of the API response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    from urllib.parse import quote

    headers = {
        'User-Agent': 'data collection from makohill@uw.edu for teaching'
    }

    # safe="" also escapes "/", so a title such as "AC/DC" stays a single path
    # segment, and "?" or "#" in a title cannot cut the path short.
    encoded_title = quote(page_title, safe="")
    url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/all-agents/{encoded_title}/daily/{start}/{end}"

    # A timeout keeps a stalled connection from hanging the notebook indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on an error status rather than returning an error payload
    # that downstream code would treat as pageview data.
    response.raise_for_status()
    return response.json()

In [11]:
# get_pageview_data("University of Washington")

In [12]:
# Fetch each page in turn, printing its title as progress and pausing one
# second between requests. Only the last response remains in `data`.
page_titles = ["University of Washington", "Washington State University", "Western Washington University"]
for title in page_titles:
    data = get_pageview_data(title)
    print(title)
    time.sleep(1)
    

University of Washington
Washington State University
Western Washington University


In [13]:
# Write one JSON document per line (JSON Lines), one line per page title.
with open("wp_combo_pageview_data.jsonl", 'w') as f:
    for title in ["University of Washington", "Washington State University", "Western Washington University"]:
        data = get_pageview_data(title)
        data_string = json.dumps(data)
        print(data_string, file=f)
        # Pause between API requests so they are not sent back-to-back.
        time.sleep(1)

## file 2, version 2: data loading and reprinting data

In [14]:
# Collect the "items" records from every line of the JSON Lines file into one list.
day_dicts = []
with open("wp_combo_pageview_data.jsonl", 'r') as f:
    # Iterate the file object directly so lines are read one at a time instead
    # of loading the whole file into a list first.
    for line in f:
        # Skip blank lines (e.g. a trailing empty line), which json.loads rejects.
        if not line.strip():
            continue
        json_data = json.loads(line)
        day_dicts.extend(json_data["items"])

In [15]:
# Total number of records gathered across all lines of the JSONL file.
len(day_dicts)

8496

In [16]:
# Sum the "views" of every record that shares the same formatted day.
total_views_by_day = {}

for day_dict in day_dicts:
    day = parse_wikimedia_timestamp(day_dict["timestamp"])
    if day not in total_views_by_day:
        # First record seen for this day: start its running total.
        total_views_by_day[day] = day_dict["views"]
    else:
        total_views_by_day[day] = total_views_by_day[day] + day_dict['views']

In [17]:
# Number of distinct days in the aggregated totals.
print(len(total_views_by_day))
# total_views_by_day

2832


In [18]:
# Write the per-day totals as tab-separated values, one row per day.
with open("views_by_day_combo.tsv", "w") as f:
    print("day\ttotal_views", file=f)

    # .items() yields each day together with its total, so no second lookup is needed.
    for day, views in total_views_by_day.items():
        # sep="\t" makes the tab the only separator. Passing "\t" as a
        # positional argument would surround it with print's default spaces
        # and leave stray whitespace in both columns.
        print(day, views, sep="\t", file=f)