In [405]:
#1 Wikipedia Page View API

In [406]:
#Identify a famous person who has been famous for at least a few years and that you have some personal interest in. 
#Use the Wikimedia API to collect page view data from the English Wikipedia article on that person. Now use that data to 
#generate a time-series visualization and include a link to it in your notebook.

#https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia.org/all-access/user/Dolly_Parton/daily/20010115/20230430


import requests
import json

def get_wikipedia_pageviews(country):
    # /metrics/pageviews/per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/" + 
           f"{country}.wikipedia.org/all-access/user/Dolly_Parton/daily/20010115/20230430")

    headers = {
        'User-Agent': 'python data collection bot by awadj@uw.edu'
    }

    response = requests.get(url, headers=headers)

    if not response.status_code == 200:
        print("ERROR, request not OK")
    
    data = response.json()
    return data

In [407]:
data = get_wikipedia_pageviews("en")
data

{'items': [{'project': 'en.wikipedia',
   'article': 'Dolly_Parton',
   'granularity': 'daily',
   'timestamp': '2015070100',
   'access': 'all-access',
   'agent': 'user',
   'views': 4137},
  {'project': 'en.wikipedia',
   'article': 'Dolly_Parton',
   'granularity': 'daily',
   'timestamp': '2015070200',
   'access': 'all-access',
   'agent': 'user',
   'views': 3778},
  {'project': 'en.wikipedia',
   'article': 'Dolly_Parton',
   'granularity': 'daily',
   'timestamp': '2015070300',
   'access': 'all-access',
   'agent': 'user',
   'views': 3848},
  {'project': 'en.wikipedia',
   'article': 'Dolly_Parton',
   'granularity': 'daily',
   'timestamp': '2015070400',
   'access': 'all-access',
   'agent': 'user',
   'views': 6142},
  {'project': 'en.wikipedia',
   'article': 'Dolly_Parton',
   'granularity': 'daily',
   'timestamp': '2015070500',
   'access': 'all-access',
   'agent': 'user',
   'views': 6272},
  {'project': 'en.wikipedia',
   'article': 'Dolly_Parton',
   'granularity'

In [408]:
len(data['items'])

2861

In [409]:
with open("Dolly_Parton_pageviews.json", 'w') as my_file:
    data_string = json.dumps(data)
    print(data_string, file=my_file)

In [410]:
with open("Dolly_Parton_pageviews.json", 'r') as input_file:
    input_data = input_file.read()

In [411]:
type(input_data)

str

In [412]:
new_data = json.loads(input_data)
new_data['items'][0]

{'project': 'en.wikipedia',
 'article': 'Dolly_Parton',
 'granularity': 'daily',
 'timestamp': '2015070100',
 'access': 'all-access',
 'agent': 'user',
 'views': 4137}

In [413]:
def clean_up_timestamp(day):
    new_time_stamp = day[0:4] + "-" + day[4:6] + "-" + day[6:8]
    return new_time_stamp

In [414]:
views_by_day = {}
for day_dict in new_data['items']:
    day = clean_up_timestamp(day_dict['timestamp'])
    views_by_day[day] = day_dict['views']

In [415]:
with open('view_by_day_Dolly_Parton.tsv', 'w') as output_file:
    print("day\tviews", file=output_file)
    for day in views_by_day:
        print(f"{day}\t{views_by_day[day]}", file=output_file)

In [416]:
day_dicts = []
with open("Dolly_Parton_pageviews.json", 'r') as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        day_dicts = day_dicts + new_data['items']

In [417]:
total_views_by_day = {}

for day_dict in day_dicts:
    day = clean_up_timestamp(day_dict["timestamp"])
    
    if day in total_views_by_day.keys():
        total_views_by_day[day] = total_views_by_day[day] + day_dict["views"]
    else:
        total_views_by_day[day] = day_dict['views']

In [418]:
with open('Dolly_Parton_pageviews.json', 'w') as output_file:
    print("day\tviews", file=output_file)
    for day in total_views_by_day:
        print(f"{day}\t{total_views_by_day[day]}", file=output_file)

In [419]:
#https://docs.google.com/spreadsheets/d/1vcUWU_bJfLhzgr-JKXEboACd7JybSxi380e0JjGHsAc/edit?usp=sharing

In [467]:
#Identify 2 other languages editions of Wikipedia that have articles on that person. Collect page view data on the article 
#in other languages and create a single visualization that shows how the dynamics and similar and/or different. 
#(Note: My approach involved creating a TSV file with multiple columns.)
    
pages = ["en", "fr",
         "es"]

with open("Dolly_Parton_Intl_Pageviews.json1", "w") as my_file:
    for country in pages:
        data = get_wikipedia_pageviews(country)
        json_string = json.dumps(data)
        print(json_string, file=my_file)

In [468]:
day_dicts = []
with open("Dolly_Parton_Intl_Pageviews.json1", 'r') as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        day_dicts = day_dicts + new_data['items']

In [469]:
len(day_dicts)

8583

In [470]:
def countryID(day):
    countrySource = day[0:2]
    return countrySource

In [471]:
day_dicts[0]

{'project': 'en.wikipedia',
 'article': 'Dolly_Parton',
 'granularity': 'daily',
 'timestamp': '2015070100',
 'access': 'all-access',
 'agent': 'user',
 'views': 4137}

In [472]:
total_views_by_day = {}

for day_dict in day_dicts:
    day = clean_up_timestamp(day_dict["timestamp"])
    country = day_dict["project"]
    if country not in total_views_by_day.keys():
        total_views_by_day[country] = {}
    if day in total_views_by_day[country].keys():
        total_views_by_day[country][day][0] += day_dict["views"]
    else:
        total_views_by_day[country][day] = day_dict['views']

In [473]:
total_views_by_day

{'en.wikipedia': {'2015-07-01': 4137,
  '2015-07-02': 3778,
  '2015-07-03': 3848,
  '2015-07-04': 6142,
  '2015-07-05': 6272,
  '2015-07-06': 4555,
  '2015-07-07': 7220,
  '2015-07-08': 3796,
  '2015-07-09': 3852,
  '2015-07-10': 3849,
  '2015-07-11': 3828,
  '2015-07-12': 6049,
  '2015-07-13': 8286,
  '2015-07-14': 7499,
  '2015-07-15': 3957,
  '2015-07-16': 3792,
  '2015-07-17': 3685,
  '2015-07-18': 4514,
  '2015-07-19': 4635,
  '2015-07-20': 3938,
  '2015-07-21': 3872,
  '2015-07-22': 3558,
  '2015-07-23': 3659,
  '2015-07-24': 4737,
  '2015-07-25': 6492,
  '2015-07-26': 4468,
  '2015-07-27': 5497,
  '2015-07-28': 5266,
  '2015-07-29': 4719,
  '2015-07-30': 4864,
  '2015-07-31': 4263,
  '2015-08-01': 7050,
  '2015-08-02': 6476,
  '2015-08-03': 5239,
  '2015-08-04': 5062,
  '2015-08-05': 4320,
  '2015-08-06': 4092,
  '2015-08-07': 4420,
  '2015-08-08': 5585,
  '2015-08-09': 6720,
  '2015-08-10': 4147,
  '2015-08-11': 4178,
  '2015-08-12': 5015,
  '2015-08-13': 5103,
  '2015-08-14': 

In [475]:
with open('Dolly_Parton_Intl_Pageviews.tsv', 'w') as output_file:
    print("source\tday\tviews", file=output_file)
    
    for day in total_views_by_day:
        innerday = total_views_by_day[day]
        for innerday, innervalue in innerday.items():
            print(f"{day}\t{innerday}\t{innervalue}", file=output_file)

#Hard to organize the excel sheet to show all 3 categories visually but heres the link
#https://docs.google.com/spreadsheets/d/1PqwsN2LYcfnC2jQydqtSPMx1JceD4IS3MNxbPnBPLzU/edit?usp=sharing

PermissionError: [Errno 13] Permission denied: 'Dolly_Parton_Intl_Pageviews.tsv'

In [491]:
#Collect page view data on the articles about Marvel Comics and DC Comics in English Wikipedia. 
import requests
import json

def get_wikipedia_pageviews(country):
    # /metrics/pageviews/per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}
    url = ("https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/" + 
           f"en.wikipedia.org/all-access/user/{page_title}/daily/20220101/20221231")

    headers = {
        'User-Agent': 'python data collection bot by awadj@uw.edu'
    }

    response = requests.get(url, headers=headers)

    if not response.status_code == 200:
        print("ERROR, request not OK")
    
    data = response.json()
    return data


In [492]:
pages = ["Marvel_Comics", "DC_Comics"]

with open("Marvel_vs_DC.jsonl", "w") as my_file:
    for page_title in pages:
        data = get_wikipedia_pageviews(page_title)
        json_string = json.dumps(data)
        print(json_string, file=my_file)

In [493]:
#Which has more total page views in 2022?
with open("Marvel_vs_DC.jsonl", 'r') as input_file:
    input_data = input_file.read()

In [494]:
comic_dicts = []
with open("Marvel_vs_DC.jsonl", 'r') as input_file:
    for line in input_file.readlines():
        new_data = json.loads(line)
        comic_dicts = comic_dicts + new_data['items']

In [495]:
comic_dicts[0]

{'project': 'en.wikipedia',
 'article': 'Marvel_Comics',
 'granularity': 'daily',
 'timestamp': '2022010100',
 'access': 'all-access',
 'agent': 'user',
 'views': 8482}

In [499]:
len(comic_dicts)

730

In [496]:
count_marvel = 0
count_dc = 0

for d in comic_dicts:
    if d['article'] == 'Marvel_Comics':
        count_marvel += 1
    elif d['article'] == 'DC_Comics':
        count_dc += 1

print(f"There are {count_marvel} articles about Marvel Comics and {count_dc} articles about DC Comics") 
#I'm not sure why they are equal, maybe because there were a limited amount of searches? The length of the dictionary we 
#got was 730, so half were marvel and half were DC


There are 365 articles about Marvel Comics and 365 articles about DC Comics


In [502]:
#Can you draw a visualization in a spreadsheet that shows this? (Again, provide a link.)

with open('Marvel_vs_DC.tsv', 'w') as output_file:
    print("comic", file=output_file)
    
    for day in comic_dicts:
            print(f"{day['article']}", file=output_file)


In [None]:
#Can you draw a visualization in a spreadsheet that shows this? (Again, provide a link.)
#https://docs.google.com/spreadsheets/d/1ZV7ORrOn-76IwCzHG-KHI5NVmrkoUoGfly1aRsJIPpI/edit?usp=sharing

In [None]:
#Were there any years when 2022's more popular page was instead the less popular of the two? How many and which ones?
#Were there any months was this reversal of relative popularity occurred? How many and which ones?
#How about any days? How many?

In [None]:
#I've made this file available which includes list of more than 100 Wikipedia articles about alternative rock bands 
#from Washington state that I built from this category in Wikipedia.[*] It's a .jsonl file. Download the file 
#(click "raw" and then save the file onto your drive). Now read it in, and request monthly page view data from all of them. 
#If you need some help with loading it in, I've included some sample code at the bottom of this page.
#Once you've done this, sum up all of the page views from all of the pages and print out a TSV file with these total numbers.
#You know the routine by now! Now, make a time series graph of these numbers and include a link in your notebook.

<b>Identify an API you will (or might!) want to use for your project.</b>
https://developer.apple.com/accessibility/

<b>Find documentation for that API and include links in your notebook.</b>
https://developer.apple.com/documentation/accessibility

<b>What are the API endpoints you plan to use? What are the parameters you will need to use at that endpoint?</b>

This was a bit hard to navigate for me unfortunately. Mako mentioned I might also use some bibliometrics related to articles for ASL. I havent emailed him yet but I'm going to email him as soon as I can.


<b>Is there a Python module that exists that helps make contact with the API? (See if you can you find example code on how to use it).    
    If so, download it, install it, and import it into your notebook.
Does the API require authentication? Does it need to be approved?
    If so, sign up for a developer account and get your keys. (Do this early because it often takes time for these accounts to be approved.) Does the API list rate limits? Does it make any requests about how you should use it?
Make a single API call, either directly using requests or using the Python module you have used. It doesn't matter for what. The goal is that you can get something'.</b>

Not that I know of. My project initially was related to cleaning data on psychopy and I'm still thinking of how to incorporate APIs that are related to my ASL test Battery data. Accessibility looks good so far but I may be asking for your help in class on how to access the data on the links I showed above or get advice on modules for the bibliometrics mako mentioned.