In [1]:
# 1. First, modify the code from the first set of notebooks I used in the Community Data Science Course (Spring 2023), Week 6 lecture, to download data (and metadata) about revisions to the 5 articles you chose from Wikipedia.

import datetime
import json
import urllib.parse

import requests


def get_article_revision_json_from_wikipedia(title):
    """Download revision metadata for one English Wikipedia article.

    Pages through the MediaWiki ``action=query`` / ``prop=revisions`` endpoint,
    requesting up to 500 revisions per call and following the ``continue``
    token until the API stops returning one.

    Args:
        title: Article title to look up on en.wikipedia.org.

    Returns:
        A list of revision dicts exactly as returned by the API (requested
        fields: flags, timestamp, user, size, ids). The list is empty when the
        returned page entry carries no "revisions" key.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
        RuntimeError: If the API answers with an ``error`` payload.
    """
    revisions = []
    # create a base URL for the WIKI API (HTTPS avoids an http -> https redirect)
    wp_api_url = "https://en.wikipedia.org/w/api.php"
    # Identify the script the same way the pageview helper does; replace the
    # placeholder with real contact details before running.
    headers = {
        'User-Agent': 'data collection from <your email address> for studing'
    }
    # API parameters to get revision data
    parameters = {
        'action': 'query',
        'titles': title,
        'prop': 'revisions',
        'rvprop': 'flags|timestamp|user|size|ids',
        'rvlimit': 500,
        'format': 'json'
    }
    while True:
        # make API request and fail loudly on HTTP-level errors
        call = requests.get(wp_api_url, params=parameters, headers=headers)
        call.raise_for_status()
        # convert API response to JSON
        api_answer = call.json()
        # The API can report failures inside the JSON body; surface them
        # instead of failing later with an opaque KeyError on "query".
        if 'error' in api_answer:
            raise RuntimeError(f"Wikipedia API error for {title!r}: {api_answer['error']}")
        # a single title was requested, so only the first page entry matters
        page = next(iter(api_answer["query"]["pages"].values()), {})
        revisions.extend(page.get("revisions", []))
        # no 'continue' key means every revision has been fetched
        if 'continue' not in api_answer:
            break
        # carry the continuation token(s) into the next request
        parameters.update(api_answer['continue'])
    return revisions


def get_article_pageview_json_from_wikimedia(title, start_date):
    """Download daily page-view counts for one English Wikipedia article.

    Queries the Wikimedia REST pageviews "per-article" endpoint
    (all-access, all-agents, daily granularity) from ``start_date`` up to
    today's date.

    Args:
        title: Article title; spaces are converted to underscores and the
            result is URI-encoded before being placed in the URL path.
        start_date: A ``date``/``datetime`` (anything with ``strftime``) giving
            the first day to request.

    Returns:
        The ``"items"`` list from the JSON response.

    Raises:
        RuntimeError: If the response status is not 200.
    """
    end_date = datetime.datetime.now().date()
    # The title is a URL path segment: use underscores for spaces and
    # percent-encode everything else (":", "(", ")", "/", "?", ...).
    article = urllib.parse.quote(title.replace(" ", "_"), safe="")
    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        f"en.wikipedia/all-access/all-agents/{article}/daily/"
        f"{start_date.strftime('%Y%m%d')}/{end_date.strftime('%Y%m%d')}"
    )
    headers = {
        'User-Agent': 'data collection from <your email address> for studing'
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Stop here with the real cause rather than printing and then failing
        # on a missing "items" key.
        raise RuntimeError(
            f"ERROR! status was not 200 (got {response.status_code}) for {url}"
        )
    return response.json()["items"]


# list of Avatar-related article titles
article_titles = [
    'Avatar (2009 film)',
    'Avatar: The Last Airbender',
    'Avatar: The Last Airbender (season 1)',
    'Avatar: The Last Airbender (season 2)',
    'Avatar: The Last Airbender (season 3)'
]

# Raw API output per article: {"revisions": [...], "pageviews": [...]}.
raw_date = {title: {} for title in article_titles}

# Per-article container keyed by title; this cell only creates the empty
# dicts and does not fill them.
data = {title: {} for title in article_titles}


for title in article_titles:
    revisions = get_article_revision_json_from_wikipedia(title)
    if not revisions:
        # Without this guard min() below fails with an unhelpful message.
        raise ValueError(f"no revisions returned for {title!r}; check the article title")
    # Timestamps are fixed-width "%Y-%m-%dT%H:%M:%SZ" strings, so the smallest
    # string is the earliest revision.
    first_timestamp = min(revision["timestamp"] for revision in revisions)
    # Keep the timestamp in UTC instead of converting to the machine's local
    # zone, so the derived start day does not depend on where this runs.
    start_date = datetime.datetime.strptime(
        first_timestamp, "%Y-%m-%dT%H:%M:%SZ"
    ).replace(tzinfo=datetime.timezone.utc)
    pageviews = get_article_pageview_json_from_wikimedia(title, start_date)
    raw_date[title] = {"revisions": revisions, "pageviews": pageviews}

# Persist everything that was downloaded in a single JSON file.
with open("raw_data.json", "w") as file_out:
    json.dump(raw_date, file_out)

In [3]:
# Export one tab-separated file per article, one row per date key in data[title].
for title in article_titles:
    with open(f"{title}.tsv", "w") as file_out:
        file_out.write("date\trevision\tsize\tsize_diff\tviews\n")
        last_size = 0
        per_date = data[title]
        for date in sorted(per_date):
            record = per_date[date]
            # Entries without a "size" reuse the previous row's size,
            # which makes their size_diff 0.
            curr_size = record.get("size", last_size)
            size_diff = abs(last_size - curr_size)
            revision_count = record.get('revision_count', 0)
            views = record.get('views', 0)
            file_out.write(f"{date}\t{revision_count}\t{curr_size}\t{size_diff}\t{views}\n")
            last_size = curr_size

1. What is your proposed unit of analysis? In other words, if/when you end up building something like a spreadsheet, what are rows going to represent?

Each row represents one date (one day) for a given article.

2. What specific measures associated with each unit do you want to collect? In other words, what are the columns in the spreadsheet going to be?

The columns are the date, the revision count, the article size, the size difference (the absolute change in the size of the wiki content compared to the previous day), and the page views.

3. Tell us what you've learned about the API:
    Are you going to be able to get the data you want with one API call or many? If more than one, how many?
    If it's more than one call, how will you know when you have collected all your data?

For revisions, many calls are needed per article (each call returns at most 500 revisions); for page views, one call per article is enough. I know the revision data is complete when the API response no longer contains a `continue` key.

4. Make one API call and save the output to your disk in either a .json or .jsonl file. Be sure to share the code you used to do this. Be sure not to include any API keys in your notebook!

I saved all of the data to `raw_data.json`.

5. How big is the JSON file that you saved on your disk (i.e., in bytes or kilobytes)? If it is not your full dataset, what is your estimate for how much larger the full dataset will be? How big will the total dataset be? Is that a problem?

It is 6,562,283 bytes (about 6.6 MB).