In [1]:
import requests
import json

In [2]:
def get_article_revision_json(title):
    """Fetch the full revision history of one English Wikipedia article.

    Queries the MediaWiki API for the revisions of ``title`` (flags,
    timestamp, user, size and ids), asking for up to 500 revisions per
    request, and keeps following the ``continue`` block of each response
    until the API stops returning one.

    Parameters
    ----------
    title : str
        Article title, e.g. ``"Soundgarden"``.

    Returns
    -------
    list of dict
        One decoded JSON response per API request, in the order received.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.
    RuntimeError
        If the API reports an error inside the JSON body.
    """
    api_answers = []

    # The request built below is equivalent to, e.g.:
    # https://en.wikipedia.org/w/api.php?action=query&titles=Soundgarden&prop=revisions&rvprop=flags|timestamp|user|size|ids&rvlimit=500&format=json
    # HTTPS is used directly so each call does not go through an HTTP redirect.
    wp_api_url = "https://en.wikipedia.org/w/api.php"

    parameters = {'action' : 'query',
                  'titles' : title,
                  'prop' : 'revisions',
                  'rvprop' : 'flags|timestamp|user|size|ids',
                  'rvlimit' : 500,
                  'format' : 'json',
                   }

    # loop until a response arrives without a 'continue' block
    while True:
        # requests URL-encodes the parameters (including non-ASCII titles);
        # the timeout keeps a stalled connection from hanging the notebook
        call = requests.get(wp_api_url, params=parameters, timeout=30)

        # fail loudly on 4xx/5xx instead of trying to parse an error page
        call.raise_for_status()
        api_answer = call.json()

        # the API signals request-level problems in the body, with HTTP 200
        if 'error' in api_answer:
            raise RuntimeError(
                f"Wikipedia API error for {title!r}: {api_answer['error']}")

        api_answers.append(api_answer)

        # 'continue' tells us there are more revisions to fetch
        if 'continue' in api_answer:
            # merge the continuation tokens into the next request's parameters
            parameters.update(api_answer['continue'])
        else:
            break

    return api_answers

In [3]:
# the Wikipedia articles about the TV series "Lost" whose histories we download
lost_articles = ['Lost_(TV_series)','List_of_Lost_characters','List_of_Lost_episodes','Mythology_of_Lost','Lost_Original_Television_Soundtracks']

# Write every raw API response as one JSON object per line (JSONL).
# The encoding is given explicitly, matching the other files this notebook
# writes, so the result does not depend on the platform default.
with open("Lost_wp_revisions.jsonl", 'w', encoding='utf-8') as output_file:
    for page_title in lost_articles:
        print(f"now working on: {page_title}")
        api_answers = get_article_revision_json(page_title)
        for api_answer in api_answers:
            print(json.dumps(api_answer), file=output_file)

now working on: Lost_(TV_series)
now working on: List_of_Lost_characters
now working on: List_of_Lost_episodes
now working on: Mythology_of_Lost
now working on: Lost_Original_Television_Soundtracks


In [4]:
# flat list of cleaned-up revision dicts from every article
revisions = []

# explicit encoding so reading does not depend on the platform default
with open("Lost_wp_revisions.jsonl", 'r', encoding='utf-8') as input_file:
    # iterate over the file lazily instead of loading every line with readlines()
    for line in input_file:
        # tolerate blank lines (e.g. a stray trailing newline)
        if not line.strip():
            continue

        api_answer = json.loads(line)

        # get the pages from the json object (a dict keyed by page id)
        pages = api_answer["query"]["pages"]

        # for every page (there should always be only one) get its revisions
        for page_id, page in pages.items():
            # a page entry without a "revisions" key (e.g. a title that does
            # not exist) has nothing to contribute, so skip it rather than
            # crash with a KeyError
            query_revisions = page.get("revisions")
            if not query_revisions:
                continue
            title = page['title']

            for rev in query_revisions:
                # skip this revision if the user is hidden
                if "userhidden" in rev:
                    continue

                # 1: add a title field because revisions of several articles are mixed together
                rev["title"] = title

                # 2 and 3: recode "anon" and "minor" as True/False instead of present/missing
                rev["anon"] = "anon" in rev
                rev["minor"] = "minor" in rev

                # turn e.g. "2007-01-29T12:00:00Z" into "2007-01-29 12:00:00",
                # which works a little better in spreadsheets
                rev["timestamp"] = rev["timestamp"].replace("T", " ").replace("Z", "")

                # finally, keep the cleaned revision
                revisions.append(rev)

In [5]:
# total number of revisions kept in `revisions` (the list built in the previous cell)
num_edits = len(revisions)
# bare expression so the notebook displays the count
num_edits

32816

In [6]:
# tally the revisions flagged as anonymous and work out their share of all edits
num_anon = sum(1 for rev in revisions if rev["anon"])
prop_anon = num_anon / num_edits

# report the three figures, one per line
print(
    f"total edits: {num_edits}",
    f"anon edits: {num_anon}",
    f"proportion anon: {prop_anon}",
    sep="\n",
)

total edits: 32816
anon edits: 12928
proportion anon: 0.393954168698196


In [7]:
# build a mapping from calendar day to the number of edits made on that day
edits_by_day = {}
for rev in revisions:
    # the first ten characters of the timestamp are the date part
    day_string = rev['timestamp'][:10]

    # start from zero the first time a day is seen, then add this edit
    edits_by_day[day_string] = edits_by_day.get(day_string, 0) + 1

In [8]:
# save the per-day edit counts as a tab-separated file for use in a spreadsheet
with open("Lost_edits_by_day.tsv", "w", encoding='utf-8') as output_file:
    # header row
    print("date\tedits", file=output_file)

    # one row per day: the date, a tab, then the number of edits
    for day_string, edit_count in edits_by_day.items():
        print(day_string, edit_count, sep="\t", file=output_file)

In [9]:
# also save every cleaned revision of the Lost articles as a TSV, one row per revision
with open('Lost_wp_revisions.tsv', 'w', encoding='utf-8') as output_file:
    # header row naming the five columns
    print("title\ttimestamp\tuser\tminor\tanon", file=output_file)
    for rev in revisions:
        # pull the fields out in column order and join them with tabs
        fields = (rev['title'], rev['timestamp'], rev['user'], rev['minor'], rev['anon'])
        print("\t".join(str(field) for field in fields), file=output_file)

In [10]:
# NOTE(review): `day_string` is not defined in this cell; it is the loop variable
# left over from the earlier per-day loops, so this prints whichever day was
# iterated last and only works if those cells have already been run.
print(day_string)

2007-01-29


In [11]:
# fraction of all collected revisions that were made by anonymous editors
# NOTE(review): this is the same calculation as `prop_anon` above, since
# `num_edits` is `len(revisions)`; the variable name misspells "proportion".
porportion_anon = num_anon / len(revisions)
print(porportion_anon)

0.393954168698196


In [12]:
# collect the revisions marked as minor, then compute their share of all edits
minor_revisions = [rev for rev in revisions if rev["minor"]]
num_minor = len(minor_revisions)
prop_minor = num_minor / num_edits

# report the three figures, one per line
summary_lines = [
    f"total edits: {num_edits}",
    f"minor edits: {num_minor}",
    f"proportion minor: {prop_minor}",
]
print("\n".join(summary_lines))

total edits: 32816
minor edits: 6933
proportion minor: 0.21126889322281814


In [13]:
#Chart: https://docs.google.com/spreadsheets/d/1oNLHzboH6MreKBzO7iuBG2nfaxInNEwDq-N1YzTG0QA/edit?usp=sharing