In [1]:
import json

In [2]:
# Load every revision from the JSON-lines dump of API answers into one flat
# list. Each revision dict that is kept gets cleaned up along the way:
#   * "title"     - title of the page the revision belongs to
#   * "anon"      - recoded to True/False (originally marked by presence only)
#   * "minor"     - recoded to True/False in the same way
#   * "timestamp" - "T" replaced by a space and "Z" removed
# Revisions whose user is hidden are skipped entirely.
revisions = []

# read as UTF-8 explicitly: the output files are written as UTF-8 too, and the
# platform default encoding is not guaranteed to handle non-ASCII titles/users
with open("rock_band_wp_revisions.jsonl", 'r', encoding='utf-8') as input_file:
    # iterate over the file object itself so only one line is in memory at a time
    for line in input_file:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue

        api_answer = json.loads(line)

        # get the list of pages from the json object
        pages = api_answer["query"]["pages"]

        # for every page (there should always be only one) get its revisions
        for page_id, page in pages.items():
            title = page['title']

            # a page entry without a "revisions" key contributes nothing
            # instead of aborting the whole load with a KeyError
            query_revisions = page.get("revisions", [])

            # for every revision, first we do some cleaning up
            for rev in query_revisions:
                # skip this revision if the user is hidden
                if "userhidden" in rev:
                    continue

                # 1: add a title field for the article because we're going to mix them together
                rev["title"] = title

                # 2: recode "anon" so it's True or False instead of present/missing
                rev["anon"] = "anon" in rev

                # 3: recode "minor" in the same way
                rev["minor"] = "minor" in rev

                # change the timestamp to make it work a little better in excel/spreadsheets
                rev["timestamp"] = rev["timestamp"].replace("T", " ").replace("Z", "")

                # finally, save the revisions we've seen to a variable
                revisions.append(rev)

In [3]:
# total number of revisions collected so far (one list entry per kept revision)
num_edits = len(revisions)
# bare expression on the last line so the notebook displays the value
num_edits

78864

In [4]:
# tally the revisions flagged as anonymous
num_anon = sum(1 for rev in revisions if rev["anon"])

# share of all edits that were made anonymously
prop_anon = num_anon / num_edits

print(f"total edits: {num_edits}")
print(f"anon edits: {num_anon}")
print(f"proportion anon: {prop_anon}")

total edits: 78864
anon edits: 30668
proportion anon: 0.38887198214648


In [5]:
# tally edits per day; the day key is the first 10 characters of the
# timestamp string (presumably the YYYY-MM-DD part)
edits_by_day = {}
for rev in revisions:
    day_string = rev['timestamp'][:10]

    # days not seen before start from zero, then this edit is added
    edits_by_day[day_string] = edits_by_day.get(day_string, 0) + 1

In [6]:
# dump the per-day edit counts to a tab-separated file we could analyze in google docs
with open("rock_bands_edits_by_day.tsv", "w", encoding='utf-8') as output_file:
    # header row
    print("date\tedits", file=output_file)

    # one row per day, in the order the days were inserted into the dict
    for day_string, count in edits_by_day.items():
        print("\t".join([day_string, str(count)]), file=output_file)

In [7]:
# export every cleaned revision as one tab-separated row, in case we want to
# load the full data set elsewhere too
with open('rock_bands_wp_revisions.tsv', 'w', encoding='utf-8') as output_file:
    # header row naming the five exported columns
    print("title\ttimestamp\tuser\tminor\tanon", file=output_file)

    for rev in revisions:
        # gather the columns in header order, then stringify and tab-join them
        fields = [rev['title'], rev['timestamp'], rev['user'], rev['minor'], rev['anon']]
        print("\t".join(str(field) for field in fields), file=output_file)