In [47]:
import json

In [48]:
# Flatten the Wikipedia API dump (one JSON answer per line) into a single
# list of cleaned-up revision dictionaries.
revisions = []

# Read explicitly as UTF-8 (the encoding the output cells of this notebook
# also use) instead of relying on the platform's default encoding.
with open("Starwars_articles_revisions.jsonl", 'r', encoding='utf-8') as input_file:
    # iterate over the file lazily instead of loading every line up front
    for line in input_file:
        # a blank line is not a JSON document; skip it rather than crash
        if not line.strip():
            continue

        api_answer = json.loads(line)

        # get the pages from the json object (a dict keyed by page id)
        pages = api_answer["query"]["pages"]

        # for every page (there should always be only one) get its revisions
        for page_id, page in pages.items():
            # a page entry without a "revisions" key contributes nothing
            # instead of aborting the whole load with a KeyError
            query_revisions = page.get("revisions", [])
            title = page['title']

            # for every revision, first we do some cleaning up
            for rev in query_revisions:
                # skip this revision if the user is hidden
                if "userhidden" in rev:
                    continue

                # 1: add a title field for the article because we're going to mix them together
                rev["title"] = title

                # 2: recode "anon" so it's True/False instead of present/missing
                rev["anon"] = "anon" in rev

                # 3: recode "minor" in the same way
                rev["minor"] = "minor" in rev

                # reshape the timestamp ("T" -> space, drop "Z") so it works
                # a little better in excel/spreadsheets
                rev["timestamp"] = rev["timestamp"].replace("T", " ").replace("Z", "")

                # finally, save the revisions we've seen to a variable
                revisions.append(rev)

In [49]:
# Number of entries in `revisions`, i.e. the Wikipedia revisions that were not
# skipped for having a hidden user. The bare name on the last line makes the
# notebook display the value.
num_edits = len(revisions)
num_edits

43914

In [50]:
# Share of the Wikipedia revisions whose "anon" flag is set.

# Take the denominator from `revisions` itself: `num_edits` is reassigned
# further down for the Fandom data, so relying on whatever value it currently
# holds would give a wrong proportion if this cell were re-run afterwards.
num_edits = len(revisions)

# count the revisions whose recoded "anon" flag is truthy
num_anon = sum(1 for rev in revisions if rev["anon"])

# report 0.0 instead of raising ZeroDivisionError when nothing was loaded
prop_anon = num_anon / num_edits if num_edits else 0.0

print(f"total edits: {num_edits}")
print(f"anon edits: {num_anon}")
print(f"proportion anon: {prop_anon}")

total edits: 43914
anon edits: 15804
proportion anon: 0.359885230222708


In [51]:
# Share of the Wikipedia revisions whose "minor" flag is set.

# Take the denominator from `revisions` itself: `num_edits` is reassigned
# further down for the Fandom data, so relying on whatever value it currently
# holds would give a wrong proportion if this cell were re-run afterwards.
num_edits = len(revisions)

# count the revisions whose recoded "minor" flag is truthy
num_minor = sum(1 for rev in revisions if rev["minor"])

# report 0.0 instead of raising ZeroDivisionError when nothing was loaded
prop_minor = num_minor / num_edits if num_edits else 0.0

print(f"total edits: {num_edits}")
print(f"minor edits: {num_minor}")
print(f"proportion minor: {prop_minor}")

total edits: 43914
minor edits: 10200
proportion minor: 0.23227216832900668


In [52]:
# Tally how many Wikipedia revisions fall on each calendar day.
edits_by_day = {}
for rev in revisions:
    # the first ten characters of the timestamp are the date part
    day_string = rev['timestamp'][:10]

    # start from zero the first time a day shows up, then add this revision
    edits_by_day[day_string] = edits_by_day.get(day_string, 0) + 1

In [53]:
# Export the per-day edit counts as a two-column, tab-separated file.
with open("Starwars_editbyday.tsv", "w", encoding='utf-8') as output_file:
    # header row
    output_file.write("date\tedits\n")

    # one row per day, in the dictionary's own order
    for day_string, edit_count in edits_by_day.items():
        output_file.write(f"{day_string}\t{edit_count}\n")

https://docs.google.com/spreadsheets/d/1PkxUCjd3PoxPuCbu58FnsK5f81nwnmVXoO5bE8aktEY/edit#gid=0

In [54]:
# Flatten the Fandom API dump (one JSON answer per line) into a single
# list of cleaned-up revision dictionaries.
revisionsfandom = []

# Read explicitly as UTF-8 (the encoding the output cells of this notebook
# also use) instead of relying on the platform's default encoding.
with open("FandomStarwars_articles_revisions.jsonl", 'r', encoding='utf-8') as input_file:
    # iterate over the file lazily instead of loading every line up front
    for line in input_file:
        # a blank line is not a JSON document; skip it rather than crash
        if not line.strip():
            continue

        api_answer = json.loads(line)

        # get the pages from the json object (a dict keyed by page id)
        pages = api_answer["query"]["pages"]

        # for every page (there should always be only one) get its revisions
        for page_id, page in pages.items():
            # a page entry without a "revisions" key contributes nothing
            # instead of aborting the whole load with a KeyError
            query_revisions = page.get("revisions", [])
            title = page['title']

            # for every revision, first we do some cleaning up
            for rev in query_revisions:
                # skip this revision if the user is hidden
                if "userhidden" in rev:
                    continue

                # 1: add a title field for the article because we're going to mix them together
                rev["title"] = title

                # 2: recode "anon" so it's True/False instead of present/missing
                rev["anon"] = "anon" in rev

                # 3: recode "minor" in the same way
                rev["minor"] = "minor" in rev

                # reshape the timestamp ("T" -> space, drop "Z") so it works
                # a little better in excel/spreadsheets
                rev["timestamp"] = rev["timestamp"].replace("T", " ").replace("Z", "")

                # finally, save the revisions we've seen to a variable
                revisionsfandom.append(rev)

In [55]:
# Number of entries in `revisionsfandom`, i.e. the Fandom revisions that were
# not skipped for having a hidden user. This reuses (and overwrites) the
# `num_edits` name that held the Wikipedia count earlier in the notebook.
num_edits = len(revisionsfandom)
num_edits

4752

In [56]:
# Share of the Fandom revisions whose "anon" flag is set.

# Take the denominator from `revisionsfandom` itself: `num_edits` is also used
# for the Wikipedia data earlier in the notebook, so relying on whatever value
# it currently holds could silently mix the two datasets.
num_edits = len(revisionsfandom)

# count the revisions whose recoded "anon" flag is truthy
num_anon = sum(1 for rev in revisionsfandom if rev["anon"])

# report 0.0 instead of raising ZeroDivisionError when nothing was loaded
prop_anon = num_anon / num_edits if num_edits else 0.0

print(f"total edits: {num_edits}")
print(f"anon edits: {num_anon}")
print(f"proportion anon: {prop_anon}")

total edits: 4752
anon edits: 179
proportion anon: 0.03766835016835017


In [58]:
# Share of the Fandom revisions whose "minor" flag is set.

# Take the denominator from `revisionsfandom` itself: `num_edits` is also used
# for the Wikipedia data earlier in the notebook, so relying on whatever value
# it currently holds could silently mix the two datasets.
num_edits = len(revisionsfandom)

# count the revisions whose recoded "minor" flag is truthy
num_minor = sum(1 for rev in revisionsfandom if rev["minor"])

# report 0.0 instead of raising ZeroDivisionError when nothing was loaded
prop_minor = num_minor / num_edits if num_edits else 0.0

print(f"total edits: {num_edits}")
print(f"minor edits: {num_minor}")
print(f"proportion minor: {prop_minor}")

total edits: 4752
minor edits: 1546
proportion minor: 0.3253367003367003


In [59]:
# Tally how many Fandom revisions fall on each calendar day.
# (This rebinds `edits_by_day`, replacing the Wikipedia tally built earlier.)
edits_by_day = {}
for rev in revisionsfandom:
    # the first ten characters of the timestamp are the date part
    day_string = rev['timestamp'][:10]

    # start from zero the first time a day shows up, then add this revision
    edits_by_day[day_string] = edits_by_day.get(day_string, 0) + 1

In [60]:
# Export the per-day edit counts as a two-column, tab-separated file.
with open("fandomStarwars_editbyday.tsv", "w", encoding='utf-8') as output_file:
    # header row
    output_file.write("date\tedits\n")

    # one row per day, in the dictionary's own order
    for day_string, edit_count in edits_by_day.items():
        output_file.write(f"{day_string}\t{edit_count}\n")

https://docs.google.com/spreadsheets/d/1PkxUCjd3PoxPuCbu58FnsK5f81nwnmVXoO5bE8aktEY/edit#gid=0

In [71]:
# Export one row per Wikipedia revision: article title, day, and the
# revision's "size" value, tab-separated.
with open("starwars_sizebyday.tsv", "w", encoding='utf-8') as output_file:
    # NOTE(review): never filled in this cell; kept in case a later cell reads it
    size_by_day = {}

    # header row
    output_file.write("article\tdate\tsize\n")

    for rev in revisions:
        # only revisions whose title is exactly a str are written out
        if type(rev['title']) is not str:
            continue

        # the first ten characters of the timestamp are the date part
        day_string = rev['timestamp'][:10]
        output_file.write(f"{rev['title']}\t{day_string}\t{rev['size']}\n")

https://docs.google.com/spreadsheets/d/1PkxUCjd3PoxPuCbu58FnsK5f81nwnmVXoO5bE8aktEY/edit#gid=1699429451