In [1]:
import json
import pandas as pd

In [2]:
# Flat list of cleaned revision records (one dict per revision), collected
# across every page found in the input file.
revisions = []

# Each non-empty line of the input file is parsed as one JSON API response.
# Article titles and usernames may contain non-ASCII characters, so the file
# is decoded as UTF-8 explicitly instead of relying on the platform default.
with open("rock_band_wp_revisions.jsonl", 'r', encoding="utf-8") as input_file:
    # iterate over the file lazily rather than loading every line with readlines()
    for line in input_file:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue

        api_answer = json.loads(line)

        # get the list of pages from the json object
        pages = api_answer["query"]["pages"]

        # for every page, (there should always be only one) get its revisions:
        for page in pages.values():
            title = page['title']

            # a page entry that carries no "revisions" key simply contributes
            # no rows, instead of aborting the whole run with a KeyError
            query_revisions = page.get("revisions", [])

            # for every revision, first we do some cleaning up
            for rev in query_revisions:
                # let's continue/skip this revision if the user is hidden
                if "userhidden" in rev:
                    continue

                # 1: add a title field for the article because we're going to mix them together
                rev["title"] = title

                # 2: let's "recode" anon so it's true or false instead of present/missing
                rev["anon"] = "anon" in rev

                # 3: let's recode "minor" in the same way
                rev["minor"] = "minor" in rev

                # we're going to change the timestamp to make it work a little better in
                # excel/spreadsheets: "2023-04-27T02:30:06Z" becomes "2023-04-27 02:30:06"
                rev["timestamp"] = rev["timestamp"].replace("T", " ").replace("Z", "")

                # finally, save the revisions we've seen to a variable
                revisions.append(rev)

In [3]:
# Build one table from the list of revision dicts: one row per revision,
# one column per key found in the records.
revisions_dataframe = pd.DataFrame(data=revisions)

In [4]:
# Display the combined revisions table as this cell's output; the rendered
# output below shows only the first and last rows, with "..." in between.
revisions_dataframe

Unnamed: 0,revid,parentid,minor,user,timestamp,size,title,anon
0,1151924741,1150442184,True,Sepguilherme,2023-04-27 02:30:06,92419,Nirvana (band),False
1,1150442184,1146907324,True,Majash2020,2023-04-18 06:20:57,92417,Nirvana (band),False
2,1146907324,1146566677,False,BoxxyBoy,2023-03-27 17:45:26,92840,Nirvana (band),False
3,1146566677,1146566554,False,Popcornfud,2023-03-25 17:38:53,92900,Nirvana (band),False
4,1146566554,1146566457,False,Popcornfud,2023-03-25 17:38:04,92914,Nirvana (band),False
...,...,...,...,...,...,...,...,...
78863,1121918079,1054500785,False,67.87.161.113,2022-11-14 21:21:05,5373,Gun Outfit,True
78864,1054500785,1018151506,False,Seemaba,2021-11-10 11:27:09,5373,Gun Outfit,False
78865,1018151506,1018150539,False,Lewishhh,2021-04-16 14:53:51,5369,Gun Outfit,False
78866,1018150539,1018150266,False,Lewishhh,2021-04-16 14:48:21,5252,Gun Outfit,False


In [5]:
# Write the combined table to disk as a tab-separated file. The row index is
# written too (pandas default), so it appears as the first column of the file.
output_path = "rock_bands_wp_revisions-NEW.tsv"
revisions_dataframe.to_csv(output_path, sep="\t")