# Wikipedia Data

In [None]:
import json
import datetime
import requests
import pandas as pd
from tqdm.notebook import tqdm
import time
import os
from collections import defaultdict, Counter
import pickle

with open('wmapicred.json', 'r') as f:
    credentials = json.load(f)

In [None]:
def generate_pageid_strings(dict_list, max_count=50, max_length=300):
    combined_string = ''
    count = 0

    for item in dict_list:
        pageid = str(item['pageid'])
        if combined_string:
            # Check if adding this pageid will exceed the length limit or count limit
            if len(combined_string) + len(pageid) + 1 > max_length or count >= max_count:
                yield combined_string
                combined_string = pageid
                count = 1  # Reset count for the new string
            else:
                combined_string += '|' + pageid
                count += 1
        else:
            combined_string = pageid
            count = 1

    # Yield the last combined string if it's not empty
    if combined_string:
        yield combined_string


def generate_titles_strings(inlist, max_count=50, max_length=300):
    combined_string = ''
    count = 0

    for title in inlist:
        if combined_string:
            # Check if adding this pageid will exceed the length limit or count limit
            if len(combined_string) + len(title) + 1 > max_length or count >= max_count:
                yield combined_string
                combined_string = title
                count = 1  # Reset count for the new string
            else:
                combined_string += '|' + title
                count += 1
        else:
            combined_string = title
            count = 1

    # Yield the last combined string if it's not empty
    if combined_string:
        yield combined_string

In [None]:
# Python 3

# Get today's date in YYYY/MM/DD format.

today = datetime.datetime.now()
date = today.strftime('%Y/%m/%d')

# Choose your language, and get today's featured content.
language_code = 'en' # English
headers = {
  'Authorization': f'Bearer {credentials["access_token"]}',
  # 'User-Agent': 'YOUR_APP_NAME (YOUR_EMAIL_OR_CONTACT_PAGE)'
}

base_url = 'https://api.wikimedia.org/feed/v1/wikipedia/'
url = base_url + language_code + '/featured/' + date
response = requests.get(url, headers=headers)

In [None]:
response.text

## Base Query

In [None]:
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

SEARCHPAGE = "#MeToo"

PARAMS = {
    "action": "query",
    "format": "json",
    "list": "search",
    "srsearch": SEARCHPAGE,
    "srlimit": 500,
    "sroffset": 0
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()

# if DATA['query']['search'][0]['title'] == SEARCHPAGE:
    # print("Your search page '" + SEARCHPAGE + "' exists on English Wikipedia")

In [None]:
DATA['query']['searchinfo']['totalhits']

In [None]:
# with pagination
S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"
SEARCHPAGE = "#MeToo"
PARAMS = {
    "action": "query",
    "format": "json",
    "list": "search",
    "srsearch": SEARCHPAGE,
    "srlimit": 500,
    "sroffset": 0
}
TO_CONTINUE = True
RESULTS = []

while TO_CONTINUE:
    print(f"Processing Offset Num {PARAMS['sroffset']}")
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    RESULTS.extend(DATA['query']['search'])
    if 'error' in DATA:
        raise Exception(DATA['error'])
    if 'warnings' in DATA:
        print(DATA['warnings'])
    if 'continue' not in DATA:
        break
    elif '||' in DATA['continue']['continue']:
        TO_CONTINUE=True
        PARAMS['sroffset']=DATA['continue']['sroffset']

In [None]:
len(RESULTS)

In [None]:
RESULTS

In [None]:
savepath = '/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/metoo_base.json'

with open(savepath, 'w') as file:
    json.dump(RESULTS, file)

## Base Query with changed sorting

In [None]:
srsorts = [
    'create_timestamp_asc',
    'create_timestamp_desc',
    'incoming_links_asc',
    'incoming_links_desc',
    'just_match',
    'last_edit_asc',
    'last_edit_desc',
    'none',
    'random',
    'relevance',
    'user_random'
]

In [None]:
RESULTS = []

# with pagination
S = requests.Session()

for srsort in srsorts:
    print(f'SORT TYPE: {srsort}')
    URL = "https://en.wikipedia.org/w/api.php"
    SEARCHPAGE = "#MeToo"
    PARAMS = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": SEARCHPAGE,
        "srlimit": 500,
        "sroffset": 0,
        "srsort": srsort
    }
    TO_CONTINUE = True

    while TO_CONTINUE:
        print(f"Processing Offset Num {PARAMS['sroffset']}")
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        RESULTS.extend(DATA['query']['search'])
        if 'error' in DATA:
            raise Exception(DATA['error'])
        if 'warnings' in DATA:
            print(DATA['warnings'])
        if 'continue' not in DATA:
            break
        elif '||' in DATA['continue']['continue']:
            TO_CONTINUE=True
            PARAMS['sroffset']=DATA['continue']['sroffset']

In [None]:
# discard duplicate results
def remove_duplicates(dict_list):
    unique_pageids = set()
    unique_dicts = []

    for d in dict_list:
        pageid = d['pageid']
        if pageid not in unique_pageids:
            unique_pageids.add(pageid)
            unique_dicts.append(d)

    return unique_dicts


RESULTS_DEDUPED = remove_duplicates(RESULTS)
print(len(RESULTS))
print(len(RESULTS_DEDUPED))

In [None]:
savepath = '/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/metoo_base_multisort.json'

with open(savepath, 'w') as file:
    json.dump(RESULTS, file)

## What about just random?

In [None]:
savepath = '/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/metoo_base_multisort_random_aug.json'

with open(savepath, 'r') as file:
    INRES = json.load(file)

In [None]:
# read in max results collected

counter = 0
max_tries = 500
while True:
    print(f'Call number {counter}')
    counter += 1
    URL = "https://en.wikipedia.org/w/api.php"
    SEARCHPAGE = "#MeToo"
    PARAMS = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": SEARCHPAGE,
        "srlimit": 500,
        "sroffset": 0,
        "srsort": 'random'
    }
    TO_CONTINUE = True

    # while TO_CONTINUE:
        # print(f"Processing Offset Num {PARAMS['sroffset']}")
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    original_uniq = len(INRES)
    INRES.extend(DATA['query']['search'])
    INRES = remove_duplicates(INRES)
    new_uniq = len(INRES)
    if 'error' in DATA:
        raise Exception(DATA['error'])
    if 'warnings' in DATA:
        print(DATA['warnings'])
    print(f'Unique Count: Before - {original_uniq}; After - {new_uniq}; Diff - {new_uniq-original_uniq}')
    time.sleep(2)
    if counter >= max_tries or len(INRES) > 100234:
        break



In [None]:
savepath = '/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/metoo_base_multisort_random_aug.json'

with open(savepath, 'w') as file:
    json.dump(INRES, file)

## Using workaround

In [None]:
query_list = [
    "#MeToo intitle:/[A-G]/ -intitle:/[H-Z]/",
    "#MeToo intitle:/[H-Z]/ -intitle:/[A-G]/",
    "#MeToo intitle:/[A-G]/",
    "#MeToo intitle:/[H-Z]/",
    "#MeToo intitle:/[A-B]/",
    "#MeToo intitle:/[B-C]/",
    "#MeToo intitle:/[C-D]/",
    "#MeToo intitle:/[D-E]/",
    "#MeToo intitle:/[E-F]/",
    "#MeToo intitle:/[F-G]/",
    "#MeToo intitle:/[G-H]/",
    "#MeToo intitle:/[H-I]/",
    "#MeToo intitle:/[I-J]/",
    "#MeToo intitle:/[J-K]/",
    "#MeToo intitle:/[K-L]/",
    "#MeToo intitle:/[L-M]/",
    "#MeToo intitle:/[M-N]/",
    "#MeToo intitle:/[N-O]/",
    "#MeToo intitle:/[O-P]/",
    "#MeToo intitle:/[P-Q]/",
    "#MeToo intitle:/[Q-R]/",
    "#MeToo intitle:/[R-S]/",
    "#MeToo intitle:/[S-T]/",
    "#MeToo intitle:/[T-U]/",
    "#MeToo intitle:/[U-V]/",
    "#MeToo intitle:/[V-W]/",
    "#MeToo intitle:/[W-X]/",
    "#MeToo intitle:/[X-Y]/",
    "#MeToo intitle:/[Y-Z]/",
]

individual_letters = [
    "#MeToo intitle:/[A]/",
    "#MeToo intitle:/[B]/",
    "#MeToo intitle:/[C]/",
    "#MeToo intitle:/[D]/",
    "#MeToo intitle:/[E]/",
    "#MeToo intitle:/[F]/",
    "#MeToo intitle:/[G]/",
    "#MeToo intitle:/[H]/",
    "#MeToo intitle:/[I]/",
    "#MeToo intitle:/[J]/",
    "#MeToo intitle:/[K]/",
    "#MeToo intitle:/[L]/",
    "#MeToo intitle:/[M]/",
    "#MeToo intitle:/[N]/",
    "#MeToo intitle:/[O]/",
    "#MeToo intitle:/[P]/",
    "#MeToo intitle:/[Q]/",
    "#MeToo intitle:/[R]/",
    "#MeToo intitle:/[S]/",
    "#MeToo intitle:/[T]/",
    "#MeToo intitle:/[U]/",
    "#MeToo intitle:/[V]/",
    "#MeToo intitle:/[W]/",
    "#MeToo intitle:/[X]/",
    "#MeToo intitle:/[Y]/",
    "#MeToo intitle:/[Z]/",
]

extended_query_list = [
    "#MeToo intitle:/[A]/ intitle:/[B-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[A]/ -intitle:/[B-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[B]/ intitle:/[A,C-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[B]/ -intitle:/[A,C-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[C]/ intitle:/[A-B,D-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[C]/ -intitle:/[A-B,D-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[D]/ intitle:/[A-C,E-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[D]/ -intitle:/[A-C,E-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[E]/ intitle:/[A-D,F-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[E]/ -intitle:/[A-D,F-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[F]/ intitle:/[A-E,G-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[F]/ -intitle:/[A-E,G-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[G]/ intitle:/[A-F,H-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[G]/ -intitle:/[A-F,H-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[H]/ intitle:/[A-G,I-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[H]/ -intitle:/[A-G,I-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[I]/ intitle:/[A-H,J-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[I]/ -intitle:/[A-H,J-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[J]/ intitle:/[A-I,K-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[J]/ -intitle:/[A-I,K-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[K]/ intitle:/[A-J,L-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[K]/ -intitle:/[A-J,L-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[L]/ intitle:/[A-K,M-N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[L]/ -intitle:/[A-K,M-N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[M]/ intitle:/[A-L,N]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[M]/ -intitle:/[A-L,N]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[N]/ intitle:/[A-M]/ -intitle:/[O-Z]/",
    "#MeToo intitle:/[N]/ -intitle:/[A-M]/ intitle:/[O-Z]/",
    "#MeToo intitle:/[O]/ intitle:/[A-N]/ -intitle:/[P-Z]/",
    "#MeToo intitle:/[O]/ -intitle:/[A-N]/ intitle:/[P-Z]/",
    "#MeToo intitle:/[P]/ intitle:/[A-N]/ -intitle:/[Q-Z]/",
    "#MeToo intitle:/[P]/ -intitle:/[A-N]/ intitle:/[Q-Z]/",
    "#MeToo intitle:/[Q]/ intitle:/[A-N]/ -intitle:/[R-Z]/",
    "#MeToo intitle:/[Q]/ -intitle:/[A-N]/ intitle:/[R-Z]/",
    "#MeToo intitle:/[R]/ intitle:/[A-N]/ -intitle:/[S-Z]/",
]

multi_ht_search = [
    
]

In [None]:
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
half1 = "A-M"
half2 = "N-Z"

query_list = []

for letter in alphabet:
    # Case 1: The letter is the only capital letter in the title
    query_list.append(f"#MeToo intitle:/{letter}/ -intitle:/[A-Z&&[^{letter}]]/")

    # Case 2: The letter + at least one letter from one half of the alphabet
    if letter in half1:
        query_list.append(f"#MeToo intitle:/{letter}/ intitle:/[{half2}]/")
    else:
        query_list.append(f"#MeToo intitle:/{letter}/ intitle:/[{half1}]/")

    # Case 3: The letter + at least one letter from the other half of the alphabet
    if letter in half1:
        query_list.append(f"#MeToo intitle:/{letter}/ intitle:/[{half1}&&[^{letter}]]/")
    else:
        query_list.append(f"#MeToo intitle:/{letter}/ intitle:/[{half2}&&[^{letter}]]/")

# Print or use the query_list as needed


In [None]:
query_list

In [None]:
process_one_query("#MeToo intitle:/^(.*[A-Z]){0,2}.*$/")

In [None]:
PARAMS = {
    "action": "query",
    "format": "json",
    "list": "search",
    "srsearch": 'MeToo OR BalanceTonPorc OR MoiAussi OR نه_یعنی_نه OR 米兔 OR 我也是 OR وأنا كمان OR GamAni OR TôiCũngVậy OR 私も OR WatashiMo OR 나도 OR 나도당했다 OR גםאנחנו OR Ятоже OR RiceBunny OR EnaZeda OR AnaKaman OR YoTambien OR SendeAnlat OR KuToo OR WithYou OR WeToo OR cuentalo OR QuellaVoltaChe',
    "srlimit": 500,
    "sroffset": 0
}
# TO_CONTINUE = True
# RESULTS = []

# while TO_CONTINUE:
    # print(f"Processing Offset Num {PARAMS['sroffset']}")
R = S.get(url=URL, params=PARAMS)
DATA = R.json()
# return DATA['query']['searchinfo']['totalhits']

In [None]:
DATA

In [None]:
process_one_query("#MeToo intitle:/A/ intitle:/[N-Z]/")

In [None]:
process_one_query("#MeToo intitle:/A/ intitle:/[A-M&&[^A]]/")

In [None]:
# with pagination

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

def process_one_query(query):


    PARAMS = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": query,
        "srlimit": 500,
        "sroffset": 0
    }
    # TO_CONTINUE = True
    # RESULTS = []

    # while TO_CONTINUE:
        # print(f"Processing Offset Num {PARAMS['sroffset']}")
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    return DATA['query']['searchinfo']['totalhits']
    # RESULTS.extend(DATA['query']['search'])
        # break
        # if 'error' in DATA:
        #     raise Exception(DATA['error'])
        # if 'warnings' in DATA:
        #     print(DATA['warnings'])
        # if 'continue' not in DATA:
        #     break
        # elif '||' in DATA['continue']['continue']:
        #     TO_CONTINUE=True
        #     PARAMS['sroffset']=DATA['continue']['sroffset']

In [None]:
# if not query
query_hits = []
for query in individual_letters:
    print(f'Processing Query: {query}')
    query_hits.append(process_one_query(query))

In [None]:
out = pd.DataFrame.from_dict({
    'Query': individual_letters,
    'Total Hits': query_hits
})

In [None]:
out

## Get all of query lists

In [None]:
# with pagination

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

def save_one_query(savename, query):

    PARAMS = {
        "action": "query",
        "format": "json",
        "list": "search",
        "srsearch": query,
        "srlimit": 500,
        "sroffset": 0
    }
    TO_CONTINUE = True
    RESULTS = []

    while TO_CONTINUE:
        print(f"Processing Offset Num {PARAMS['sroffset']}")
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        RESULTS.extend(DATA['query']['search'])
        if 'error' in DATA:
            raise Exception(DATA['error'])
        if 'warnings' in DATA:
            print(DATA['warnings'])
        if 'continue' not in DATA:
            break
        elif '||' in DATA['continue']['continue']:
            TO_CONTINUE=True
            PARAMS['sroffset']=DATA['continue']['sroffset']
    print(f"Total Hits: {DATA['query']['searchinfo']['totalhits']}")

    savepath = f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json'

    with open(savepath, 'w') as file:
        json.dump(RESULTS, file)

    print(f'Saved to {savepath}')

In [None]:
# if not query
hashtags = [
    "MeToo", "BalanceTonPorc", "MoiAussi", "نه_یعنی_نه", "米兔", "我也是", 
    "وأنا كمان", "GamAni", "TôiCũngVậy", "私も", "WatashiMo", "나도", 
    "나도당했다", "גםאנחנו", "Ятоже", "RiceBunny", "EnaZeda", "AnaKaman", 
    "YoTambien", "SendeAnlat", "KuToo", "WithYou", "WeToo", "cuentalo", 
    "QuellaVoltaChe", "NiUnaMenos", "WoYeShi", "MyHarveyWeinstein", 
    "NousToutes", "stilleforopptak", "nårdansenstopper", "nårmusikkenstilner", 
    "memyös", "timesup", "NiEre", "JoTambe", "미투", "운동"
]

query_hits = []
for savename, query in enumerate(hashtags):
    print(f'Processing Query: {query}')
    save_one_query(savename, query)

## Getting Wikidata Id from Pageid

In [None]:
savename=0
with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json', 'r') as f:
    x = json.load(f)

In [None]:
query_generator = list(generate_pageid_strings(x))


In [None]:
len(query_generator)

In [None]:
RESULTS = []
for query in tqdm(query_generator):
    PARAMS = {
        "action": "query",
        "format": "json",
        "pageids": query,
        # "srsearch": query,
        # "srlimit": 500,
        # "sroffset": 0,
        "prop": "pageprops"
    }
    TO_CONTINUE = True

    # while TO_CONTINUE:
        # print(f"Processing Offset Num {PARAMS['sroffset']}")
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()

    RESULTS.append(DATA)

In [None]:
with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_wikidata.json', 'w') as f:
    json.dump(RESULTS, f)

In [None]:
len(DATA['query']['pages'])

In [None]:
#Full pipeline:

for savename in range(1, 38):
    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json', 'r') as f:
        x = json.load(f)

    query_generator = list(generate_pageid_strings(x))

    RESULTS = []
    for query in tqdm(query_generator):
        PARAMS = {
            "action": "query",
            "format": "json",
            "pageids": query,
            # "srsearch": query,
            # "srlimit": 500,
            # "sroffset": 0,
            "prop": "pageprops"
        }
        TO_CONTINUE = True

        # while TO_CONTINUE:
            # print(f"Processing Offset Num {PARAMS['sroffset']}")
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()

        RESULTS.append(DATA)

    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_wikidata.json', 'w') as f:
        json.dump(RESULTS, f)

In [None]:
#Full pipeline for select files

files = [
    'metoo_base',
    'metoo_base_multisort',
    'metoo_base_multisort_random_aug'
]

for savename in files:
    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json', 'r') as f:
        x = json.load(f)

    query_generator = list(generate_pageid_strings(x))

    RESULTS = []
    for query in tqdm(query_generator):
        PARAMS = {
            "action": "query",
            "format": "json",
            "pageids": query,
            # "srsearch": query,
            # "srlimit": 500,
            # "sroffset": 0,
            "prop": "pageprops"
        }
        TO_CONTINUE = True

        # while TO_CONTINUE:
            # print(f"Processing Offset Num {PARAMS['sroffset']}")
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()

        RESULTS.append(DATA)

    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_wikidata.json', 'w') as f:
        json.dump(RESULTS, f)

# Getting Langlinks + Links

In [None]:
# same thing but with revisions and langlinks

#Full pipeline for select fiels

files = [
    'metoo_base',
    # 'metoo_base_multisort',
    'metoo_base_multisort_random_aug'
]

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

for savename in files[1:]:
# for savename in range(0,38):
    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json', 'r') as f:
        x = json.load(f)
    # print(len(x))

    query_generator = list(generate_pageid_strings(x))

    RESULTS = []
    for index, query in enumerate(tqdm(query_generator)):
        PARAMS = {
            "action": "query",
            "format": "json",
            "pageids": query,
            "lllimit": 500,
            "llprop": "url|langname|autonym",
            "plnamespace":0,
            "prop": "pageprops|langlinks|links",
            "pllimit": 500,
        }
        TO_CONTINUE = True

        # while TO_CONTINUE:
            # print(f"Processing Offset Num {PARAMS['sroffset']}")
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        RESULTS.append(DATA)
        if "continue" in DATA:
            # print(f'WARNING: "continue" key present in query index {index}.')
            # # print(DATA)
            while True:
                PARAMS = {
                    "action": "query",
                    "format": "json",
                    "pageids": query,
                    "lllimit": 500,
                    "llcontinue": DATA['continue'].get('llcontinue', None),
                    "llprop": "url|langname|autonym",
                    "plnamespace":0,
                    "prop": "pageprops|langlinks|links",
                    "plcontinue": DATA['continue'].get('plcontinue', None),
                    "pllimit": 500,
                }
                R = S.get(url=URL, params=PARAMS)
                DATA = R.json()
                # print(DATA)
                if 'error' in DATA:
                    print(DATA)
                    raise ValueError
                RESULTS.append(DATA)
                if "continue" not in DATA:
                    break

        #REMOVE LATER
        # break


    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_lang+pageprops.json', 'w') as f:
        json.dump(RESULTS, f)

# Getting Revisions

In [None]:
# same thing but with revisions and langlinks

#Full pipeline for select fiels

files = [
    'metoo_base',
    'metoo_base_multisort_random_aug'
]
import os

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"

IDS_IN_RESULTS = []

for savename in files[:1]:
    path = f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_rev+pageprops.json'
    if os.path.isfile(path):
        with open(path, 'r') as f:
            RESULTS = json.load(f) 
        print('Data Loaded In')
    else:
        RESULTS = []
    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json', 'r') as f:
        x = json.load(f)

    PAGEIDS = [i['pageid'] for i in x]
    # PAGEIDS = [i for i in PAGEIDS if ]
    for index, query in enumerate(tqdm(PAGEIDS[:200])):
        PARAMS = {
            "action": "query",
            "format": "json",
            "pageids": str(query),
            "prop": "pageprops|revisions",
            "rvprop": "ids|timestamp|flags|comment|user|content|tags|userid",
            "rvlimit": 25,
            "rvslots": "main"
        }

        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        RESULTS.append(DATA)
        if "continue" in DATA:
            while True:
                PARAMS = {
                    "action": "query",
                    "format": "json",
                    "pageids": str(query),
                    "prop": "pageprops|revisions",
                    "rvprop": "ids|timestamp|flags|comment|user|content|tags|userid",
                    "rvlimit": 25,
                    "rvslots": "main",
                    "rvcontinue": DATA['continue'].get('rvcontinue', ''),
                }
                R = S.get(url=URL, params=PARAMS)
                DATA = R.json()
                RESULTS.append(DATA)
                if "continue" not in DATA or "batchcomplete" in DATA:
                    break

        with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_rev+pageprops.json', 'w') as f:
            json.dump(RESULTS, f)

In [None]:
path = f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/metoo_base_rev+pageprops.json'
if os.path.isfile(path):
    with open(path, 'r') as f:
        RESULTS = json.load(f) 

In [None]:
RESULTS[0]['query']['pages'].keys()

In [None]:
RESULTS = []
PARAMS = {
    "action": "query",
    "format": "json",
    "pageids": '55551931',
    "prop": "pageprops|revisions",
    "rvprop": "ids|timestamp|flags|comment|user|content|tags|userid",
    "rvlimit": 25,
    "rvslots": "main"
}

R = S.get(url=URL, params=PARAMS)
DATA = R.json()
while True:
    PARAMS = {
        "action": "query",
        "format": "json",
        "pageids": query,
        "prop": "pageprops|revisions",
        "rvprop": "ids|timestamp|flags|comment|user|content|tags|userid",
        "rvlimit": 25,
        "rvcontinue": DATA['continue'].get('rvcontinue', ''),
        "rvslots": "main",
    }
    R = S.get(url=URL, params=PARAMS)
    DATA = R.json()
    RESULTS.append(DATA)
    if 'warnings' in DATA:
        print(DATA['warnings'])
    if "continue" not in DATA:
        print('nocont')
        break
    if "batchcomplete" in DATA:
        print('batchcomplete')
        break
    if "rvcontinue" not in DATA['continue']:
        print('no rvcontinue')
    else:
        print(DATA['continue'])

# Getting Page Categories

In [None]:
# same thing but with revisions and langlinks
overwrite = False

#Full pipeline for select fiels

files = [
    'metoo_base',
    # 'metoo_base_multisort',
    'metoo_base_multisort_random_aug'
]

files = files + [f'{i}' for i in range(38)]

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"



for index, savename in enumerate(tqdm(files)):
# for savename in range(0,38):
    with open(f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}.json', 'r') as f:
        x = json.load(f)
    # print(len(x))

    outfile = f'/Users/hubert/Drive/DPhil/DPhil_Studies/2023-08-Study_C/Data/Wikipedia/{savename}_categories.json'
    if os.path.isfile(outfile) and not overwrite:
        print(f'Skipping {savename}')
        continue

    print(f'Processing -> {savename}. {index+1} of {len(files)}')

    query_generator = list(generate_pageid_strings(x))

    RESULTS = []
    for index, query in enumerate(tqdm(query_generator)):
        PARAMS = {
            "action": "query",
            "format": "json",
            "pageids": query,
            "prop": "categories|pageprops",
            "cl": "sortkey|timestamp|hidden",
            "cllimit": 500,
        }
        TO_CONTINUE = True

        # while TO_CONTINUE:
            # print(f"Processing Offset Num {PARAMS['sroffset']}")
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        if 'error' in DATA:
            print(DATA)
            raise ValueError
        RESULTS.append(DATA)
        if "continue" in DATA:
            # print(f'WARNING: "continue" key present in query index {index}.')
            # print(DATA)
            while True:
                PARAMS = {
                    "action": "query",
                    "format": "json",
                    "pageids": query,
                    "prop": "categories|pageprops",
                    "cl": "sortkey|timestamp|hidden",
                    "cllimit": 500,
                    "clcontinue": DATA['continue'].get('clcontinue', None),
                }
                R = S.get(url=URL, params=PARAMS)
                DATA = R.json()
                if 'error' in DATA:
                    print(DATA)
                    raise ValueError
                RESULTS.append(DATA)
                if "continue" not in DATA:
                    break

    with open(outfile, 'w') as f:
        json.dump(RESULTS, f)

    print(f'Saved to {outfile}')

# Then get Corresponding Category Info

In [None]:
CAT = defaultdict(set)
ALLCATS = Counter()

files = [
    # 'metoo_base',
    # 'metoo_base_multisort',
    'metoo_base_multisort_random_aug'
]

# omit 0 == 'MeToo'
files = files + [f'{i}' for i in range(1, 38)]

PAGEID_ALREADY_SEEN = set()
for file in tqdm(files):

    with open(f'./Data/Wikipedia/{file}_categories.json', 'r') as f:
        RESULTS = json.load(f)

    for count, i in enumerate(RESULTS):

        if 'query' not in i:
            print(count, i)
            continue
        for k, v in i['query']['pages'].items():

            assert k == str(v['pageid'])
            if 'categories' in v:
                for cat in v['categories']:
                    if cat['title'] in CAT[k]:
                        continue
                    else:
                        CAT[k].add(cat['title'])
                        ALLCATS[cat['title']]+=1

print(f'Total number of unique categories collected: {len(ALLCATS)}')


In [None]:
ALLCATS.most_common(100)

In [None]:
len(CAT)

In [None]:
with open('./Data/Wikipedia/all_cat_desc.json', 'w') as f:
    json.dump(ALLCATS, f)
with open('./Data/Wikipedia/cat_per_article.pkl', 'wb') as f:
    pickle.dump(CAT, f)

In [None]:
query_generator = list(generate_titles_strings(ALLCATS.keys()))

In [57]:
# same thing but with revisions and langlinks
overwrite = False

S = requests.Session()

URL = "https://en.wikipedia.org/w/api.php"


outfile = './Data/Wikipedia/all_cat_info.json'
query_generator = list(generate_titles_strings(ALLCATS.keys()))

if os.path.isfile(outfile) and not overwrite:
    print(f'{outfile} already exists. Ending.')
else:
    RESULTS = []
    for index, query in enumerate(tqdm(query_generator)):
        PARAMS = {
            "action": "query",
            "format": "json",
            "titles": query,
            "prop": "categoryinfo",
        }
        R = S.get(url=URL, params=PARAMS)
        DATA = R.json()
        if 'error' in DATA:
            print(DATA)
            raise ValueError
        RESULTS.append(DATA)
        if "continue" in DATA:
            while True:
                PARAMS = {
                    "action": "query",
                    "format": "json",
                    "titles": query,
                    "prop": "categoryinfo",
                    "clcontinue": DATA['continue'].get('clcontinue', None),
                }
                R = S.get(url=URL, params=PARAMS)
                DATA = R.json()
                if 'error' in DATA:
                    print(DATA)
                    raise ValueError
                RESULTS.append(DATA)
                if "continue" not in DATA:
                    break

    with open(outfile, 'w') as f:
        json.dump(RESULTS, f)

    print(f'Saved to {outfile}')

  0%|          | 0/44460 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [58]:
len(RESULTS)

27652

In [59]:
with open(outfile, 'w') as f:
    json.dump(RESULTS, f)