<a href="https://colab.research.google.com/github/janmechtel/party/blob/master/bayer_on_wp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Given a list of Wikipedia categories and a list of users, list all pages in those categories that the users have edited.

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Google Drive. Absolute, so that file access
# does not depend on the current working directory of the kernel.
DRIVE = '/content/drive/MyDrive/Marvin'
# Batch size for page id queries.
BATCH_PAGEIDS = 50

In [None]:
# %pip installs into the environment of the running kernel; -y keeps the
# uninstall from waiting for an interactive confirmation.
%pip uninstall -y googletrans
%pip install googletrans==3.1.0a0

[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googletrans==3.1.0a0
  Downloading googletrans-3.1.0a0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 KB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting sniffio
  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 KB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chardet==3.*
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 KB[0m [31m13.0 MB/s[0m eta [36m0:00:0

In [None]:
import os
import requests
from datetime import datetime
from typing import List

def init_session():
  """ Create a new requests session that can be re-used for multiple queries. """
  session = requests.Session()
  return session

def get_logfile(country_code, suffix:List[str]=None):
  """ Build the path of a new, timestamped CSV log file below DRIVE.

  The file name has the form
  <country_code>[_<suffix parts>]_<YYYYMMDD>_<HHMMSS>.csv

  Args:
    country_code: Wikipedia language code used as file name prefix, e.g. 'de'.
    suffix: optional name parts inserted after the country code.

  Returns:
    Path of the log file inside DRIVE. The file itself is not created here.
  """
  # Only add suffix parts that exist, so a missing suffix does not leave a
  # double underscore in the name.
  name_parts = [country_code] + list(suffix or [])
  # Zero-padded fields keep the stamp unambiguous and sortable (unpadded,
  # Jan 12 and Nov 2 both render as '112'). Colons are avoided because not
  # every file system accepts them in file names.
  name_parts.append(datetime.now().strftime('%Y%m%d_%H%M%S'))
  return os.path.join(DRIVE, '_'.join(name_parts) + '.csv')

def get_api_url(country_code: str='en', params: dict=None):
  """ Return the MediaWiki API endpoint of a Wikipedia language edition.

  Args:
    country_code: Wikipedia language code, e.g. 'en' or 'de'.
    params: optional query parameters; if given and non-empty they are
      appended to the URL as a percent-encoded query string.

  Returns:
    The API URL as a string.
  """
  from urllib.parse import urlencode

  api_url = 'https://{}.wikipedia.org/w/api.php'.format(country_code)
  if params:
    # urlencode escapes '&', '=', spaces etc. inside keys and values, which
    # plain string concatenation would pass through and corrupt the query.
    api_url += '?' + urlencode(params)
  return api_url

def get_page_link(country_code, pageid):
  """ Return the URL that opens the page with the given id on the given Wikipedia. """
  link_template = 'https://{}.wikipedia.org/w/index.php?curid={}'
  return link_template.format(country_code, pageid)

def get_cmtitles_by_page(session, country_code, pageid):
  """ Return the titles of all categories a page belongs to.

  Args:
    session: object with a requests-style get(url=..., params=...) method.
    country_code: Wikipedia language code, e.g. 'de'.
    pageid: id of the page to look up.

  Returns:
    List of category titles. An empty list is returned if the categories
    could not be retrieved; the error is printed, not raised.
  """
  # Imported here so the error path below does not depend on another cell
  # having imported traceback first.
  import traceback

  api_url = 'https://{}.wikipedia.org/w/api.php'.format(country_code)
  # e.g. https://de.wikipedia.org/w/api.php?format=json&action=query&prop=categories&pageids=11036748
  params = {'action': 'query',
            'format': 'json',
            'prop': 'categories',
            'cllimit': 20,
            'pageids': pageid}
  categs = []
  try:
    while True:
      data = session.get(url=api_url, params=params).json()
      # 'categories' is a list of dicts (one per category), not a dict
      for value in data['query']['pages'][str(pageid)]['categories']:
        categs.append(value['title'])
      if 'continue' not in data:
        return categs
      # The page has more categories than fit into one response: feed the
      # continuation values back into the next request instead of silently
      # dropping the remaining categories.
      params.update(data['continue'])

  except Exception:
    traceback.print_exc()
    print('ERROR: could not retrieve categories for pageid = ', pageid)
  return []


In [None]:
import copy
from datetime import datetime
from pandas import DataFrame
import requests
import sys
import traceback
from typing import List, Optional, Union

# field separator used when joining the columns of a log file row
DELIM = ';'

# batch size for pageids query
# NOTE(review): not referenced in the visible cells; same value as BATCH_PAGEIDS - confirm whether both are needed
BATCH = 50

def _get_category_pageids(session, api_url, cmtitles, max_retries=5):
  """ Return the set of page ids that are members of any of the given categories. """
  pageids = set()

  for cmtitle in cmtitles:
    # Fresh parameters per category: a 'cmcontinue' token left over from the
    # previous category must not leak into the first request of the next one.
    params = {
      'action': 'query',
      'list': 'categorymembers',
      'cmlimit': 'max',
      'format': 'json',
      'cmtitle': cmtitle
    }
    failures = 0
    while True:
      try:
        data = session.get(url=api_url, params=params).json()
      except Exception:
        # Retry failed requests, but give up after max_retries consecutive
        # failures instead of looping forever on a persistent error.
        traceback.print_exc()
        failures += 1
        if failures >= max_retries:
          raise
        continue
      failures = 0

      if 'query' not in data:
        print(data)
        print(api_url)
        print(params)
        sys.exit()
      try:
        pageids.update(int(page['pageid']) for page in data['query']['categorymembers'])
      except Exception:
        print(data)
        traceback.print_exc()
        sys.exit()

      if 'continue' not in data:
        break
      params['cmcontinue'] = data['continue']['cmcontinue']

  return pageids


def get_pages_by_user_and_categ(session: requests.Session, country_code:str,
                      users:List[str], cmtitles:List[str]):
  """ Log every page of the given categories that was edited by one of the given users.

  First the page ids of all members of the categories in cmtitles are
  collected. Then the revisions of every user are walked; for each page that
  is among the collected ids, one row is appended to a new log file (see
  get_logfile) with the columns
  country_code, user, pageid, title, link, categories.
  A page is logged at most once per user.

  Args:
    session: session used for all API requests.
    country_code: Wikipedia language code, e.g. 'de'.
    users: user names whose revisions are inspected.
    cmtitles: category titles whose member pages are of interest.

  Returns:
    None. Results are written to the log file; progress is printed to stdout.

  Note:
    The process is terminated via sys.exit() if a category response is
    malformed. A category request that keeps failing is re-raised after a
    limited number of retries.
  """

  logfile = get_logfile(country_code, ['user_and_categ'])

  # NOTE: fields are joined without quoting, so a title that contains DELIM
  # yields a row with more columns than the header.
  header = DELIM.join(['country_code', 'user', 'pageid', 'title', 'link', 'categories'])
  with open(logfile, 'w', encoding='utf-8') as f:
    f.write(header)

  api_url = get_api_url(country_code)

  # get all pageids of input categories
  pageids = _get_category_pageids(session, api_url, cmtitles)

  print('STATUS: Found {} pages for '.format(len(pageids)), cmtitles)

  # e.g. https://de.wikipedia.org/w/api.php?action=query&format=json&list=allrevisions&arvlimit=max&arvprop=user|size&arvuser=blech

  for i, user in enumerate(users):
    # get allrevisions for a user
    # Fresh parameters per user: an 'arvcontinue' token of the previous user
    # (e.g. after an error aborted that user) must not be sent for this one.
    params = {
      'action': 'query',
      'format': 'json',
      'list': 'allrevisions',
      'arvuser': user,
      'arvlimit': 500,
      'arvprop': 'user|size'
    }

    # revisions may relate to identical pages, don't track them twice
    pages_by_user = set()
    while True:
      try:
        results = session.get(url=api_url, params=params)
        data = results.json()
        if 'allrevisions' not in data['query']:
          print('WARNING: no revisions found for user {}'.format(user))
          break
        for value in data['query']['allrevisions']:
          pageid = int(value['pageid'])
          if pageid not in pageids or pageid in pages_by_user:
            continue
          pages_by_user.add(pageid)
          title = value['title']
          link = get_page_link(country_code, pageid)
          cmtitles_sub = get_cmtitles_by_page(session, country_code, pageid)
          logstr = DELIM.join([country_code, user, str(pageid), title, link, '|'.join(cmtitles_sub)])
          # Append row by row, so rows already found are on disk even if a
          # later request fails.
          with open(logfile, 'a', encoding='utf-8') as f:
            f.write('\n' + logstr)

        if 'continue' not in data:
          break
        params['arvcontinue'] = data['continue']['arvcontinue']

      except Exception:
        # give up on this user, continue with the next one
        traceback.print_exc()
        break

    sys.stdout.write('\r')
    # i is zero-based; report the number of users finished so far
    sys.stdout.write("%d / %d users processed" % (i + 1, len(users)))
    sys.stdout.flush()


In [None]:
def _read_nonempty_lines(path):
  """ Return the stripped, non-empty lines of a text file. """
  with open(path, 'r', encoding='utf-8') as f:
    stripped = (line.strip() for line in f)
    # Skip blank lines: an empty entry would be sent to the API as an empty
    # user name / category title.
    return [line for line in stripped if line]

# one user name per line
users = _read_nonempty_lines(os.path.join(DRIVE, 'Data', 'bayer_users.csv'))

# one category title per line
cmtitles = _read_nonempty_lines(os.path.join(DRIVE, 'Data', 'bayer_cmtitles.csv'))

session = init_session()
get_pages_by_user_and_categ(session=session, country_code='de', users=users, cmtitles=cmtitles)

FileNotFoundError: ignored

In [None]:
from collections import Counter
from pandas import DataFrame
from pandas import read_csv
import requests
import sys

# Count how often each category occurs among the pages listed in a log file.
LANG = 'be'

# get_logfile() always produces a new, timestamped file name, so the existing
# log file to analyse is referenced explicitly.
logfile = os.path.join(DRIVE, 'be_220512.csv')

df = read_csv(logfile, delimiter=';', header='infer')

# Own session, so this cell does not depend on an earlier cell having run
# through successfully.
session = init_session()

api_url = get_api_url(LANG)
params = {
          'action': 'query',
          'format': 'json',
          'prop': 'categories',
          'cllimit': 'max'
          }

print('api_url = ', api_url)
# e.g. https://ru.wikipedia.org/w/api.php?action=query&format=json&prop=categories&pageids=7960758
pageids = [str(pi) for pi in df.pageid]
categories = Counter()
# marker of the category box in the HTML of a rendered page
categ_tag = 'id="catlinks" class="catlinks"'

ctr_pages_w_categs = 0

for pageid in pageids:
  params['pageids'] = pageid
  results = session.get(url=api_url, params=params)
  data = results.json()
  page = data['query']['pages'][pageid]
  if 'categories' in page:
    aux = [c['title'] for c in page['categories']]
  else:  # category field not provided for every page via MW api!
    # Fall back to the rendered page and read the titles from its category
    # box. The link is built from LANG (the original referenced an undefined
    # COUNTRY_CODE here).
    url = get_page_link(LANG, pageid)
    r = requests.get(url)
    cont = r.text
    pos = cont.find(categ_tag)
    if pos == -1:
      print('ERROR: no categories could be retrieved for pageid = ', pageid)
      continue
    # everything up to the end of the line holding the category box; the
    # first two split parts are skipped, the rest start with a title="..." value
    aux = cont[pos:cont.find('\n', pos + 1)].split('title="')[2:]
    aux = [c[:c.find('"')] for c in aux]
  ctr_pages_w_categs += 1
  categories += Counter(aux)
  if 'continue' in data:
    # more categories than returned in one response; they are not fetched here
    print('WARNING: continue flag in data for pageid = ', pageid)

  # progress: pages with categories so far / total number of pages
  sys.stdout.write('\r')
  sys.stdout.write("%d %d" % (ctr_pages_w_categs, df.shape[0]))
  sys.stdout.flush()

print()
print(categories)

api_url =  https://be.wikipedia.org/w/api.php
317 437ERROR: no categories could be retrieved for pageid =  661167
327 437ERROR: no categories could be retrieved for pageid =  706248
363 437ERROR: no categories could be retrieved for pageid =  703126
434 437
Counter({'Катэгорыя:Вікіпедыя:Артыкулы з пераазначэннем значэння з Вікідадзеных': 300, 'Катэгорыя:Асобы': 277, 'Катэгорыя:Вікіпедыя:Біяграфіі сучаснікаў': 226, 'Катэгорыя:Вікіпедыя:Артыкулы пра асоб, для якіх не існуюць старонкі віду «І. Іпб. Прозвішча»': 189, 'Катэгорыя:Вікіпедыя:Артыкулы пра асоб, для якіх не існуюць старонкі віду «І. Прозвішча»': 175, 'Катэгорыя:Вікіпедыя:Запыты на пераклад з тарашкевіцы': 155, 'Катэгорыя:Вікіпедыя:Запыты на пераклад з рускай': 133, 'Катэгорыя:Старонкі з няправільным сінтаксісам спасылак на крыніцы': 128, 'Катэгорыя:Вікіпедыя:Артыкулы з крыніцамі з Вікідадзеных': 116, 'Катэгорыя:Нарадзіліся ў Мінску': 80, 'Катэгорыя:Вікіпедыя:Артыкулы з непрацоўнымі спасылкамі': 76, 'Катэгорыя:Асобы, прызнаныя вя

In [None]:
from googletrans import Translator
import matplotlib.pyplot as plt

# Horizontal bar chart of the most frequent categories, labelled with the
# original category name and its English translation.
LANG = 'be'

tlr = Translator()
topk = 30
print(categories)

# the topk (title, count) pairs with the highest counts, highest first
kv = sorted(categories.items(), key=lambda item: item[1], reverse=True)[:topk]

# label: text after the last ':' of the category title ...
x = [title.rsplit(':', 1)[-1].strip() for title, _ in kv]
# ... followed by its English translation on a second line
x_tld = [tlr.translate(label, src=LANG, dest='en').text for label in x]
x = ['\n'.join(pair) for pair in zip(x, x_tld)]

y = [count for _, count in kv]

fig, ax = plt.subplots(figsize=(10, 16))

# write the count next to the end of each bar
for i, v in enumerate(y):
  ax.text(v + 3, i + .25, str(v), color='black')
ax.set_xlabel('Number of Pages')
ax.set_ylabel('Category')
h = ax.barh(x, y)

ax.set_title('Top {} Categories of Pages with Hidden Users (lang={})'.format(topk, LANG))


ModuleNotFoundError: ignored