Skip to content

Commit

Permalink
Added wiki API and basic wiki scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
BackSlasher committed May 23, 2023
1 parent e259f67 commit b256d0d
Show file tree
Hide file tree
Showing 4 changed files with 160 additions and 0 deletions.
Empty file.
Empty file.
52 changes: 52 additions & 0 deletions djang/versions/management/commands/collect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from django.core.management.base import BaseCommand, CommandError
from versions.services import wiki
from versions import models


def process_law(law_in: wiki.PageResult, force: bool):
    """Mirror one wiki law page and its revision history into the database.

    Args:
        law_in: Title and page id of the wiki page to import.
        force: When false, a law whose ``wiki_page_id`` is already stored is
            left untouched. When true, the stored revisions of an existing
            law are deleted and re-imported, and its title is refreshed.
    """
    # Local import: keeps this function self-contained without touching the
    # module's import block.
    from django.db import transaction

    if not force:
        known = models.Law.objects.filter(wiki_page_id=law_in.page_id)
        if known.exists():
            return

    # Download the complete history BEFORE writing anything. The wiki helper
    # is a lazy generator, so a network failure halfway through would
    # otherwise leave a Law row with missing revisions that later non-forced
    # runs skip because the law "already exists".
    revisions_in = list(wiki.get_revisions_for_page(page_title=law_in.title))

    # All-or-nothing: either the law and every revision are stored, or the
    # previous state is kept.
    with transaction.atomic():
        # update_or_create (rather than get_or_create) so a forced re-import
        # also picks up a changed page title.
        law, law_created = models.Law.objects.update_or_create(
            wiki_page_id=law_in.page_id, defaults={"title": law_in.title}
        )
        if not law_created:
            models.Revision.objects.filter(law=law).delete()
        # TODO do the splits!
        for revision_in in revisions_in:
            models.Revision(
                law=law,
                wiki_rev_id=revision_in.id,
                name=revision_in.comment,
                effective_date_start=revision_in.timestamp,
                source_text=revision_in.content,
            ).save()

    # Planned grouping logic (not implemented yet):
    # Get all revisions for said law
    # group by major revisions
    # how to identify a major revision:
    #   extract Makor: line with <מקור> until double newline
    #   if Makor changed, this is a major revision
    # for each major revision, choose the latest non-major revision as source for text.
    # extract from the latest non-major revision the title from the Makor row
    # from the earliest revision in the major revision, get the edit date as the effective date


class Command(BaseCommand):
    """Management command that imports laws and their revisions from the wiki."""

    help = (
        "Collect laws and their revision history from the wiki "
        "and store them in the database."
    )

    def add_arguments(self, parser):
        """Register the command-line options of this command."""
        parser.add_argument(
            "--law-name",
            required=False,
            help="Import only the wiki page with this title "
            "(default: every page in the category).",
        )
        parser.add_argument(
            "--force",
            action="store_true",
            help="Re-import laws that are already stored.",
        )

    def handle(self, *args, **options):
        """Import a single law when --law-name is given, otherwise all of them.

        Raises:
            CommandError: If the page named by --law-name cannot be resolved.
        """
        force = options["force"]
        if law_name := options["law_name"]:
            try:
                page = wiki.get_page(law_name)
            except KeyError as err:
                # The wiki helper raises KeyError when the API response has
                # no page id for the title; report it as a clean CLI error
                # instead of a traceback.
                raise CommandError(
                    f"Could not resolve wiki page {law_name!r}"
                ) from err
            process_law(page, force=force)
            return

        for page in wiki.get_pages_in_category():
            # self.stdout (not print) so the output can be captured and
            # redirected by callers of the command.
            self.stdout.write(page.title)
            process_law(page, force=force)
108 changes: 108 additions & 0 deletions djang/versions/services/wiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import requests
from typing import NamedTuple, Iterator
from datetime import datetime


# Immutable record identifying one wiki page: its title and its numeric page id.
PageResult = NamedTuple(
    "PageResult",
    [
        ("title", str),
        ("page_id", int),
    ],
)


# Immutable record for one page revision: revision id, edit timestamp,
# the editor's comment and the revision text.
RevisionResult = NamedTuple(
    "RevisionResult",
    [
        ("id", int),
        ("timestamp", datetime),
        ("comment", str),
        ("content", str),
    ],
)


def get_pages_in_category(cont_dict=None) -> Iterator[PageResult]:
    """Yield every main-namespace member of the wiki category queried below.

    Result batches are fetched lazily, one HTTP request per batch, and the
    API's continuation tokens are followed until the listing is exhausted.

    Args:
        cont_dict: Optional continuation parameters (the ``continue`` object
            of an earlier API response) to resume a listing from.

    Yields:
        A ``PageResult`` for each category member.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "קטגוריה:בוט חוקים",
        "cmnamespace": "0",
        # Ask for the largest batch the API allows instead of its small
        # default, to cut the number of round trips.
        "cmlimit": "max",
        "format": "json",
    }
    if cont_dict:
        params.update(cont_dict)

    # Iterate instead of recursing: a recursive ``yield from`` adds one
    # nested generator per batch and can exhaust the recursion limit on a
    # large category.
    while True:
        response = requests.get(
            "https://he.wikisource.org/w/api.php",
            params=params,
            timeout=60,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        j = response.json()
        for member in j["query"]["categorymembers"]:
            yield PageResult(title=member["title"], page_id=member["pageid"])

        continuation = j.get("continue")
        if not continuation:
            return
        # Echo back every continuation value, as the API expects.
        params.update(continuation)


def get_page(title=None, cont_dict=None) -> PageResult:
    """Look up a single wiki page by title.

    Args:
        title: Title of the page to resolve.
        cont_dict: Unused; kept so existing callers keep working.

    Returns:
        A ``PageResult`` with the title and page id reported by the API.

    Raises:
        KeyError: If the API response carries no page id for ``title``
            (for example because the page does not exist).
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """
    response = requests.get(
        "https://he.wikisource.org/w/api.php",
        params={
            "action": "query",
            "titles": title,
            "format": "json",
        },
        timeout=60,  # never hang forever on a stalled connection
    )
    response.raise_for_status()
    # Exactly one title was requested, so exactly one page entry is expected.
    (page,) = response.json()["query"]["pages"].values()
    if "pageid" not in page:
        # Same exception type as the bare lookup would raise, but with a
        # message that names the offending title.
        raise KeyError(f"wiki page not found: {title!r}")
    return PageResult(title=page["title"], page_id=page["pageid"])


def get_revisions_for_page(
    page_title: str, cont_dict=None
) -> Iterator[RevisionResult]:
    """Yield every revision of a wiki page, including its text.

    Revisions are fetched lazily in batches, in the order the API returns
    them, following continuation tokens until the history is exhausted.

    Args:
        page_title: Title of the page whose history is wanted.
        cont_dict: Optional continuation parameters (the ``continue`` object
            of an earlier API response) to resume from.

    Yields:
        A ``RevisionResult`` per revision.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
        KeyError: If the response has no revisions for the page.
    """
    # Example query:
    # api.php?action=query&prop=revisions&titles=AntiSpoof&formatversion=2&redirects=1
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": page_title,
        "formatversion": 2,
        "format": "json",
        "rvprop": "timestamp|comment|content|ids",
        "rvslots": "main",
        "rvlimit": "max",
    }
    if cont_dict:
        params.update(cont_dict)

    # Iterate instead of recursing: a recursive ``yield from`` adds one
    # nested generator per batch, which grows without bound on long
    # histories.
    while True:
        response = requests.get(
            "https://he.wikisource.org/w/api.php",
            params=params,
            timeout=60,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        j = response.json()
        # Exactly one title was requested, so exactly one page is expected.
        (page,) = j["query"]["pages"]
        for r in page["revisions"]:
            raw_timestamp = r["timestamp"]
            # datetime.fromisoformat only accepts a trailing "Z" from
            # Python 3.11 on; spell the UTC offset out so older
            # interpreters parse it too.
            if raw_timestamp.endswith("Z"):
                raw_timestamp = raw_timestamp[:-1] + "+00:00"
            yield RevisionResult(
                id=int(r["revid"]),
                timestamp=datetime.fromisoformat(raw_timestamp),
                # NOTE(review): revision-deleted entries appear to omit
                # "comment" / "content" (the API flags them as hidden
                # instead); fall back to an empty string rather than
                # aborting the whole history. Confirm this is the desired
                # handling.
                comment=r.get("comment", ""),
                content=r["slots"]["main"].get("content", ""),
            )

        continuation = j.get("continue")
        if not continuation:
            return
        # Echo back every continuation value, as the API expects.
        params.update(continuation)


import itertools


def bla():
    """Ad-hoc smoke check: print the id of each revision of one fixed law page."""
    # An earlier variant printed the full revisions of the first 10 category
    # pages (itertools.islice over get_pages_in_category()).
    title = "חוק איסור פרסומת והגבלת השיווק של מוצרי טבק ועישון"
    for revision in get_revisions_for_page(title):
        print(revision.id)

0 comments on commit b256d0d

Please sign in to comment.