-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added wiki API and basic wiki scraper
- Loading branch information
1 parent
e259f67
commit b256d0d
Showing
4 changed files
with
160 additions
and
0 deletions.
There are no files selected for viewing
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from django.core.management.base import BaseCommand, CommandError | ||
from versions.services import wiki | ||
from versions import models | ||
|
||
|
||
def process_law(law_in: wiki.PageResult, force: bool):
    """Import one wiki page as a Law together with all of its revisions.

    Args:
        law_in: The wiki page (title and page id) to import.
        force: When False, a page whose Law row already exists is skipped.
            When True, the existing revisions of the Law are deleted and
            re-imported from the wiki.
    """
    if not force and models.Law.objects.filter(wiki_page_id=law_in.page_id).exists():
        return

    # Download every revision *before* touching the database. The wiki call
    # is a lazy generator backed by HTTP requests; consuming it while
    # writing meant a network failure half-way through left a Law with
    # missing revisions, which later non-forced runs would silently skip.
    revisions_in = list(wiki.get_revisions_for_page(page_title=law_in.title))

    law, law_created = models.Law.objects.get_or_create(
        wiki_page_id=law_in.page_id, defaults={"title": law_in.title}
    )
    # NOTE(review): get_or_create already persists a newly created row; this
    # save is kept in case Law has save()-time side effects - confirm.
    law.save()
    if not law_created:
        # Forced re-import: start from a clean slate for this law.
        models.Revision.objects.filter(law=law).delete()

    # TODO do the splits!
    for revision_in in revisions_in:
        models.Revision(
            law=law,
            wiki_rev_id=revision_in.id,
            name=revision_in.comment,
            effective_date_start=revision_in.timestamp,
            source_text=revision_in.content,
        ).save()

    # Planned follow-up (not implemented yet):
    # Get all revisions for said law
    # group by major revisions
    # how to identify a major revision:
    #   extract the Makor ("source") line: from <מקור> until a double newline
    #   if Makor changed, this is a major revision
    # for each major revision, choose the latest non-major revision as source for text.
    # extract from the latest non-major revision the title from the Makor row
    # from the earliest revision in the major revision, get the edit date as the effective date
|
||
|
||
class Command(BaseCommand):
    """Management command that imports laws and their revisions from the wiki."""

    help = (
        "Import laws from the wiki. With --law-name a single page is imported; "
        "otherwise every page returned by the wiki category listing is imported."
    )

    def add_arguments(self, parser):
        """Register the optional --law-name and --force command-line flags."""
        parser.add_argument("--law-name", required=False)
        parser.add_argument("--force", action="store_true")

    def handle(self, *args, **options):
        """Import either the single named law or every law in the category.

        Raises:
            CommandError: If --law-name was given but the page lookup failed
                with a KeyError (e.g. the page does not exist on the wiki).
        """
        force = options["force"]
        if law_name := options["law_name"]:
            try:
                page = wiki.get_page(law_name)
            except KeyError as err:
                # The lookup indexes into the API response; a KeyError means
                # the response did not describe an existing page.
                raise CommandError(
                    f"Could not load wiki page {law_name!r}"
                ) from err
            process_law(page, force=force)
        else:
            for page in wiki.get_pages_in_category():
                # Use the command's stdout so output can be captured/redirected.
                self.stdout.write(page.title)
                process_law(page, force=force)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
import requests | ||
from typing import NamedTuple, Iterator | ||
from datetime import datetime | ||
|
||
|
||
class PageResult(NamedTuple):
    """Immutable identification record for a single wiki page."""

    # Page title as reported by the wiki API.
    title: str
    # Numeric page id as reported by the wiki API.
    page_id: int
|
||
|
||
class RevisionResult(NamedTuple):
    """Immutable record describing one revision of a wiki page."""

    # Revision id.
    id: int
    # Time at which the revision was made.
    timestamp: datetime
    # Edit summary attached to the revision.
    comment: str
    # Full text of the page at this revision.
    content: str
|
||
|
||
def get_pages_in_category(cont_dict=None) -> Iterator[PageResult]:
    """Yield every main-namespace page in the laws-bot category.

    Results are fetched lazily, one API batch at a time, following the
    API's continuation data until the listing is exhausted.

    Args:
        cont_dict: Optional continuation parameters (the ``continue`` object
            of a previous API response) to resume a listing from.

    Yields:
        A PageResult for each category member.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "קטגוריה:בוט חוקים",
        "cmnamespace": "0",
        # Ask for the largest batch allowed instead of the small default,
        # so large categories need far fewer round-trips.
        "cmlimit": "max",
        "format": "json",
    }
    continuation = dict(cont_dict) if cont_dict else {}
    # Iterate instead of recursing: a chain of nested generators grows one
    # level per batch and can hit the recursion limit on big categories.
    while True:
        response = requests.get(
            "https://he.wikisource.org/w/api.php",
            params={**params, **continuation},
            timeout=30,  # never hang forever on a stalled connection
        )
        response.raise_for_status()
        j = response.json()
        for r in j["query"]["categorymembers"]:
            yield PageResult(title=r["title"], page_id=r["pageid"])
        # The whole "continue" object must be echoed back on the next
        # request; its absence means the listing is complete.
        continuation = j.get("continue")
        if not continuation:
            return
|
||
|
||
def get_page(title=None, cont_dict=None) -> PageResult:
    """Look up a single wiki page by title.

    Args:
        title: Title of the page to look up.
        cont_dict: Unused; kept so existing callers keep working.

    Returns:
        A PageResult with the title and page id reported by the API.

    Raises:
        KeyError: If the API response does not describe an existing page
            (for example the title is missing or invalid).
        ValueError: If the response describes anything other than exactly
            one page.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """
    params = {
        "action": "query",
        "titles": title,
        "format": "json",
    }
    response = requests.get(
        "https://he.wikisource.org/w/api.php",
        params=params,
        timeout=30,  # never hang forever on a stalled connection
    )
    response.raise_for_status()
    j = response.json()
    # Exactly one title was requested, so exactly one page entry is expected.
    (page,) = j["query"]["pages"].values()
    if "pageid" not in page:
        # Missing/invalid titles come back as an entry without a page id;
        # fail with a message that names the title instead of a bare
        # KeyError('pageid').
        raise KeyError(f"No wiki page found for title {title!r}")
    return PageResult(title=page["title"], page_id=page["pageid"])
|
||
|
||
def get_revisions_for_page(page_title: str, cont_dict=None) -> Iterator[RevisionResult]:
    """Yield every revision of a wiki page, including its full text.

    Revisions are fetched lazily, one API batch at a time, in the order the
    API returns them, following continuation data until exhausted.

    Args:
        page_title: Title of the page whose history is requested.
        cont_dict: Optional continuation parameters (the ``continue`` object
            of a previous API response) to resume from.

    Yields:
        A RevisionResult per revision.

    Raises:
        KeyError: If the response lacks the expected revision data.
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """
    # api.php?action=query&prop=revisions&titles=AntiSpoof&formatversion=2&redirects=1 [try in ApiSandbox]
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": page_title,
        "formatversion": 2,
        "format": "json",
        "rvprop": "timestamp|comment|content|ids",
        "rvslots": "main",
        "rvlimit": "max",
    }
    continuation = dict(cont_dict) if cont_dict else {}
    # Iterate instead of recursing so long histories cannot exhaust the
    # recursion limit through a deep chain of nested generators.
    while True:
        response = requests.get(
            "https://he.wikisource.org/w/api.php",
            params={**params, **continuation},
            timeout=60,  # batches carry full page text, allow some slack
        )
        response.raise_for_status()
        j = response.json()
        # formatversion=2 returns pages as a list; one title -> one entry.
        (page,) = j["query"]["pages"]
        for r in page["revisions"]:
            yield RevisionResult(
                id=int(r["revid"]),
                # The API emits a trailing "Z" for UTC, which
                # datetime.fromisoformat only accepts from Python 3.11 on;
                # "+00:00" is equivalent and parses on every version.
                timestamp=datetime.fromisoformat(
                    r["timestamp"].replace("Z", "+00:00")
                ),
                # A revision whose edit summary was hidden has no "comment".
                comment=r.get("comment", ""),
                content=r["slots"]["main"]["content"],
            )
        # The whole "continue" object must be echoed back on the next
        # request; its absence means the history is complete.
        continuation = j.get("continue")
        if not continuation:
            return
|
||
|
||
import itertools | ||
|
||
|
||
def bla():
    """Debug helper: print the id of every revision of one hard-coded law page.

    Performs live API requests; intended for manual experimentation only.
    """
    # Earlier experiment, kept for reference: print the full revisions of
    # the first ten pages in the category.
    #   pages = itertools.islice(get_pages_in_category(), 10)
    #   for page in pages:
    #       for r in get_revisions_for_page(page.title):
    #           print(r)
    revisions = get_revisions_for_page(
        "חוק איסור פרסומת והגבלת השיווק של מוצרי טבק ועישון"
    )
    # Plain loop: the prints are side effects, no list needs to be built.
    for r in revisions:
        print(r.id)