-
Notifications
You must be signed in to change notification settings - Fork 1
/
article_fetcher.py
executable file
·91 lines (82 loc) · 3.58 KB
/
article_fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# JPxG 2023 01 04
import datetime
import requests
import json
import urllib
import luadata
import sys
import weg_ver
# python -m pip install --upgrade luadata
# This is a secret tool that will come in handy later.
# Base page name of the Signpost. fetch() uses it both to build the
# allpages prefix query and to compute how many leading characters of a
# returned title to skip before the issue date.
signpost = "Wikipedia Signpost"
def fetch(year_start=int(datetime.datetime.now().year), year_end=int(datetime.datetime.now().year), month_range=13, format="dict"):
    """Fetch Signpost page titles from the English Wikipedia API.

    One ``list=allpages`` query is issued per (year, month) prefix, e.g.
    ``Wikipedia Signpost/2022-08``, in namespace 4. The returned titles look
    like ``Wikipedia:Wikipedia Signpost/2022-08-01/Essay`` and are then
    grouped by year using the same shape as Module:Signpost's index, see
    https://en.wikipedia.org/wiki/Module:Signpost/Index/2022

    Args:
        year_start: First year to query (inclusive).
        year_end: Last year to query (inclusive).
        month_range: Exclusive upper bound on the month number; the default
            of 13 covers months 1 through 12.
        format: ``"array"`` returns the flat list of page titles exactly as
            the API returned them; any other value returns the per-year
            dict.

    Returns:
        For ``format == "array"``: a list of page title strings.
        Otherwise: a dict mapping each year (as a string) to a list of
        ``{"date": "YYYY-MM-DD", "subpage": "<department>"}`` dicts. Issue
        main pages (empty subpage) and the legacy "SPV" single-page
        compilations are left out of the dict.

    Raises:
        requests.exceptions.RequestException: On network failure or when a
            request exceeds the timeout.

    Note:
        The year defaults are evaluated once, when the function is defined,
        not on every call. Pass the years explicitly in long-running
        processes.
    """
    # Seconds to wait on a single request before giving up, so a stalled
    # connection cannot hang the caller forever.
    request_timeout = 30

    headers = weg_ver.headers()
    # All of the queries to send to the server and get a PrefixIndex-like list. Not returned.
    page_list_array = []
    # Simple array of the pages, as they come from PrefixIndex.
    articles_array = []
    # Formatted dict of all the pages, as they are in Module:Signpost.
    all_articles = {}

    for year in range(year_start, year_end + 1):
        all_articles[str(year)] = []
        for month in range(1, month_range):
            # Months are zero-padded in issue dates (2022-08, not 2022-8).
            page_list_array.append(
                f"https://en.wikipedia.org/w/api.php?format=json&action=query&list=allpages&apprefix={signpost}%2F{year}-{month:02d}&aplimit=max&apnamespace=4"
            )

    # Now we hit the API to populate the articles_array.
    for url in page_list_array:
        # Extra query parameters for follow-up requests; empty on the first
        # request, afterwards whatever the API handed back under "continue".
        continuation = {}
        while True:
            response = requests.get(url, headers=headers, params=continuation, timeout=request_timeout)
            if response.status_code != 200:
                print(f'Error retrieving {url}')
                break
            data = response.json()
            # The MediaWiki API reports its own errors with HTTP 200 and no
            # "query" key, so the status code alone is not enough.
            pages = data.get("query", {}).get("allpages")
            if pages is None:
                print(f'Error retrieving {url}')
                break
            print(f'Retrieved {url}')
            articles_array.extend(item["title"] for item in pages)
            # aplimit caps a single response; keep going until the API stops
            # returning a continuation token so nothing is silently dropped.
            continuation = data.get("continue")
            if not continuation:
                break

    # Now restructure the flat, chronologically ordered title list into the
    # per-year layout used by Module:Signpost.
    # Wikipedia:Wikipedia Signpost/2022-08-01/Essay
    # |<------- prefix_len ------>|<- date ->| department
    prefix_len = len("Wikipedia:") + len(signpost) + len("/")
    date_len = len("YYYY-MM-DD")
    for article in articles_array:
        issue_date = article[prefix_len:prefix_len + date_len]
        issue_y = issue_date[:4]
        # Skip the "/" that follows the date to get just the department.
        article_dept = article[prefix_len + date_len + 1:]
        # No subpage name means it's the main page for that issue.
        # "SPV" is the old (2006esque) way that single issues were compiled
        # (new ones are Wikipedia Signpost/Single/YYYY-MM-DD).
        if article_dept in ("", "SPV"):
            continue
        # An earlier version nested the departments under each issue date;
        # the flat per-year list is kept because it matches Module:Signpost.
        all_articles[issue_y].append({"date": issue_date, "subpage": article_dept})

    if format == "array":
        return articles_array
    return all_articles