/
metrics.py
317 lines (232 loc) Β· 9.01 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import print_function
import re
import datetime
from boltons.iterutils import unique
from hyperlink import parse as parse_url
import requests
MW_API_URL = parse_url('https://en.wikipedia.org/w/api.php')
REST_API_BASE_URL = parse_url('https://en.wikipedia.org/api/rest_v1/')
REF_API_BASE_URL = REST_API_BASE_URL.child('page', 'references')
from log import tlog
def format_datetime(dt):
    """Render a date or datetime as a MediaWiki-style ISO8601 UTC
    timestamp, e.g. '2018-11-29T20:09:07Z' (sub-second precision dropped).

    FIX: datetime.datetime is a subclass of datetime.date, so the
    original isinstance(dt, datetime.date) check also matched full
    datetimes and silently zeroed out their time-of-day, which would
    shift every point-in-time revision lookup to midnight.

    :param dt: a datetime.date or datetime.datetime (assumed UTC)
    :return: ISO8601 string with trailing 'Z'
    """
    if isinstance(dt, datetime.date) and not isinstance(dt, datetime.datetime):
        # plain date: promote to midnight of that day
        dt = datetime.datetime(dt.year, dt.month, dt.day, 0, 0, 0)
    return dt.isoformat().split('.')[0] + 'Z'
## PTArticle-based Metrics
def get_revid(pta):
    """Revision id of the article as of the PTArticle's timestamp."""
    ts = format_datetime(pta.timestamp)
    return _get_revid_at_timestamp(pta.title, ts)
def get_talk_revid(pta):
    """Revision id of the talk page as of the PTArticle's timestamp."""
    ts = format_datetime(pta.timestamp)
    return _get_revid_at_timestamp(pta.talk_title, ts)
def get_templates(pta):
    """Template names transcluded by the article revision."""
    return _get_templates(pta.rev_id)
def get_talk_templates(pta):
    """Template names transcluded by the talk-page revision."""
    return _get_templates(pta.talk_rev_id)
def get_assessments(pta):
    """Current PageAssessments for the article.

    NOTE: these are present-day assessments, not point-in-time
    (see _get_assessments).
    """
    return _get_assessments(pta.title)
def get_wikiprojects(pta):
    """Names of WikiProjects whose banner templates appear on the talk
    page, with the 'WikiProject ' prefix stripped."""
    projects = []
    for tmpl in pta.talk_templates:
        if 'wikiproject' in tmpl.lower():
            projects.append(tmpl.replace('WikiProject ', ''))
    return projects
def get_citations(pta):
    """Citation/reference data for the article revision."""
    return _get_citations(pta.title, pta.rev_id)
def get_wikidata_item(pta):
    """Wikidata item ids whose statements are used by the revision."""
    return _get_article_wikidata_item(pta.rev_id)
def article_exists(pta):
    """Whether the article had a revision at the target timestamp."""
    return bool(pta.rev_id)
def ref_count(pta):
    """Number of references in the article's citation data (0 if the
    citation data is missing/empty)."""
    citations = pta.citations
    if not citations:
        return 0
    # len(dict) == len(dict.keys())
    return len(citations['references_by_id'])
def ref_wikidata_count(pta):
    """Number of references whose rendered HTML links to a wikidata
    item (0 if citation data is missing)."""
    if not pta.citations:
        return 0
    refs = pta.citations['references_by_id'].values()
    return sum(1 for ref in refs
               if 'https://www.wikidata.org/wiki/Q' in ref['content']['html'])
def wikidata_item(pta):
    """Count of wikidata items associated with the article revision."""
    items = pta.wikidata_item
    return len(items)
def assessment_avg(pta, scale=None, wikiproject=None):
    # TODO
    # Unimplemented stub: intended to average the article's assessments,
    # optionally filtered to a single wikiproject and mapped onto a
    # numeric scale. The draft below is a string literal so it never
    # executes; note it references `assessment` (singular), which looks
    # like a typo for `assessments` -- fix when implementing.
    """assessments = pta.assessments.items()
    if wikiproject:
        assessments = [(p, a) for p, a in assessment if p == wikiproject]
    if scale:
        pass
    """
    # Currently always returns None.
    pass
def in_wikiproject(pta, wikiproject=None, case_sensitive=False):
    """Whether the article belongs to *wikiproject*.

    :param pta: PTArticle with a ``wikiprojects`` list
    :param wikiproject: project name to look for; None returns False
    :param case_sensitive: compare names exactly when True; default is
        case-insensitive
    :return: bool
    """
    if wikiproject is None:
        # FIX: the original called wikiproject.lower() unconditionally
        # on the case-insensitive path, raising AttributeError when the
        # default None was left in place.
        return False
    names = pta.wikiprojects
    if not case_sensitive:
        names = [w.lower() for w in names]
        wikiproject = wikiproject.lower()
    # set membership replaces boltons' unique(): deduplication cannot
    # change the result of an `in` test
    return wikiproject in set(names)
def template_count(pta, template_name=None, template_names=None,
                   template_regex=None, case_sensitive=False):
    """Count templates transcluded by the article.

    With no filter, returns the total number of template calls
    (duplicates included). With template_name/template_names, returns
    how many of those names are transcluded (distinct names, not call
    counts). With template_regex, returns how many template names match
    the pattern.

    :raises RuntimeError: if both template_name and template_names
        are given
    """
    article_tmpl_names = pta.templates
    if template_name:
        if template_names:
            raise RuntimeError('template_count metric expected one of'
                               ' "template_name" or "template_names" arg, not both')
        template_names = [template_name]
    if template_regex:
        # FIX: apply the regex filter before the bare-count early
        # return below; previously a template_regex passed without
        # template_names was silently ignored and the total count was
        # returned instead.
        pattern = re.compile(template_regex)
        pool = article_tmpl_names
        if not case_sensitive:
            # set() replaces boltons' unique(): only the count of
            # distinct lowered names matters here
            pool = set([t.lower() for t in pool])
        return len([t for t in pool if pattern.search(t)])
    if not template_names:
        return len(article_tmpl_names)
    if not case_sensitive:
        article_tmpl_names = [t.lower() for t in article_tmpl_names]
        template_names = [tn.lower() for tn in template_names]
    return len(set(template_names) & set(article_tmpl_names))
##
@tlog.wrap('info', inject_as='act')
def get_json(url, params=None, act=None):  # TODO: option for validating status code
    """GET *url* with *params* merged into its query string and return
    the decoded JSON response body.

    :param url: a hyperlink URL object (supports .set())
    :param params: optional mapping of extra query parameters
    :param act: injected by tlog.wrap; used to record the final URL
    :return: parsed JSON (dict/list)
    """
    params = dict(params or {})
    for k, v in params.items():
        url = url.set(unicode(k), unicode(v))
    if act:
        act['url'] = unicode(url)
    # NOTE(review): params are applied twice -- baked into `url` above
    # AND passed to requests.get, which may duplicate query parameters
    # in the final request; confirm whether passing params here is
    # intentional before changing.
    resp = requests.get(url, params=params)
    return resp.json()
def get_wapi_json(params):
    """Call the MediaWiki action API with *params*, returning JSON."""
    return get_json(MW_API_URL, params)
def _get_revid_at_timestamp(title, timestamp):
    """Get the revision id of *title* as of *timestamp*.

    :param title: a single page title; note, the MW API only supports a
        single title in ``titles`` when using ``rvstart``
    :param timestamp: ISO8601 string, e.g. '2018-11-29T20:09:07Z'
    :return: the revision id, or None when the page is missing or has
        no revision at or before the timestamp
    """
    resp = get_wapi_json(params={
        'action': 'query',
        'prop': 'revisions',
        'format': 'json',
        'titles': title,
        'rvlimit': '1',
        'rvstart': timestamp
    })
    try:
        pages = resp['query']['pages'].values()
        revids = [page['revisions'][0]['revid'] for page in pages]
        # FIX: next(iter(...)) replaces .values()[0], which only works
        # on Python 2; also returns None instead of raising IndexError
        # when no page came back.
        return next(iter(revids), None)
    except (KeyError, IndexError):
        # missing page ('missing' entries carry no 'revisions' key) or
        # an empty revisions list
        return None
def _get_templates(oldid):
    """Get the list of templates transcluded by a given revision.

    NOTE: we parse the info from the transclusion expansion report from
    the parse API, which might be unstable.

    :param oldid: revision id; falsy values short-circuit to []
    :return: list of template names with the 'Template:' prefix stripped
    :raises KeyError: when the API response has no parse/templates data
    """
    if not oldid:
        return []
    revision_resp = get_wapi_json(params={
        'action': 'parse',
        'oldid': oldid,
        'format': 'json',
    })
    # FIX: removed a `try/except KeyError: raise` wrapper that only
    # re-raised the exception unchanged -- behavior is identical, a
    # missing key still propagates to the caller.
    templates = revision_resp['parse']['templates']
    return [t['*'].replace('Template:', '') for t in templates]
def _get_article_wikidata_item(oldid):
    """Wikidata item ids whose statements ('S' aspect) are used by the
    given revision.

    :param oldid: revision id
    :return: list of item ids (e.g. ['Q123']), or [] when the revision
        has no wbentityusage data
    """
    params = {'action': 'query',
              'prop': 'wbentityusage',
              'revids': oldid,
              'format': 'json'}
    resp = get_wapi_json(params)
    try:
        # FIX: list(...)[0] replaces .values()[0], which only works on
        # Python 2; IndexError added for an empty pages mapping.
        pages = list(resp['query']['pages'].values())
        wbentities = pages[0]['wbentityusage']
    except (KeyError, IndexError):
        return []
    return [q for (q, val) in wbentities.items() if 'S' in val['aspects']]
def _get_assessments(title):
    """Current PageAssessments for *title*.

    Can't actually get assessments from past versions of an article;
    see: https://phabricator.wikimedia.org/T211485

    :param title: page title
    :return: mapping of wikiproject name to assessment info, or {} when
        no assessments are available
    """
    params = {'action': 'query',
              'prop': 'pageassessments',
              'titles': title,
              'formatversion': 2,
              'format': 'json'}
    resp = get_wapi_json(params)
    try:
        return resp['query']['pages'][0]['pageassessments']
    except (KeyError, IndexError):
        # FIX: with formatversion=2 'pages' is a list, so an empty
        # result raises IndexError, not just KeyError -- treat both as
        # "no assessments".
        return {}
def check_infobox(template_calls):
    """True if any template call looks like an infobox."""
    return any('infobox' in call.lower() for call in template_calls)
def check_infobox_wikidata(template_calls):
    """True if any template call is a wikidata-driven infobox, e.g.
    'Infobox person/Wikidata'."""
    pattern = re.compile(r'infobox(.*)\/wikidata')
    return any(pattern.search(call.lower()) for call in template_calls)
def get_wikiproject(wikiproject, talk_revid):
    """True if the talk page at *talk_revid* carries a banner for
    *wikiproject* (exact, case-sensitive name match)."""
    if not talk_revid:
        return False
    banners = [t.replace('WikiProject ', '')
               for t in _get_templates(talk_revid)
               if 'wikiproject' in t.lower()]
    return wikiproject in banners
def _get_citations(title, old_id):
    """Fetch the reference/citation structure for a revision from the
    REST references endpoint.

    NOTE: This API was deprecated:
    https://phabricator.wikimedia.org/T247991

    :param title: page title (spaces converted to underscores)
    :param old_id: revision id
    :return: parsed JSON from the references endpoint
    """
    title = title.replace(' ', '_') # rest endpoint doesn't like url encoded spaces
    api_url = REF_API_BASE_URL.child(title, unicode(old_id))
    citations = get_json(api_url)
    return citations
def get_citation_stats(title, oldid):
    """Reference counts and the wikidata-linked share for a revision.

    :return: dict with reference_count, reference_wikidata_count, and
        reference_wikidata_percent (all zero when no citation data is
        available for the revision).
    """
    citations = _get_citations(title, oldid)
    try:
        refs = citations['references_by_id']
    except KeyError:
        return {'reference_count': 0,
                'reference_wikidata_count': 0,
                'reference_wikidata_percent': 0}
    total = len(refs)
    wd_total = len([r for r in refs.values()
                    if 'https://www.wikidata.org/wiki/Q'
                    in r['content']['html']])
    if total:
        # * 1.0 forces true division under Python 2
        wd_percent = wd_total / (total * 1.0)
    else:
        wd_percent = 0
    return {'reference_count': total,
            'reference_wikidata_count': wd_total,
            'reference_wikidata_percent': wd_percent}
def get_all_stats(title, wikiproject, date):
    """Gather every point-in-time metric for *title* as of *date*, plus
    assessment info for *wikiproject*.

    :param title: article title
    :param wikiproject: wikiproject name for membership/quality lookups
    :param date: ISO8601 timestamp string
    :return: dict of metric name to value, with request metadata
    """
    talk_title = 'Talk:' + title
    revid = _get_revid_at_timestamp(title, date)
    talk_revid = _get_revid_at_timestamp(talk_title, date)
    templates = _get_templates(revid)
    assessments = _get_assessments(title)
    project_assessment = assessments.get(wikiproject, {})
    return {'wikipedia_exists': bool(revid),
            'wikidata_exists': _get_article_wikidata_item(revid),
            'infobox': check_infobox(templates),
            'infobox_wikidata': check_infobox_wikidata(templates),
            'citation_stats': get_citation_stats(title, revid),
            'in_wikiproject': get_wikiproject(wikiproject, talk_revid),
            'quality': project_assessment.get('class'),
            'importance': project_assessment.get('importance'),
            'wikipedia_talk_exists': bool(talk_revid),
            'metadata': {'target_date': date,
                         'revision': revid,
                         'talk_revision': talk_revid}}
if __name__ == '__main__':
    # Smoke test: fetch stats for one article at a recent and a
    # historical timestamp and print both result dicts.
    title = 'Coffee'
    wikiproject = 'Newspapers'
    date = '2018-12-05T20:00:00Z'
    stats = get_all_stats(title, wikiproject, date)
    print(stats)
    old_date = '2010-12-05T20:00:00Z'
    stats = get_all_stats(title, wikiproject, old_date)
    print(stats)
    # FIX: removed a leftover `import pdb;pdb.set_trace()` debugging
    # hook that would drop any non-interactive run into the debugger.