-
Notifications
You must be signed in to change notification settings - Fork 26
/
mediacounts-stats.py
executable file
·139 lines (118 loc) · 4.15 KB
/
mediacounts-stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import csv, json, argparse, sys, datetime, os, re, time
# Requires wikitools 1.3+ to use generators
try:
from wikitools import wiki, category, api
except ImportError:
HAS_WIKITOOLS = False
else:
HAS_WIKITOOLS = True
# Column names of a mediacounts TSV row, in file order. Taken from
# http://dumps.wikimedia.org/other/mediacounts/README.txt
# The "reservedN" placeholders carry the 1-based number of their column.
FIELDS = [
    # Columns 1-7: file path and overall counters.
    "filename",
    "total_response_bytes",
    "total_transfers",
    "total_transfers_raw",
    "total_transfers_audio",
    "reserved6",
    "reserved7",
    # Columns 8-16: image counters.
    "total_transfers_image",
    "total_transfers_image_0x199",
    "total_transfers_image_200x399",
    "total_transfers_image_400x599",
    "total_transfers_image_600x799",
    "total_transfers_image_800x999",
    "total_transfers_image_1000plus",
    "reserved15",
    "reserved16",
    # Columns 17-22: movie counters.
    "total_transfers_movie",
    "total_transfers_movie_0x239",
    "total_transfers_movie_240x479",
    "total_transfers_movie_480plus",
    "reserved21",
    "reserved22",
    # Columns 23-25: referer counters.
    "total_transfers_refer_wmf",
    "total_transfers_refer_nonwmf",
    "total_transfers_refer_invalid",
]

# Parsed command-line namespace. Assigned by main(); read by log(), process()
# and queries(). Stays None until main() has run.
args = None
def init_argparse(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, so existing
            callers that pass nothing keep their behaviour.

    Returns:
        argparse.Namespace with the attributes ``input``, ``output``,
        ``query``, ``queryfile``, ``category``, ``verbose`` and ``progress``.
        ``input`` and ``output`` are mandatory; argparse exits with a usage
        error when either is missing.
    """
    parser = argparse.ArgumentParser(
        description='Get mediacounts for a specific media file or list of files'
    )

    # Mandatory input/output paths.
    parser.add_argument('-i', '--input', help="Path to TSV file", required=True)
    parser.add_argument('-o', '--output', help="Path to output CSV file", required=True)

    # Query sources. They are not declared mutually exclusive here; queries()
    # decides which one wins when several are given.
    parser.add_argument('-q', '--query', help="Media file to search for")
    parser.add_argument('-qf', '--queryfile', help="Path to a newline separated file of files to search for")
    parser.add_argument('-cat', '--category', help="Name of a Wikimedia Commons category of files to search for")

    # Output verbosity switches.
    parser.add_argument('-v', '--verbose', help="Output verbose results", action="store_true")
    parser.add_argument('-p', '--progress', help="Show progress", action="store_true")

    return parser.parse_args(argv)
def log(msg):
    """Print *msg* to stdout when verbose output is enabled.

    Reads the module-level ``args`` namespace. Nothing is printed when
    ``args`` is still ``None`` (command line not parsed yet) or when
    ``--verbose`` was not given.

    Args:
        msg: The text to print.
    """
    # Guard against being called before the command line has been parsed,
    # which would otherwise raise AttributeError on None.
    if args is not None and args.verbose:
        # The single-argument call form prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print(msg)
def process():
    """Copy the TSV rows whose file matches a query into the output CSV.

    Reads its settings from the module-level ``args``: ``input`` (TSV path),
    ``output`` (CSV path), ``progress`` and the query options consumed by
    ``queries()``. The ``FIELDS`` header row is written once, immediately
    before the first matching row, so the output stays empty when nothing
    matches.

    Raises:
        SystemExit: propagated from ``queries()`` when no usable query was
            given. This now happens before the output file is opened, so an
            existing output file is not truncated in that case.
    """
    # Resolve every query up front: each TSV line is tested against the set,
    # and queries() may call sys.exit(), which must happen before the output
    # file is created. Empty names are dropped so they cannot match rows whose
    # path is empty or ends in "/".
    wanted = frozenset(name for name in queries() if name)

    # queries() gives --queryfile and --query precedence over --category, and
    # only the category source yields full upload paths. Compare against the
    # whole first column only when the category is the source actually used;
    # otherwise a combined "-q ... -cat ..." run could never match anything.
    match_full_path = bool(args.category) and not (args.queryfile or args.query)

    header_written = False
    consumed = 0  # length of the lines read so far, used for progress only

    # Context managers close both files even if an error interrupts the loop.
    with open(args.input) as tsvfile, open(args.output, "w") as csvfile:
        tsvfilesize = os.path.getsize(args.input)
        writer = csv.writer(csvfile)

        for index, line in enumerate(tsvfile):
            # Progress is derived from the accumulated line lengths rather
            # than tsvfile.tell(): tell() is unreliable while iterating a file
            # on Python 2 and raises on Python 3. The figure is an estimate
            # wherever len(line) differs from the byte count.
            if args.progress and tsvfilesize and index % 200000 == 0:
                percent = consumed / float(tsvfilesize)
                print("{0:.2f}%".format(percent * 100))
            consumed += len(line)

            # Strip the line terminator first, otherwise it stays attached to
            # the last column and ends up inside a quoted CSV field.
            row = line.rstrip("\r\n").split("\t")

            if match_full_path:
                filename = row[0]
            else:
                filename = row[0].split("/")[-1]

            if filename not in wanted:
                continue

            log("MATCH " + row[0])

            if not header_written:
                writer.writerow(FIELDS)
                header_written = True

            writer.writerow(row)
def queries():
    """Yield the names (or upload paths) to look for in the TSV.

    The source is chosen from the module-level ``args`` in this order:

    1. ``--queryfile``: one name per line, surrounding whitespace stripped,
       blank lines skipped.
    2. ``--query``: the single name given on the command line.
    3. ``--category``: every file of the Wikimedia Commons category, yielded
       as its upload URL with a leading ``https://upload.wikimedia.org``
       removed. Requires the wikitools package.

    Because this is a generator, nothing runs (including the ``sys.exit``
    calls) until the first item is requested.

    Raises:
        SystemExit: if no query option was given, or if ``--category`` was
            given but wikitools could not be imported.
    """
    if args.queryfile:
        # The context manager closes the query file once it is exhausted.
        with open(args.queryfile) as queryfile:
            for line in queryfile:
                name = line.strip()
                if name:
                    yield name
        return

    if args.query:
        yield args.query
        return

    if not args.category:
        sys.exit("No query given")

    if not HAS_WIKITOOLS:
        sys.exit("-cat option given, but wikitools package is not present, see < https://github.com/alexz-enwp/wikitools >")

    upload_host = "https://upload.wikimedia.org"
    site = wiki.Wiki("https://commons.wikimedia.org/w/api.php")
    params = {
        'action': 'query',
        'prop': 'imageinfo',
        'iiprop': 'url',
        'generator': 'categorymembers',
        'gcmtitle': 'Category:' + args.category,
        'gcmnamespace': '6',
        'gcmprop': 'title'
    }
    req = api.APIRequest(site, params)

    for data in req.queryGen():
        # NOTE(review): a response for an empty result set presumably has no
        # 'query' key; treat that as "no pages" instead of raising KeyError.
        pages = data.get('query', {}).get('pages', {})
        for page in pages.values():
            # A page entry can come back without 'imageinfo' (for example a
            # description page that has no file); skip it instead of raising.
            imageinfo = page.get('imageinfo')
            if not imageinfo:
                continue
            url = imageinfo[0]['url']
            # Remove the host as a literal leading prefix only. The previous
            # regex had unescaped dots and was not anchored to the start.
            if url.startswith(upload_host):
                url = url[len(upload_host):]
            yield url
def main():
    """Parse the command line, run the extraction and log the elapsed time.

    Stores the parsed namespace in the module-level ``args`` so that log(),
    process() and queries() can read it.
    """
    global args
    args = init_argparse()

    started = time.time()
    log("Starting " + datetime.datetime.now().isoformat())

    process()

    log("Ending " + datetime.datetime.now().isoformat())
    elapsed = round(time.time() - started, 2)
    log("Query took %s seconds" % elapsed)


if __name__ == "__main__":
    main()