In [4]:
import mwxml
import bz2
import gzip
import itertools
import json
from mwxml import Dump
import csv
import mwparserfromhell as mwparser
from collections import Counter


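This notebook counts the templates used on each Wikipedia page: it reads a bz2-compressed XML dump, parses the wikitext of every revision, and writes `(template, count)` rows to a gzip-compressed CSV file.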

In [44]:
def clean_template_name(x):
    """Return a normalized name for a mwparserfromhell ``Template`` node.

    The node's ``name`` is converted to ``str``, lower-cased, stripped of
    leading/trailing whitespace, and every remaining space character is
    turned into an underscore.
    """
    lowered = str(x.name).lower()
    trimmed = lowered.strip()
    # Splitting on a literal space and re-joining with "_" maps each space to
    # exactly one underscore (runs of spaces become runs of underscores).
    return "_".join(trimmed.split(" "))

def count_templates(x):
    """Count how many times each template is used in a revision.

    Parameters
    ----------
    x : object with a ``text`` attribute
        The revision to analyse; ``x.text`` is handed to
        ``mwparserfromhell.parse``.

    Returns
    -------
    list of (str, int)
        One ``(template_name, count)`` pair per distinct template, in order
        of first appearance, with names normalized by
        ``clean_template_name``.
    """
    # Read the text from the argument itself rather than from a notebook
    # global named ``revision``: the result then depends only on the input,
    # regardless of cell execution order or which process runs the call.
    parsed = mwparser.parse(x.text)
    names = (clean_template_name(template) for template in parsed.filter_templates())
    return list(Counter(names).items())
    

In [61]:
import tqdm 
from multiprocessing import Pool, cpu_count

def iter_revisions(dump, max_revisions=None):
    """Yield the revisions of every page in ``dump``, page after page.

    Pages are read from ``dump.pages`` wrapped in a ``tqdm`` progress bar, so
    the bar advances once per page. When ``max_revisions`` is truthy the
    generator ends as soon as that many revisions have been yielded; ``None``
    (or ``0``) means no limit.
    """
    pages = tqdm.tqdm(dump.pages)
    # Lazily flatten the page -> revision nesting into one stream.
    all_revisions = itertools.chain.from_iterable(pages)
    for seen, revision in enumerate(all_revisions):
        if max_revisions and seen >= max_revisions:
            return
        yield revision
        

def article_templates(input_file, output_file, n=None, workers=cpu_count() - 1, chunksize=1):
    """Count template usage for each revision of a dump and write it to a CSV.

    Parameters
    ----------
    input_file : str
        Path to a bz2-compressed MediaWiki XML dump.
    output_file : str
        Path of the gzip-compressed CSV file to create. It receives a
        ``template,count`` header followed by the rows returned by
        ``count_templates`` for each revision, in dump order.
    n : int, optional
        Maximum number of revisions to process (forwarded to
        ``iter_revisions``); ``None`` processes the whole dump.
    workers : int, optional
        Number of worker processes. Values below 1 are raised to 1;
        ``None`` lets ``multiprocessing.Pool`` pick its own default.
    chunksize : int, optional
        Number of revisions sent to a worker per task.

    Notes
    -----
    The rows carry no page or revision identifier, so counts coming from
    different revisions are not distinguishable in the output file.
    ``count_templates`` must be resolvable inside the worker processes
    (with the "fork" start method a notebook-defined function is).
    """
    import types  # local import: only used to build the small picklable payloads below

    fields = ("template", "count")
    # The default ``cpu_count() - 1`` evaluates to 0 on a single-core machine,
    # which Pool rejects with ValueError.
    processes = None if workers is None else max(1, workers)
    with Pool(processes) as pool:
        # Explicit UTF-8 so the result does not depend on the locale; newline=""
        # is what the csv module asks for on files it writes to.
        with bz2.open(input_file, "rt", encoding="utf-8") as fin, \
                gzip.open(output_file, "wt", encoding="utf-8", newline="") as fout:
            dump = Dump.from_file(fin)
            writer = csv.writer(fout)
            writer.writerow(fields)
            # Only the revision text is needed for counting, so ship just that
            # to the workers: the payload stays small and is always picklable.
            payloads = (
                types.SimpleNamespace(text=revision.text)
                for revision in iter_revisions(dump, max_revisions=n)
            )
            # imap yields results lazily and in input order, so the CSV rows
            # follow the order of the dump.
            for rows in pool.imap(count_templates, payloads, chunksize=chunksize):
                writer.writerows(rows)
            

In [63]:
import itertoolz

ModuleNotFoundError: No module named 'itertoolz'

In [62]:
# Input dump (bz2-compressed XML); per its file name, the 2018-09-01 enwiki
# "pages-articles" dump.
input_file = "../data/dumps.wikipedia.org/enwiki/20180901/enwiki-20180901-pages-articles.xml.bz2"
# Destination of the gzip-compressed CSV with the template counts.
output_file = "../data/enwiki-articles-templates.csv.gz"
# Cap on the number of revisions to process. None runs the whole dump; use a
# small integer (e.g. 100) for a quick trial run.
n = None

article_templates(input_file, output_file, workers=2, chunksize=5000, n=n)



0it [00:00, ?it/s][A[A

1it [00:00,  6.02it/s][A[A

2it [00:00,  6.17it/s][A[A

4it [00:00,  6.88it/s][A[A

5it [00:00,  6.95it/s][A[A

7it [00:00,  7.54it/s][A[A

8it [00:01,  7.41it/s][A[A

10it [00:01,  7.90it/s][A[A

11it [00:01,  7.65it/s][A[A

13it [00:01,  7.97it/s][A[A

15it [00:01,  8.25it/s][A[A

16it [00:01,  7.69it/s][A[A

18it [00:02,  8.01it/s][A[A

19it [00:02,  7.71it/s][A[A

21it [00:02,  7.96it/s][A[A

22it [00:02,  7.59it/s][A[A

24it [00:02,  8.00it/s][A[A

25it [00:03,  7.30it/s][A[A

27it [00:03,  7.51it/s][A[A

28it [00:03,  6.96it/s][A[A

30it [00:03,  7.28it/s][A[A

31it [00:03,  6.99it/s][A[A

33it [00:04,  7.36it/s][A[A

34it [00:04,  7.26it/s][A[A

36it [00:04,  7.75it/s][A[A

37it [00:04,  7.54it/s][A[A

39it [00:04,  7.96it/s][A[A

40it [00:05,  7.68it/s][A[A

42it [00:05,  8.12it/s][A[A

44it [00:05,  8.50it/s][A[A

45it [00:05,  8.03it/s][A[A

47it [00:05,  8.38it/s][A[A

48it [00:05,  7.6

391it [00:49,  8.30it/s][A[A

392it [00:49,  7.96it/s][A[A

394it [00:49,  8.26it/s][A[A

395it [00:50,  7.38it/s][A[A

397it [00:50,  7.48it/s][A[A

399it [00:50,  7.71it/s][A[A

400it [00:50,  7.24it/s][A[A

402it [00:50,  7.55it/s][A[A

403it [00:51,  7.23it/s][A[A

405it [00:51,  7.71it/s][A[A

406it [00:51,  7.56it/s][A[A

408it [00:51,  7.81it/s][A[A

409it [00:51,  7.57it/s][A[A

411it [00:52,  7.99it/s][A[A

412it [00:52,  7.78it/s][A[A

414it [00:52,  8.21it/s][A[A

415it [00:52,  7.88it/s][A[A

417it [00:52,  8.13it/s][A[A

419it [00:53,  8.30it/s][A[A

420it [00:53,  7.60it/s][A[A

422it [00:53,  7.89it/s][A[A

423it [00:53,  7.41it/s][A[A

425it [00:53,  7.75it/s][A[A

426it [00:53,  7.19it/s][A[A

428it [00:54,  7.48it/s][A[A

429it [00:54,  7.01it/s][A[A

430it [00:54,  7.60it/s][A[A

431it [00:54,  7.07it/s][A[A

432it [00:54,  6.71it/s][A[A

434it [00:55,  7.13it/s][A[A

435it [00:55,  6.69it/s][A[A

437it [0

775it [01:38,  7.44it/s][A[A

777it [01:38,  7.89it/s][A[A

778it [01:38,  7.57it/s][A[A

780it [01:38,  8.01it/s][A[A

782it [01:38,  8.23it/s][A[A

783it [01:39,  7.62it/s][A[A

785it [01:39,  7.87it/s][A[A

786it [01:39,  7.55it/s][A[A

788it [01:39,  7.88it/s][A[A

789it [01:39,  7.38it/s][A[A

791it [01:40,  7.62it/s][A[A

792it [01:40,  7.09it/s][A[A

794it [01:40,  7.43it/s][A[A

795it [01:40,  6.97it/s][A[A

797it [01:40,  7.27it/s][A[A

798it [01:41,  6.80it/s][A[A

800it [01:41,  7.16it/s][A[A

801it [01:41,  6.77it/s][A[A

803it [01:41,  7.15it/s][A[A

805it [01:42,  7.38it/s][A[A

806it [01:42,  6.89it/s][A[A

808it [01:42,  7.27it/s][A[A

809it [01:42,  6.97it/s][A[A

811it [01:42,  7.49it/s][A[A

812it [01:42,  7.40it/s][A[A

814it [01:43,  7.90it/s][A[A

815it [01:43,  7.66it/s][A[A

817it [01:43,  8.07it/s][A[A

818it [01:43,  7.79it/s][A[A

820it [01:43,  8.21it/s][A[A

821it [01:43,  7.82it/s][A[A

822it [0

1155it [02:26,  7.06it/s][A[A

1157it [02:26,  7.45it/s][A[A

1158it [02:26,  6.95it/s][A[A

1160it [02:26,  7.27it/s][A[A

1162it [02:27,  7.45it/s][A[A

1163it [02:27,  6.95it/s][A[A

1165it [02:27,  7.27it/s][A[A

1166it [02:27,  6.61it/s][A[A

1168it [02:28,  6.99it/s][A[A

1169it [02:28,  6.61it/s][A[A

1171it [02:28,  6.96it/s][A[A

1172it [02:28,  6.70it/s][A[A

1174it [02:28,  7.05it/s][A[A

1175it [02:29,  6.93it/s][A[A

1177it [02:29,  7.48it/s][A[A

1178it [02:29,  7.38it/s][A[A

1180it [02:29,  7.86it/s][A[A

1182it [02:29,  8.25it/s][A[A

1183it [02:29,  7.89it/s][A[A

1185it [02:30,  8.25it/s][A[A

1186it [02:30,  7.84it/s][A[A

1188it [02:30,  8.25it/s][A[A

1189it [02:30,  7.80it/s][A[A

1191it [02:30,  7.90it/s][A[A

1192it [02:31,  7.63it/s][A[A

1194it [02:31,  8.03it/s][A[A

1195it [02:31,  7.73it/s][A[A

1197it [02:31,  8.15it/s][A[A

1198it [02:31,  7.78it/s][A[A

1200it [02:31,  8.03it/s][A[A

1202it [02

1534it [03:14,  7.52it/s][A[A

1535it [03:14,  7.01it/s][A[A

1537it [03:14,  7.25it/s][A[A

1538it [03:14,  6.79it/s][A[A

1540it [03:15,  7.11it/s][A[A

1541it [03:15,  6.72it/s][A[A

1543it [03:15,  7.00it/s][A[A

1544it [03:15,  6.72it/s][A[A

1546it [03:15,  7.34it/s][A[A

1547it [03:16,  7.30it/s][A[A

1549it [03:16,  7.78it/s][A[A

1551it [03:16,  8.25it/s][A[A

1552it [03:16,  7.85it/s][A[A

1554it [03:16,  8.24it/s][A[A

1555it [03:17,  7.89it/s][A[A

1557it [03:17,  8.31it/s][A[A

1558it [03:17,  7.91it/s][A[A

1560it [03:17,  8.29it/s][A[A

1561it [03:17,  7.94it/s][A[A

1563it [03:17,  8.34it/s][A[A

1564it [03:18,  7.92it/s][A[A

1566it [03:18,  8.33it/s][A[A

1567it [03:18,  7.93it/s][A[A

1569it [03:18,  8.33it/s][A[A

1571it [03:18,  8.52it/s][A[A

1572it [03:19,  7.55it/s][A[A

1574it [03:19,  7.69it/s][A[A

1575it [03:19,  6.73it/s][A[A

1577it [03:19,  7.16it/s][A[A

1578it [03:19,  6.98it/s][A[A

1580it [03

1912it [04:02,  6.98it/s][A[A

1913it [04:02,  6.87it/s][A[A

1915it [04:02,  7.39it/s][A[A

1917it [04:02,  7.90it/s][A[A

1918it [04:02,  7.69it/s][A[A

1920it [04:03,  8.16it/s][A[A

1921it [04:03,  7.46it/s][A[A

1922it [04:03,  8.00it/s][A[A

1923it [04:03,  7.65it/s][A[A

1924it [04:03,  7.43it/s][A[A

1926it [04:03,  7.83it/s][A[A

1927it [04:04,  7.62it/s][A[A

1929it [04:04,  7.97it/s][A[A

1930it [04:04,  7.68it/s][A[A

1932it [04:04,  8.10it/s][A[A

1933it [04:04,  7.56it/s][A[A

1935it [04:05,  7.90it/s][A[A

1936it [04:05,  7.11it/s][A[A

1938it [04:05,  7.30it/s][A[A

1940it [04:05,  7.52it/s][A[A

1941it [04:05,  6.89it/s][A[A

1943it [04:06,  7.28it/s][A[A

1944it [04:06,  6.93it/s][A[A

1946it [04:06,  7.40it/s][A[A

1947it [04:06,  7.22it/s][A[A

1949it [04:06,  7.70it/s][A[A

1950it [04:07,  7.47it/s][A[A

1952it [04:07,  7.85it/s][A[A

1953it [04:07,  7.44it/s][A[A

1954it [04:07,  7.93it/s][A[A

1955it [04

2288it [04:49,  7.61it/s][A[A

2289it [04:50,  8.05it/s][A[A

2290it [04:50,  7.53it/s][A[A

2291it [04:50,  7.33it/s][A[A

2293it [04:50,  7.74it/s][A[A

2294it [04:50,  7.56it/s][A[A

2296it [04:50,  7.96it/s][A[A

2298it [04:51,  8.30it/s][A[A

2299it [04:51,  7.90it/s][A[A

2301it [04:51,  8.29it/s][A[A

2302it [04:51,  7.83it/s][A[A

2304it [04:51,  7.99it/s][A[A

2305it [04:52,  7.22it/s][A[A

2307it [04:52,  7.43it/s][A[A

2308it [04:52,  6.92it/s][A[A

2310it [04:52,  7.31it/s][A[A

2311it [04:52,  7.12it/s][A[A

2313it [04:53,  7.66it/s][A[A

2314it [04:53,  7.45it/s][A[A

2316it [04:53,  7.84it/s][A[A

2318it [04:53,  8.00it/s][A[A

2319it [04:53,  7.66it/s][A[A

2321it [04:54,  8.06it/s][A[A

2322it [04:54,  7.76it/s][A[A

2324it [04:54,  8.15it/s][A[A

2325it [04:54,  7.63it/s][A[A

2327it [04:54,  7.82it/s][A[A

2328it [04:54,  7.29it/s][A[A

2330it [04:55,  7.65it/s][A[A

2331it [04:55,  7.18it/s][A[A

2333it [04

2667it [05:38,  8.01it/s][A[A

2668it [05:38,  7.23it/s][A[A

2670it [05:38,  7.47it/s][A[A

2671it [05:38,  6.91it/s][A[A

2673it [05:39,  7.17it/s][A[A

2674it [05:39,  6.70it/s][A[A

2676it [05:39,  7.10it/s][A[A

2677it [05:39,  6.68it/s][A[A

2679it [05:40,  7.26it/s][A[A

2681it [05:40,  7.71it/s][A[A

2682it [05:40,  7.56it/s][A[A

2684it [05:40,  8.03it/s][A[A

2685it [05:40,  7.72it/s][A[A

2687it [05:40,  8.12it/s][A[A

2688it [05:41,  7.81it/s][A[A

2690it [05:41,  8.02it/s][A[A

2691it [05:41,  7.39it/s][A[A

2693it [05:41,  7.67it/s][A[A

2694it [05:41,  7.38it/s][A[A

2696it [05:42,  7.68it/s][A[A

2697it [05:42,  7.20it/s][A[A

2699it [05:42,  7.48it/s][A[A

2701it [05:42,  7.67it/s][A[A

2702it [05:42,  7.09it/s][A[A

2704it [05:43,  7.32it/s][A[A

2705it [05:43,  6.56it/s][A[A

2707it [05:43,  7.01it/s][A[A

2708it [05:43,  6.67it/s][A[A

2710it [05:44,  7.02it/s][A[A

2711it [05:44,  6.65it/s][A[A

2713it [05

3044it [06:26,  8.40it/s][A[A

3045it [06:26,  7.96it/s][A[A

3047it [06:26,  8.14it/s][A[A

3048it [06:27,  7.50it/s][A[A

3050it [06:27,  7.83it/s][A[A

3051it [06:27,  7.29it/s][A[A

3053it [06:27,  7.63it/s][A[A

3054it [06:27,  7.05it/s][A[A

3056it [06:28,  7.35it/s][A[A

3057it [06:28,  6.93it/s][A[A

3059it [06:28,  7.25it/s][A[A

3060it [06:28,  6.83it/s][A[A

3062it [06:28,  7.11it/s][A[A

3063it [06:29,  6.80it/s][A[A

3065it [06:29,  7.12it/s][A[A

3067it [06:29,  7.28it/s][A[A

3068it [06:29,  6.85it/s][A[A

3070it [06:29,  7.23it/s][A[A

3071it [06:30,  7.10it/s][A[A

3073it [06:30,  7.44it/s][A[A

3074it [06:30,  7.32it/s][A[A

3076it [06:30,  7.78it/s][A[A

3077it [06:30,  7.59it/s][A[A

3079it [06:31,  8.04it/s][A[A

3080it [06:31,  7.76it/s][A[A

3082it [06:31,  8.10it/s][A[A

3083it [06:31,  7.72it/s][A[A

3085it [06:31,  8.05it/s][A[A

3087it [06:32,  8.36it/s][A[A

3088it [06:32,  7.92it/s][A[A

3090it [06

3420it [07:14,  7.37it/s][A[A

3421it [07:14,  7.83it/s][A[A

3422it [07:14,  7.30it/s][A[A

3423it [07:15,  6.85it/s][A[A

3425it [07:15,  7.11it/s][A[A

3426it [07:15,  6.65it/s][A[A

3428it [07:15,  6.99it/s][A[A

3429it [07:15,  6.70it/s][A[A

3431it [07:16,  7.10it/s][A[A

3432it [07:16,  6.67it/s][A[A

3434it [07:16,  7.06it/s][A[A

3435it [07:16,  6.65it/s][A[A

3437it [07:16,  7.04it/s][A[A

3439it [07:17,  7.55it/s][A[A

3440it [07:17,  7.33it/s][A[A

3442it [07:17,  7.82it/s][A[A

3443it [07:17,  7.57it/s][A[A

3445it [07:17,  7.82it/s][A[A

3446it [07:18,  7.55it/s][A[A

3448it [07:18,  7.99it/s][A[A

3449it [07:18,  7.63it/s][A[A

3451it [07:18,  7.97it/s][A[A

3452it [07:18,  7.64it/s][A[A

3454it [07:18,  8.03it/s][A[A

3455it [07:19,  7.74it/s][A[A

3457it [07:19,  8.12it/s][A[A

3459it [07:19,  8.42it/s][A[A

3460it [07:19,  7.80it/s][A[A

3462it [07:19,  8.12it/s][A[A

3463it [07:20,  7.44it/s][A[A

3465it [07

3792it [08:02,  6.56it/s][A[A

3794it [08:02,  6.88it/s][A[A

3795it [08:02,  6.49it/s][A[A

3797it [08:02,  6.87it/s][A[A

3799it [08:03,  7.19it/s][A[A

3800it [08:03,  6.90it/s][A[A

3802it [08:03,  7.38it/s][A[A

3803it [08:03,  7.22it/s][A[A

3805it [08:04,  7.54it/s][A[A

3806it [08:04,  7.36it/s][A[A

3808it [08:04,  7.76it/s][A[A

3809it [08:04,  7.49it/s][A[A

3811it [08:04,  7.85it/s][A[A

3812it [08:04,  7.44it/s][A[A

3814it [08:05,  7.83it/s][A[A

3815it [08:05,  7.47it/s][A[A

3817it [08:05,  7.93it/s][A[A

3819it [08:05,  8.26it/s][A[A

3820it [08:05,  7.85it/s][A[A

3822it [08:06,  8.24it/s][A[A

3823it [08:06,  7.86it/s][A[A

3825it [08:06,  8.18it/s][A[A

3826it [08:06,  7.39it/s][A[A

3828it [08:06,  7.53it/s][A[A

3829it [08:07,  6.90it/s][A[A

3831it [08:07,  7.15it/s][A[A

3832it [08:07,  6.71it/s][A[A

3834it [08:07,  7.26it/s][A[A

3835it [08:07,  7.19it/s][A[A

3837it [08:08,  7.70it/s][A[A

3838it [08

KeyboardInterrupt: 