In [2]:
# Load IPython's autoreload extension so the %autoreload magic used in later cells is available.
%load_ext autoreload

In [3]:
# Pick up any on-disk edits to already-imported modules before using them.
%autoreload
import pha
import pha.htmltools
# Open the archive at pha's default location and print its repr as a quick status check.
archive = pha.Archive.default_location()
print(archive)

<Archive at '/Users/ianbicking/src/personal-history-archive' 19596/52625 fetched, 31043 errored>


In [4]:
# Histories returned by histories_with_page(); every later cell reads `.page` on them.
# NOTE(review): presumably only histories whose page was fetched — confirm in pha.Archive.
histories = archive.histories_with_page()
print(len(histories))

14995


In [6]:
# Peek at the first history to see what a History object looks like.
histories[0]

<History https://github.com/mozilla-services/screenshots/pulls #visits=522>

In [7]:
from collections import Counter


def count_classes(doc):
    """Tally the normalized class phrases found in one parsed document.

    Every element of ``doc`` that carries a ``class`` attribute is passed to
    ``pha.htmltools.normalize_classes``; each phrase it yields counts once.

    Args:
        doc: a parsed document exposing ``cssselect``.

    Returns:
        A ``Counter`` mapping each phrase to its number of occurrences.
    """
    class_elements = doc.cssselect("*[class]")
    return Counter(
        phrase
        for element in class_elements
        for phrase in pha.htmltools.normalize_classes(element)
    )

In [8]:
# base_counter: total occurrences of each class phrase across all pages.
# by_doc: number of pages in which each class phrase appears at least once.
base_counter = Counter()
by_doc = Counter()
for history in histories:
    c = count_classes(history.page.lxml)
    base_counter.update(c)
    # Updating with the keys (not the mapping) adds exactly 1 per phrase per page.
    by_doc.update(c.keys())

In [9]:
# Both counters are fed the same phrases, so the two sizes are expected to match.
len(base_counter), len(by_doc)

(280151, 280151)

In [10]:
# Show the 20 largest entries under each measure; the ascending sort puts the biggest last.
print("Total counts:", sorted(base_counter.items(), key=lambda x: x[1])[-20:])
print("By document:", sorted(by_doc.items(), key=lambda x: x[1])[-20:])

Total counts: [('blank-may', 115345), ('noncollaps', 122856), ('reportform', 127379), ('thing', 127395), ('child', 127401), ('entri', 127438), ('parent', 127660), ('flat-list', 128492), ('taglin', 128495), ('-gb', 143091), ('button', 163053), ('bylink', 169683), ('arrow', 205311), ('scope-style', 215843), ('ctrl-f-no', 253178), ('clearleft', 254758), ('score', 299354), ('unvot', 328883), ('access-requir', 503082), ('login-requir', 602254)]
By document: [('clear', 1513), ('js', 1548), ('hidden', 1553), ('undefin', 1604), ('comment', 1707), ('md', 1751), ('col', 1752), ('link', 1784), ('activ', 1858), ('titl', 1948), ('author', 2014), ('dropdown', 2113), ('footer', 2136), ('button', 2155), ('select', 2354), ('fit-shrink-to', 2396), ('btn', 2534), ('contain', 2539), ('icon', 2632), ('content', 3173)]


## Prepare classes to be vectorized

This creates one long corpus containing the concatenated, stemmed class names of every document (elements without a class contribute a `no-class` token). That corpus is a reasonable input for training embedding vectors (a mapping from class names to vectors of floats):

In [52]:
%autoreload
import pha.glovehelper
# Tell the helper where GloVe lives.
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as written.
pha.glovehelper.set_glove_path("/Users/ianbicking/src/personal-history-archive/tmp/GloVe")

In [53]:
%autoreload
import pha.htmltools
import random
shuffled_histories = list(histories)
random.shuffle(shuffled_histories)
all_classes = []
for history in shuffled_histories:
    for el in history.page.lxml.iter():
        classes = pha.htmltools.normalize_classes(el, shuffle=True)
        if classes:
            all_classes.extend(classes)
        else:
            all_classes.append("no-class")
print("#:", len(all_classes), "Mb:", len(" ".join(all_classes)) // 1000000)

#: 23005752 Mb: 232


In [54]:
from pha.glovehelper import vectorize
# Train embeddings over the class-token corpus.
# NOTE(review): 50 is presumably the vector dimensionality — confirm in pha.glovehelper.
class_vectors = vectorize(
    all_classes, 50)
# Print a handful of keys as a sanity check on the result.
print(list(class_vectors.keys())[:10])

['dtlwc-report-t', 'bqe-id-t', 'nj', 'drjof-id-t', '--c-waypoint-waypoint-xsk', 'wi', 'amphtml-i-interfac-video', 'navig-target', 'i', 'aafa-sx']


In [55]:
# Shuffle a fresh copy of the histories for the tag corpus.
tag_shuffled_histories = list(histories)
random.shuffle(tag_shuffled_histories)

# One token per element: its tag name.
all_tags = []
# Iterate the list shuffled just above; the original looped over
# `shuffled_histories`, leaving this shuffle unused.
for history in tag_shuffled_histories:
    all_tags.extend(el.tag for el in history.page.lxml.iter())

# Length of the space-joined corpus, computed without building the joined string.
tag_chars = sum(map(len, all_tags)) + max(len(all_tags) - 1, 0)
print("#:", len(all_tags), "Mb:", tag_chars // 1000000)

#: 15403930 Mb: 60


In [56]:
from collections import Counter

# Tags seen this many times or fewer are dropped from the corpus.
RARE_TAG_MAX_COUNT = 5

tag_counter = Counter(all_tags)
print("Number of tags:", len(tag_counter))

rare_tags = {tag for tag, count in tag_counter.items() if count <= RARE_TAG_MAX_COUNT}
# Filter in one pass. list.remove() would delete only the FIRST occurrence of
# each rare tag (leaving tags seen 2-5 times in the corpus) and would rescan
# the whole list on every call.
all_tags = [tag for tag in all_tags if tag not in rare_tags]
for tag in rare_tags:
    del tag_counter[tag]

removed = len(rare_tags)
print("Removed", removed, "tags, with:", len(tag_counter), "left")

Number of tags: 588
Removed 223 tags, with: 365 left


In [57]:
# Train embeddings over the tag corpus, using a smaller second argument than for classes.
# NOTE(review): 20 is presumably the vector dimensionality — confirm in pha.glovehelper.
tag_vectors = vectorize(
    all_tags, 20)

In [58]:
import json

# Write both embedding tables to a single JSON file. The context manager
# guarantees the file is flushed and closed even if serialization fails.
with open("html-vectors.json", "w") as vectors_file:
    json.dump({"classes": class_vectors, "tags": tag_vectors}, vectors_file)