In [1]:
%load_ext autoreload

In [2]:
# Bare %autoreload triggers a one-off reload of imported modules (requires the
# autoreload extension to be loaded already).
%autoreload
import pha
import pha.htmltools
# Open the archive at pha's default location; printing it shows the on-disk
# path the archive was opened from.
archive = pha.Archive.default_location()
print(archive)

<Archive at '/Users/ianbicking/src/personal-history-archive'>


In [3]:
# Collection of history entries used by every later cell.
# NOTE(review): presumably only histories that have a stored page are
# returned -- confirm against pha.Archive.histories_with_page.
histories = archive.histories_with_page()
print(len(histories))

13049


In [6]:
# Spot-check a single entry; the bare expression displays its repr
# (URL and visit count).
histories[2]

<History https://news.ycombinator.com/ #visits=356>

In [7]:
from collections import Counter

def count_classes(doc):
    """Tally the normalized class phrases found in ``doc``.

    Every element matching the CSS selector ``*[class]`` is passed to
    ``pha.htmltools.normalize_classes``; each phrase it yields counts once.

    Args:
        doc: An object exposing ``cssselect`` (the notebook passes
            ``history.page.lxml``).

    Returns:
        A ``Counter`` mapping each phrase to its number of occurrences
        in the document.
    """
    return Counter(
        phrase
        for element in doc.cssselect("*[class]")
        for phrase in pha.htmltools.normalize_classes(element)
    )

In [8]:
# Two tallies over the whole archive:
#   base_counter -- phrase -> total occurrences summed over every page
#   by_doc       -- phrase -> number of pages on which the phrase appears
base_counter, by_doc = Counter(), Counter()
for history in histories:
    page_counts = count_classes(history.page.lxml)
    base_counter.update(page_counts)
    # Iterating a Counter yields each distinct phrase once, so this adds
    # exactly 1 per page for every phrase present on it.
    by_doc.update(iter(page_counts))

In [9]:
# Number of distinct phrases in each tally. Both counters are fed from the
# same per-page key sets, so the two lengths are equal by construction.
len(base_counter), len(by_doc)

(280151, 280151)

In [10]:
# Show the 20 highest-count phrases of each tally, in ascending order of count
# (the most frequent phrase is printed last).
for label, counter in (("Total counts:", base_counter), ("By document:", by_doc)):
    ranked = sorted(counter.items(), key=lambda pair: pair[1])
    print(label, ranked[-20:])

Total counts: [('blank-may', 115345), ('noncollaps', 122856), ('reportform', 127379), ('thing', 127395), ('child', 127401), ('entri', 127438), ('parent', 127660), ('flat-list', 128492), ('taglin', 128495), ('-gb', 143091), ('button', 163053), ('bylink', 169683), ('arrow', 205311), ('scope-style', 215843), ('ctrl-f-no', 253178), ('clearleft', 254758), ('score', 299354), ('unvot', 328883), ('access-requir', 503082), ('login-requir', 602254)]
By document: [('clear', 1513), ('js', 1548), ('hidden', 1553), ('undefin', 1604), ('comment', 1707), ('md', 1751), ('col', 1752), ('link', 1784), ('activ', 1858), ('titl', 1948), ('author', 2014), ('dropdown', 2113), ('footer', 2136), ('button', 2155), ('select', 2354), ('fit-shrink-to', 2396), ('btn', 2534), ('contain', 2539), ('icon', 2632), ('content', 3173)]


## Prepare classes to be vectorized

This creates one long file containing the stemmed class names from every document, concatenated and separated by spaces. The file is suitable as a training corpus for different embedding models (which map each class name to a vector of floats):

In [14]:
%autoreload
import pha.htmltools
import random
shuffled_histories = list(histories)
random.shuffle(shuffled_histories)
all_classes = []
for history in shuffled_histories:
    for el in history.page.lxml.cssselect("*[class]"):
        all_classes.extend(pha.htmltools.normalize_classes(el, shuffle=True))
print("#:", len(all_classes), "Mb:", len(" ".join(all_classes)) // 1000000)

#: 16318017 Mb: 170


In [15]:
# Write the corpus as a single space-separated line. The encoding is pinned to
# UTF-8 so the file does not depend on the platform's locale default and
# non-ASCII class names cannot raise UnicodeEncodeError.
with open("all-classes.txt", "w", encoding="utf-8") as fp:
    fp.write(" ".join(all_classes))