In [12]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [121]:
%autoreload
import pha
import pha.htmltools
# Open the archive via pha.Archive.default_location() and print it so the
# directory actually in use is visible in the cell output.
archive = pha.Archive.default_location()
print(archive)

<Archive at '/Users/ianbicking/src/personal-history-archive'>


In [122]:
# Load the histories returned by archive.histories_with_page() and report how many there are.
# NOTE(review): presumably these are only the histories that have a stored page
# (later cells read history.page.lxml on each one) — confirm against pha.
histories = archive.histories_with_page()
print(len(histories))

3698


In [123]:
# Sanity check: display the first history.
histories[0]

<History http://talkingpointsmemo.com/ #visits=292>

In [105]:
import re
from nltk.stem import PorterStemmer

# Boundary inside a camelCase name: a lowercase letter directly followed by an
# uppercase one. Both letters are captured so a separator can go between them.
mixed_regex = re.compile(r"([a-z])([A-Z])")
# Any single character that is not a letter a-z in either case; this includes
# digits, punctuation and whitespace.
non_char_regex = re.compile(r"[^a-z]", flags=re.IGNORECASE)
# One shared NLTK Porter stemmer instance for the helpers below.
stemmer = PorterStemmer()

def wordify(c):
    """Split a CSS class name into lowercase, space-separated words.

    Word boundaries are camelCase transitions (``fooBar`` -> ``foo bar``),
    hyphens, underscores and existing whitespace. Inside each word, every
    character that ``non_char_regex`` matches (anything that is not a letter)
    is dropped; words left empty by that are discarded.

    Returns the remaining words joined by single spaces (``""`` if none remain).
    """
    # Break camelCase by inserting a space between the lowercase letter and the
    # uppercase letter that follows it.
    c = mixed_regex.sub(r"\1 \2", c)
    c = c.replace("-", " ").replace("_", " ")
    # Strip non-letters per word, after splitting. Running the substitution
    # over the whole string also deleted the spaces inserted above, which glued
    # every word back together ("login-required" came out as "loginrequired").
    words = (non_char_regex.sub("", word).lower() for word in c.split())
    return " ".join(word for word in words if word)

def stem_words(c):
    """Stem every whitespace-separated word of ``c`` with the shared ``stemmer``.

    Returns the stemmed words joined by single spaces.
    """
    stemmed = map(stemmer.stem, c.split())
    return " ".join(stemmed)

def sort_words(c):
    """Return the whitespace-separated words of ``c`` in sorted order.

    Words are compared as plain strings and re-joined with single spaces.
    """
    words = c.split()
    words.sort()
    return " ".join(words)


In [124]:
from collections import Counter

def count_classes(doc):
    """Tally the normalized class phrases found in ``doc``.

    Every element selected by ``doc.cssselect("*[class]")`` is handed to
    ``pha.htmltools.normalize_classes``, and each phrase it yields is counted
    once per occurrence.

    Returns a ``Counter`` mapping phrase -> number of occurrences in ``doc``.
    """
    return Counter(
        phrase
        for element in doc.cssselect("*[class]")
        for phrase in pha.htmltools.normalize_classes(element)
    )

In [125]:
# base_counter: total occurrences of each phrase across all pages.
# by_doc: number of pages in which each phrase appears at least once.
base_counter, by_doc = Counter(), Counter()
for history in histories:
    page_counts = count_classes(history.page.lxml)
    # Feeding only the keys adds exactly 1 per phrase for this page.
    by_doc.update(page_counts.keys())
    base_counter.update(page_counts)

In [108]:
# Number of distinct phrases in each counter. Both counters are fed from the
# same per-document counts, so they hold the same keys and these lengths match.
len(base_counter), len(by_doc)

(162660, 162660)

In [110]:
# Show the 20 highest-count phrases of each counter, in ascending order of count.
for label, counter in (("Total counts:", base_counter), ("By document:", by_doc)):
    print(label, sorted(counter.items(), key=lambda pair: pair[1])[-20:])

Total counts: [('author', 67643), ('noncollaps', 71266), ('mayblank', 71672), ('thing', 74249), ('reportform', 74249), ('child', 74249), ('entri', 74270), ('parent', 74387), ('flatlist', 74999), ('taglin', 74999), ('gb', 78481), ('button', 90534), ('bylink', 99740), ('arrow', 121432), ('noctrlf', 147611), ('clearleft', 148501), ('score', 178709), ('unvot', 194032), ('accessrequir', 298519), ('loginrequir', 357697)]
By document: [('bottom', 844), ('separ', 873), ('option', 892), ('ad', 905), ('activ', 915), ('link', 937), ('comment', 961), ('hidden', 966), ('dropdown', 999), ('footer', 1031), ('author', 1049), ('button', 1067), ('titl', 1087), ('col', 1120), ('md', 1123), ('content', 1133), ('btn', 1161), ('contain', 1168), ('icon', 1233), ('select', 1242)]


In [126]:
# Flat list of every normalized class phrase, in page order then element order
# (duplicates kept).
all_classes = [
    phrase
    for history in histories
    for el in history.page.lxml.cssselect("*[class]")
    for phrase in pha.htmltools.normalize_classes(el)
]
# The second figure is the length of the space-joined text, i.e. the size of the dump.
print("#:", len(all_classes), "bytes:", len(" ".join(all_classes)))

#: 7625836 bytes: 76143784


In [127]:
# Dump every collected phrase, space-separated, into a single text file.
# The encoding is pinned so the output does not depend on the platform's
# default locale encoding.
with open("all-classes.txt", "w", encoding="utf-8") as fp:
    fp.write(" ".join(all_classes))