In [33]:
%matplotlib notebook

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from unidecode import unidecode
import re

import glob
import yaml

In [37]:
# Load the page-hit export and re-key it by integer post number.
# Steps, in order:
#   * drop the 'Page Value' column, which is not used below;
#   * drop the rows for the site root ("/"), the tag index ("/tag/") and the
#     NaN-labelled row, none of which correspond to an individual post;
#   * turn the remaining "/<number>/" page labels into ints by stripping the
#     first and last character (assumes every remaining label has that shape).
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
data = (
    pd.read_csv("./pages-hits.csv", index_col="Page")
    .drop(columns="Page Value")
    .drop(index=["/", np.nan, "/tag/"])
    .rename(index=lambda page: int(page[1:-1]))
)


def get_tags(filename):
    """Return the list of tags declared in a post's YAML front matter.

    The front matter is taken to be the lines after the first line of the
    file, up to (not including) the first ``---`` line found at line index 2
    or later. Presumably the first line is the opening ``---`` delimiter of a
    Jekyll-style post; this is not checked.

    Parameters
    ----------
    filename : str
        Path to the post file.

    Returns
    -------
    list
        The value of the ``tags`` entry, or an empty list when the front
        matter is empty, has no ``tags`` key, or has an empty ``tags`` value.

    Raises
    ------
    ValueError
        If no closing ``---`` line is found.
    """
    # Context manager so the file handle is closed promptly; YAML streams are
    # Unicode, so read as UTF-8 rather than the platform default encoding.
    with open(filename, "r", encoding="utf-8") as post_file:
        lines = post_file.readlines()

    # Replace the character class PyYAML's reader rejects with one that also
    # accepts code points U+10000-U+10FFFF, so such characters in a post's
    # front matter do not abort parsing.
    yaml.reader.Reader.NON_PRINTABLE = re.compile(
    u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')

    front_matter = "".join(lines[1:lines.index("---\n", 2)])
    # safe_load: front matter is plain data, so arbitrary object construction
    # is never needed, and yaml.load() without a Loader fails on PyYAML >= 6.
    info = yaml.safe_load(front_matter) or {}
    return info.get('tags') or []

# Build post-id -> tags. The id is whatever sits between the last "-" in the
# path and the first "." after it.
tag_lookup = {}
for post_path in glob.glob("../_posts/*.md"):
    post_id = post_path.split("-")[-1].split('.')[0]
    tag_lookup[post_id] = get_tags(post_path)

# One pass over every post's tags fills both tables:
#   tags     - tag -> 0, a zeroed accumulator that a later cell adds views to
#   tag_uses - tag -> number of posts that carry the tag
tags = {}
tag_uses = {}
for post_tags in tag_lookup.values():
    for tag in post_tags:
        tags.setdefault(tag, 0)
        tag_uses[tag] = tag_uses.get(tag, 0) + 1

In [38]:
# Display the cleaned page-hit table (indexed by integer page number) as a sanity check.
data

Unnamed: 0_level_0,Pageviews,Unique Pageviews,Avg. Time on Page,Entrances,Bounce Rate,% Exit
Page,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,35,22,00:01:36,21,61.90%,45.71%
49,25,21,00:01:11,21,76.19%,80.00%
3,15,9,00:00:37,7,57.14%,33.33%
19,13,13,00:00:00,13,100.00%,100.00%
2,11,5,00:00:40,4,50.00%,45.45%
23,11,10,00:00:13,9,66.67%,63.64%
30,11,9,00:03:42,8,62.50%,72.73%
9,10,10,00:00:42,7,71.43%,70.00%
31,9,6,00:07:22,4,75.00%,55.56%
37,6,5,00:06:49,4,25.00%,33.33%


In [39]:
# Sum each post's pageviews into every tag that post carries.
# The counters are zeroed first so that re-running this cell recomputes the
# totals instead of adding the same views a second time.
tags = dict.fromkeys(tags, 0)
for page, ts in tag_lookup.items():
    page = int(page)  # tag_lookup keys are strings; the hit table is indexed by int
    # Posts absent from the hit table contribute nothing.
    if page in data.index:
        views = data.loc[page, "Pageviews"]
        for t in ts:
            tags[t] += views

In [40]:
# Parallel lists, all in the same tag order: name, summed pageviews, post count.
tag_names = list(tags)
tag_views = [tags[name] for name in tag_names]
tag_occurs = [tag_uses[name] for name in tag_names]

import plotly
from plotly.graph_objs import Scatter, Layout, Bar

plotly.offline.init_notebook_mode(connected=True)

# The trace gets its own name: binding it to `data` would overwrite the
# page-hit DataFrame and break a re-run of the view-accumulation cell.
scatter_trace = Scatter(
    x=tag_occurs,
    y=tag_views,
    text=tag_names,
    mode='markers',
)
fig = plotly.graph_objs.Figure(data=[scatter_trace], layout=Layout(
    title='Tag Popularity and Usage',
    hovermode='closest',
    xaxis=dict(
        title='Tag Use (Number of Posts)',
        ticklen=5,
        zeroline=False,
        gridwidth=2,
    ),
    yaxis=dict(
        # The values are summed from the "Pageviews" column, not from
        # "Unique Pageviews", so the label says pageviews.
        title='Tag Views (Total Pageviews)',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False
))
plotly.offline.iplot(fig)

A small x with a large y means a tag is disproportionately popular (few posts, many views); a large x with a small y means I like a topic but no one else does.

In [41]:
# Bar chart of how many posts use each tag, most-used first.
# "Tag Use" is the number of posts per tag (the quantity the scatter plot
# labels 'Tag Use (Number of Posts)'), so the bars come from `tag_uses`
# rather than from `tags`, which holds summed pageviews.
sorted_tag_uses = sorted(tag_uses.items(), key=lambda item: item[1], reverse=True)

plotly.offline.iplot(plotly.graph_objs.Figure(
    data=[
        Bar(
            x=[name for name, count in sorted_tag_uses],
            y=[count for name, count in sorted_tag_uses],
        )
    ], layout=Layout(
    title='Tag Use',
    hovermode='closest',
    xaxis=dict(
        ticklen=5,
        zeroline=False,
        gridwidth=2,
        dtick=1  # label every bar
    ),
    yaxis=dict(
        title='Tag Frequency',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False
)))

In [42]:
# Bar chart of pageviews per post for each tag, with tags ordered by total
# pageviews (descending).
sorted_tags = sorted(tags.items(), key=lambda x: x[1], reverse=True)

plotly.offline.iplot(plotly.graph_objs.Figure(
    data=[
        Bar(
            x=[name for name, views in sorted_tags],
            # Compute the ratio from the same sorted pairs that supply the
            # x labels, so every bar height belongs to the tag it is drawn
            # under. Every tag in `tags` has at least one use in `tag_uses`,
            # so the division is safe.
            y=[views / tag_uses[name] for name, views in sorted_tags],
        )
    ], layout=Layout(
    title='Tag Popularity',
    hovermode='closest',
    xaxis=dict(
        ticklen=5,
        zeroline=False,
        gridwidth=2,
        dtick=1  # label every bar
    ),
    yaxis=dict(
        # The plotted value is views divided by uses.
        title='Views / Use',
        ticklen=5,
        gridwidth=2,
    ),
    showlegend=False
)))

In [43]:
# Every tag name in sorted order.
sorted(tags)

['3d',
 'BCI',
 'EEG',
 'GABA',
 'GCaMP6',
 'GIS',
 'MRI',
 'OCT',
 'PET',
 'RGC',
 'aav2',
 'accelerometer',
 'algorithms',
 'alzheimers',
 'archaeology',
 'artificial-intelligence',
 'astronomy',
 'audio',
 'automata',
 'autonomy',
 'banjo',
 'behavior',
 'biomimicry',
 'birds',
 'bitcoin',
 'botnet',
 'brain-volume',
 'brainwaves',
 'cake',
 'cancer',
 'cas9',
 'cfg',
 'challenge',
 'circuitry',
 'clickbait',
 'cnn',
 'cognition',
 'common-cold',
 'computation',
 'computed-tomography',
 'computer-vision',
 'connectome',
 'connectomics',
 'contest',
 'crispr',
 'cryptocurrency',
 'cultural-heritage',
 'culture',
 'curves',
 'cv',
 'data-science',
 'deep-brain-stimulation',
 'deep-learning',
 'deep-sea',
 'development',
 'distributed',
 'driving',
 'drug',
 'drum',
 'economics',
 'egypt',
 'ehr',
 'electrical-synapses',
 'electron-microscopy',
 'electrophysiology',
 'em',
 'emoji',
 'epilepsy',
 'evolution',
 'explanation',
 'extrastriate-cortex',
 'eye',
 'fMRI',
 'fcn',
 'finance',
