# Experiment 0 Annotator

This notebook annotates each passage of the text with the number of times it has been quoted.

In [1]:
import re
from ast import literal_eval
from html import escape as html_escape

import nltk
import numpy as np
import pandas as pd
%matplotlib inline
from colour import Color
from IPython.core.display import HTML
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]

In [2]:
# Load the match log. Later cells read its 'Text A Length',
# 'Locations in A' and 'Text B' columns.
df = pd.read_csv('e0a/log.txt')

In [3]:
# Adapted from text-matcher
class Text:
    """
    A text file exposing its raw contents and a lowercased token list.

    Tokenizing (reading `tokens` or calling `tokenize`) also sets two
    attributes on the instance:
      - `length`: end offset of the last token matched in the text
        (0 when the text contains no tokens).
      - `spans`: (start, end) character offsets of the returned tokens.
    """

    def __init__(self, filename):
        # Only the path is stored; the file is read on each access to `text`.
        self.filename = filename

    @property
    def text(self):
        """ Reads the file in memory. """
        # The context manager closes the handle as soon as the read finishes.
        with open(self.filename, encoding='utf-8', errors='ignore') as f:
            return f.read()

    def tokenize(self, removeStopwords=True):
        """
        Tokenizes the text, breaking it up into words, removing punctuation.

        Parameters:
            removeStopwords: when True (the default), tokens found in NLTK's
                English stopword list are dropped, together with their spans.

        Returns:
            The list of lowercased tokens. `self.spans` holds the matching
            character spans and `self.length` the end offset of the last
            token found before any stopword filtering.
        """
        # Raw string: the pattern contains regex escapes (\w).
        tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]\w+'?\w*")  # A custom regex tokenizer.
        text = self.text  # read the file once; tokens are derived from the spans
        spans = list(tokenizer.span_tokenize(text))
        # End offset of the last token; 0 for a text without any token.
        self.length = spans[-1][-1] if spans else 0
        tokens = [text[start:end].lower() for start, end in spans]
        if not removeStopwords:
            self.spans = spans
            return tokens
        stopwords = set(nltk.corpus.stopwords.words('english'))  # set for O(1) lookups
        kept = [(token, tokenSpan) for token, tokenSpan in zip(tokens, spans)
                if token not in stopwords]
        self.spans = [tokenSpan for _, tokenSpan in kept]
        return [token for token, _ in kept]

    @property
    def tokens(self, removeStopwords=True):
        """
        Stopword-filtered tokens of the text.

        Property access cannot pass arguments, so `removeStopwords` always
        takes its default here; call `tokenize(removeStopwords=False)` to
        keep stopwords.
        """
        return self.tokenize(removeStopwords)

In [4]:
# Wrap the novel's text file. Nothing is read yet; the contents are loaded
# whenever `mm.text` is accessed.
mm = Text('middlemarch.txt')

In [5]:
# Get the size of the text.
textALength = df['Text A Length'][0]

# Make an empty array the size of the text: one integer counter per
# character offset. The builtin `int` is used as the dtype because the
# `np.int` alias no longer exists in current NumPy releases.
tally = np.zeros(textALength, dtype=int)

In [6]:
# Each cell of 'Locations in A' is the string form of a Python list;
# parse every cell back into a real list.
locations = df['Locations in A'].apply(literal_eval)

In [7]:
# Count, for every character offset of the novel, how many quoted
# ranges cover it.
for articleRanges in locations:
    for quoteRange in articleRanges:
        first, last = quoteRange[0], quoteRange[1]
        # The end offset is treated as inclusive, hence the +1.
        for offset in range(first, last + 1):
            tally[offset] += 1

In [8]:
# Build a blue-to-red gradient with one step per possible tally value
# (0 through the maximum), then keep the hex form of each step.
# Let's hope there aren't too many.
gradient = Color("blue").range_to(Color("red"), tally.max()+1)
colors = list(gradient)
colorList = [shade.get_hex() for shade in colors]
len(colorList)

15

In [9]:
# One CSS rule per tally value: class .c-N carries the text color used
# for characters whose tally is N.
colorRules = [
    ".c-%s { color: %s; }\n" % (level, hexColor)
    for level, hexColor in zip(range(0, tally.max()+1), colorList)
]
colorCSS = "".join(colorRules)
colorCSS

'.c-0 { color: #00f; }\n.c-1 { color: #0049ff; }\n.c-2 { color: #0092ff; }\n.c-3 { color: #00dbff; }\n.c-4 { color: #00ffdb; }\n.c-5 { color: #00ff92; }\n.c-6 { color: #00ff49; }\n.c-7 { color: #0f0; }\n.c-8 { color: #49ff00; }\n.c-9 { color: #92ff00; }\n.c-10 { color: #dbff00; }\n.c-11 { color: #ffdb00; }\n.c-12 { color: #ff9200; }\n.c-13 { color: #ff4900; }\n.c-14 { color: #f00; }\n'

In [10]:
# Number of navigation blocks / sections the text is divided into.
n = 20

# Character offsets at which each of the n sections begins. These are the
# left edges of the n equal-width bins that np.histogram(..., bins=n,
# range=(0, textALength)) uses further down, so anchor b-i sits at the start
# of block i. endpoint=False matters: with the endpoint included the spacing
# would be textALength/(n-1) instead of textALength/n, and the last
# checkpoint would equal textALength -- an offset no character has, so its
# anchor would never be written.
checkpoints = np.linspace(0, textALength, n, endpoint=False).round()
checkpoints = [int(point) for point in checkpoints]

In [11]:
def span(val):
    """Return the opening tag of a span carrying the color class for tally value `val`."""
    return '<span class="c-%s">' % val

# Map checkpoint offset -> block number for constant-time lookup per
# character. setdefault keeps the first block number for a repeated offset,
# which is what list.index() would return.
checkpointBlocks = {}
for blockNumber, point in enumerate(checkpoints):
    checkpointBlocks.setdefault(point, blockNumber)

# Collect the markup in pieces and join once at the end.
pieces = []
previousVal = None
for i, (val, char) in enumerate(zip(tally, mm.text)):
    if previousVal is None:
        # First character.
        pieces.append(span(val))
    elif val != previousVal:
        # The tally changed: close the current span and open one for the new value.
        pieces.append('</span>' + span(val))
    if i in checkpointBlocks:
        # Anchor targeted by the navigation links (href="#b-N").
        pieces.append('<a name="b-%s"></a>' % checkpointBlocks[i])
    # Escape &, < and > so characters of the text cannot be read as markup.
    pieces.append(html_escape(char, quote=False))
    previousVal = val
if pieces:
    # Close the span that is still open after the last character.
    pieces.append('</span>')
out = ''.join(pieces)

In [12]:
# Preview the first 2000 characters of the annotated markup.
out[:2000]

'<span class="c-0"><a name="b-0"></a>\ufeff\nMiddlemarch\n\n\nBy\n\nGeorge Eliot\n\n\n\nPRELUDE\n\n\nWho that </span><span class="c-4">cares much to know </span><span class="c-3">the history of man, and how the mysterious\nmixture behaves under the varying experiments of Time, has not dwelt,\nat least briefly, on the life </span><span class="c-1">of Saint Theresa, has not smiled with\nsome gentleness at the thought of the little girl walking forth one\nmorning hand-in-hand with her still smaller brother, to go and seek\nmartyrdom in the country of the Moors?  Out they </span><span class="c-2">toddled </span><span class="c-1">from rugged\nAvila, wide-eyed and </span><span class="c-2">helpless-looking as two fawns, but with human\n</span><span class="c-1">hearts, already </span><span class="c-2">beating to a national idea; until </span><span class="c-3">domestic reality met\nthem in the shape of uncles, and turned them back </span><span class="c-1">from their great\nresolve.  That child-

In [13]:
# Get dates
def getDate(filename):
    """
    Extract dates from filenames.

    Looks for the first run of exactly four digits enclosed by underscores
    (e.g. "_1987_") and returns it as an int. Returns None when the filename
    contains no such run, or when `filename` is not a string (for instance a
    missing value in the column this function is applied to).
    """
    if not isinstance(filename, str):
        return None
    # Raw string: the pattern contains a regex escape (\d).
    m = re.search(r'_(\d{4})_', filename)
    if m is None:
        return None
    return int(m.group(1))

# Year parsed from each 'Text B' filename, and the decade it falls in.
# Rows without a parsable year get a missing Decade; they are dropped below
# by the `decade in decades` filter.
df['Date'] = df['Text B'].apply(getDate)
df['Decade'] = df['Date'] - (df['Date'] % 10)

# Make a list of valid decades.
decades = np.arange(1930, 2020, 10)

# Make a dictionary of decades.
# Values are a list of locations.
# The per-row list gets its own name (rowLocations) so that the `locations`
# variable defined in an earlier cell is not overwritten by this cell.
decadeDict = {}
parsedLocations = df['Locations in A'].apply(literal_eval)
for decade, rowLocations in zip(df['Decade'], parsedLocations):
    decadeDict.setdefault(decade, []).extend(rowLocations)

# Grab the beginnings of quotes.
decadeStarts = {decade: [item[0] for item in loc] for decade, loc in decadeDict.items()}

# For every valid decade, count the quote starts falling in each of the
# n equal-width sections of the text.
decadesBinned = {decade:
                 np.histogram(starts, bins=n, range=(0, textALength))[0]
                 for decade, starts in decadeStarts.items() if decade in decades}

# One row per decade, one column per section.
decadesDF = pd.DataFrame(decadesBinned).T
decadesDF


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
1950.0,8,2,0,0,5,4,6,1,3,0,3,7,0,0,5,0,0,5,24,27
1960.0,7,2,1,10,11,9,6,1,2,0,1,7,0,1,0,2,4,6,23,21
1970.0,54,20,3,18,16,11,20,1,9,4,7,6,5,5,5,2,1,9,12,20
1980.0,50,13,17,38,27,10,18,5,11,14,8,6,7,7,3,4,0,11,23,21
1990.0,51,20,23,50,49,7,13,12,19,11,28,12,6,5,9,8,2,12,28,37
2000.0,59,21,13,17,19,26,18,27,9,8,12,13,11,6,8,3,8,9,12,16
2010.0,53,13,3,12,15,15,10,6,7,3,4,2,5,11,3,11,2,11,13,23


In [14]:
# Total the counts of each section over all decades, then rescale so the
# busiest section equals 1.
sectionTotals = decadesDF.sum()
normalizedBlocks = sectionTotals / sectionTotals.max()

# Stretch onto the 0..tally.max() scale that the CSS color classes use.
normalizedBlocks = (normalizedBlocks * tally.max()).round()
normalizedBlocks

0     14.0
1      5.0
2      3.0
3      7.0
4      7.0
5      4.0
6      5.0
7      3.0
8      3.0
9      2.0
10     3.0
11     3.0
12     2.0
13     2.0
14     2.0
15     1.0
16     1.0
17     3.0
18     7.0
19     8.0
dtype: float64

In [15]:
# One linked square per section: the link jumps to anchor b-<section number>,
# and the square's class b-<level> selects its background color.
blockLinks = [
    '<a href="#b-%s"><div class="block b-%s"></div></a>' % (i, int(block))
    for i, block in enumerate(normalizedBlocks)
]
blockHTML = '<section id="blocks">' + ''.join(blockLinks) + "</section>"
blockHTML

'<section id="blocks"><a href="#b-0"><div class="block b-14"></div></a><a href="#b-1"><div class="block b-5"></div></a><a href="#b-2"><div class="block b-3"></div></a><a href="#b-3"><div class="block b-7"></div></a><a href="#b-4"><div class="block b-7"></div></a><a href="#b-5"><div class="block b-4"></div></a><a href="#b-6"><div class="block b-5"></div></a><a href="#b-7"><div class="block b-3"></div></a><a href="#b-8"><div class="block b-3"></div></a><a href="#b-9"><div class="block b-2"></div></a><a href="#b-10"><div class="block b-3"></div></a><a href="#b-11"><div class="block b-3"></div></a><a href="#b-12"><div class="block b-2"></div></a><a href="#b-13"><div class="block b-2"></div></a><a href="#b-14"><div class="block b-2"></div></a><a href="#b-15"><div class="block b-1"></div></a><a href="#b-16"><div class="block b-1"></div></a><a href="#b-17"><div class="block b-3"></div></a><a href="#b-18"><div class="block b-7"></div></a><a href="#b-19"><div class="block b-8"></div></a></secti

In [16]:
blockCSS = """
#blocks { position: fixed; right: 1em; }
.block { width: 30px; height: 30px; }
"""

for i, color in zip(range(0, tally.max()+1), colorList): 
    blockCSS += '.b-%s { background-color: %s; }\n' % (i, color)
colorCSS += blockCSS

In [17]:
# Show the combined stylesheet: the text-color rules followed by the block
# rules appended in the previous cell.
colorCSS

'.c-0 { color: #00f; }\n.c-1 { color: #0049ff; }\n.c-2 { color: #0092ff; }\n.c-3 { color: #00dbff; }\n.c-4 { color: #00ffdb; }\n.c-5 { color: #00ff92; }\n.c-6 { color: #00ff49; }\n.c-7 { color: #0f0; }\n.c-8 { color: #49ff00; }\n.c-9 { color: #92ff00; }\n.c-10 { color: #dbff00; }\n.c-11 { color: #ffdb00; }\n.c-12 { color: #ff9200; }\n.c-13 { color: #ff4900; }\n.c-14 { color: #f00; }\n\n#blocks { position: fixed; right: 1em; }\n.block { width: 30px; height: 30px; }\n.b-0 { background-color: #00f; }\n.b-1 { background-color: #0049ff; }\n.b-2 { background-color: #0092ff; }\n.b-3 { background-color: #00dbff; }\n.b-4 { background-color: #00ffdb; }\n.b-5 { background-color: #00ff92; }\n.b-6 { background-color: #00ff49; }\n.b-7 { background-color: #0f0; }\n.b-8 { background-color: #49ff00; }\n.b-9 { background-color: #92ff00; }\n.b-10 { background-color: #dbff00; }\n.b-11 { background-color: #ffdb00; }\n.b-12 { background-color: #ff9200; }\n.b-13 { background-color: #ff4900; }\n.b-14 { backgr

In [18]:
html = """<!DOCTYPE html><html><head><style>%s</style></head>
<body>%s<pre>%s</pre></body></html>
""" % (colorCSS, blockHTML, out)

In [19]:
# Preview the first 1000 characters of the assembled page.
html[:1000]

'<!DOCTYPE html><html><head><style>.c-0 { color: #00f; }\n.c-1 { color: #0049ff; }\n.c-2 { color: #0092ff; }\n.c-3 { color: #00dbff; }\n.c-4 { color: #00ffdb; }\n.c-5 { color: #00ff92; }\n.c-6 { color: #00ff49; }\n.c-7 { color: #0f0; }\n.c-8 { color: #49ff00; }\n.c-9 { color: #92ff00; }\n.c-10 { color: #dbff00; }\n.c-11 { color: #ffdb00; }\n.c-12 { color: #ff9200; }\n.c-13 { color: #ff4900; }\n.c-14 { color: #f00; }\n\n#blocks { position: fixed; right: 1em; }\n.block { width: 30px; height: 30px; }\n.b-0 { background-color: #00f; }\n.b-1 { background-color: #0049ff; }\n.b-2 { background-color: #0092ff; }\n.b-3 { background-color: #00dbff; }\n.b-4 { background-color: #00ffdb; }\n.b-5 { background-color: #00ff92; }\n.b-6 { background-color: #00ff49; }\n.b-7 { background-color: #0f0; }\n.b-8 { background-color: #49ff00; }\n.b-9 { background-color: #92ff00; }\n.b-10 { background-color: #dbff00; }\n.b-11 { background-color: #ffdb00; }\n.b-12 { background-color: #ff9200; }\n.b-13 { background

In [20]:
with open('annotated.html', 'w') as f: 
    f.write(html)
    f.close()