# Generate font size data visualization

This notebook generates a visualization for quotation data in which each word has font size proportional to the frequency of quotation.

In [20]:
# Probably some of these are irrelevant imports used for earlier visualizations

import sys
print(sys.version)
import pandas as pd
import nltk
import math
import json
from ast import literal_eval
import numpy as np
import re
from IPython.core.display import HTML

3.10.9 (main, Mar  1 2023, 12:33:47) [Clang 14.0.6 ]


In [21]:
# ACTION: copy path to results JSONL file here (filename will probably end "_results_[hyperparameters].jsonl")

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Joyce/1922_Ulysses/Results/Joyce_1922_Ulysses_results_t2-c3-n2-m3-nostops.jsonl"

In [22]:
# Infer naming variables from path

textTitle = startData.rsplit("_", 4)[-3]
publicationYear = startData.rsplit("_", 4)[-4]
authorSurname = startData.rsplit("_", 4)[-5]
authorSurname = authorSurname.rsplit("/", 1)[-1]
hyperparSuffix = startData.rsplit("_", 4)[-1]
hyperparSuffix = f"_{hyperparSuffix[:-6]}"
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nHyperparameters suffix: {hyperparSuffix}\nData directory:{dataDir}")

projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
sourceDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Source"
corpusDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Corpus"
resultsDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Results"

Author surname: Joyce
Publication year: 1922
Text title: Ulysses
Hyperparameters suffix: _t2-c3-n2-m3-nostops
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [23]:
# Load results JSONL file as pandas dataframe

with open(startData) as f: 
    rawMatches = f.readlines()

data = [json.loads(line) for line in rawMatches]
df = pd.DataFrame(data)

In [24]:
# Adapted from text-matcher
class Text: 
    def __init__(self, filename): 
        self.filename = filename
        tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*') # A custom regex tokenizer. 
        spans = list(tokenizer.span_tokenize(self.text))
        # Take note of how many spans there are in the text
        self.length = spans[-1][-1] 
        
    @property
    def text(self):
        """ Reads the file in memory. """
        f = open(self.filename, encoding='utf-8', errors='ignore')
        return f.read() 

    @property
    def tokens(self, removeStopwords=True): 
        """ Tokenizes the text, breaking it up into words, removing punctuation. """
        tokenizer = nltk.RegexpTokenizer('[a-zA-Z]\w+\'?\w*') # A custom regex tokenizer. 
        spans = list(tokenizer.span_tokenize(self.text))
        # Take note of how many spans there are in the text
        self.length = spans[-1][-1] 
        tokens = tokenizer.tokenize(self.text)
        tokens = [ token.lower() for token in tokens ] # make them lowercase
        if not removeStopwords: 
            self.spans = spans
            return tokens
        tokenSpans = list(zip(tokens, spans)) # zip it up
        stopwords = nltk.corpus.stopwords.words('english') # get stopwords
        tokenSpans = [ token for token in tokenSpans if token[0] not in stopwords ] # remove stopwords from zip
        self.spans = [ x[1] for x in tokenSpans ] # unzip; get spans
        return [ x[0] for x in tokenSpans ] # unzip; get tokens

In [25]:
# Load source text

sourceText = f"{sourceDir}/{projectName}_plaintext.txt"

text = Text(sourceText)

In [26]:
# Get the size of the text
textALength = text.length

# Make an empty array the size of the text
tally = np.zeros(textALength, dtype=int)

In [27]:
# Read the matched locations from the results dataset, and literally evaluate them into lists. 

locationsInA = df['Locations in A']

In [28]:
# Tally up every time a letter in the text is quoted. 
for article in locationsInA: 
    for locRange in article: 
         for i in range(locRange[0], min(locRange[1]+1, len(tally))):
                tally[i] += 1

In [29]:
# Make a font size list for all the values in the tally. 

font = list(np.arange(12,(tally.max()+13)))
fontList = font

In [30]:
# Create a CSS Stylesheet for each font value in the map. 
fontCSS = ""
for i, font in zip(range(0, tally.max()+1), fontList): 
    fontCSS += ".c-%s { font-size: %spx; }" % (i, font)

In [31]:
#n = 50
#
#checkpoints = np.linspace(0, textALength, n).round()
#checkpoints = [int(point) for point in checkpoints]

In [32]:
def span(val): 
    return '<span class="c-%s">' % val

previousVal = None
for i, valChar in enumerate(zip(tally, text.text)):
    val, char = valChar[0], valChar[1]
    if previousVal == None: 
        # First character. 
        out = '<span class="c-%s">' % val
    elif val != previousVal: 
        out += '</span><span class="c-%s">' % val
    out += char
    previousVal = val

In [33]:
html = """<!DOCTYPE html>
<html>
<head>
  <link href="https://fonts.googleapis.com/css?family=Raleway" rel="stylesheet"> 
  <style>
  main { 
      width: 600px; 
  }
  %s
  </style>
  """ % (fontCSS)
html += """
  </head>
  <body><main>%s</main></body></html>
  """ % (out)

In [34]:
# Add html paragraph tags (for text where paragraphing is indicated by an empty line, i.e. two line breaks)

html = re.sub("\n\n", "</p>\n\n<p>", html)

In [35]:
# Where no span class is specified at start of para, reiterate most recent span class

untaggedParas = len(re.findall(r'\n\n<p>([^<])', html))

while untaggedParas != 0:
    oldUntagged = untaggedParas
    html = re.sub(r'(<span class="c-\d">)([^<>]*)</p>\n\n<p>([^<])', r'\1\2</p>\n\n<p>\1\3', html)
    untaggedParas = len(re.findall(r'\n\n<p>([^<])', html))
    print(untaggedParas)
    if untaggedParas == oldUntagged:
        print(f"Done: {untaggedParas} paragraphs couldn't be corrected")
        break
    if untaggedParas == 0:
        print("Done: no untagged paragraphs remaining")
        break

3258
1942
1278
872
623
455
344
263
205
165
136
115
102
91
81
76
71
66
61
57
56
55
54
53
53
Done: 53 paragraphs couldn't be corrected


In [36]:
with open(f"{resultsDir}/{projectName}_fontsizeviz{hyperparSuffix}.html", 'w') as f: 
    f.write(html)
    f.close()