In [1]:
import os
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials

In [2]:
scope = ['https://spreadsheets.google.com/feeds']

In [3]:
secrets = "/Users/jeriwieringa/Dissertation/drafts/code/secrets/dissertation-881847769b13.json"

In [4]:
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)

In [5]:
gc = gspread.authorize(credentials)

In [6]:
dts = gc.open('Topics').sheet1
frame = pd.DataFrame(dts.get_all_records())

In [7]:
frame

Unnamed: 0,endYear,initialPubLocation,periodicalTitle,startYear,title,topic
0,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education
1,1900,"Oakland, CA",American Sentinel,1886,AmSn,Religious Liberty
2,1919,"Washington, D.C.",Advent Review and Sabbath Herald,1909,ARAI,Denominational
3,1920,"Washington, D.C.",Christian Education,1909,CE,Education
4,1920,"Academia, OH",Welcome Visitor (Columbia Union Visitor),1901,CUV,Regional
5,1899,"Battle Creek, MI",Christian Educator,1897,EDU,Education
6,1918,"Battle Creek, MI",General Conference Bulletin,1863,GCB,Denominational
7,1920,"Yazoo City, MS",Gospel Herald,1898,GH,Regional
8,1899,"Battle Creek, MI",Gospel of Health,1897,GOH,Health
9,1888,"Battle Creek, MI",Gospel Sickle,1886,GS,Missions


In [8]:
final_dirs = "/Users/jeriwieringa/Dissertation/drafts/data/2017-03-24-final-correction-files.txt"

In [9]:
with open(final_dirs) as f:
    content = [x.strip() for x in f.readlines()] 

In [10]:
content

['ADV/correction15',
 'AmSn/correction9',
 'ARAI/correction7',
 'CE/correction9',
 'CUV/correction9',
 'EDU/correction9',
 'GCB/correction9',
 'GH/correction8',
 'GoH/correction9',
 'GS/correction7',
 'HM/correction9',
 'HR/correction9',
 'IR/correction9',
 'LB/correction9',
 'LH/correction9',
 'LibM/correction10',
 'LUH/correction7',
 'NMN/correction8',
 'PHJ/correction9',
 'PTAR/correction7',
 'PUR/correction8',
 'RH1850-1889/correction8',
 'RH1890-1920/correction9',
 'Sligo/correction8',
 'SOL/correction6',
 'ST/correction9',
 'SUW/correction8',
 'TCOG/correction6',
 'TMM/correction6',
 'WMH/correction8',
 'YI/correction10']

In [11]:
text_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/"

In [12]:
counts = {}
for each in content:
    dir_name = os.path.join(text_dir, each)
    pref = each.split('/')[0]
    pages = [name for name in os.listdir(dir_name)]
    counts[pref] = len(pages)

In [13]:
df = pd.DataFrame.from_dict(counts, orient='index')

In [14]:
df = df.rename({'RH1850-1889': 'RH', 'RH1890-1920': 'RH'})

In [15]:
df = df.groupby(df.index).sum()

In [16]:
df.columns = ['pages']

In [17]:
df.index.name = 'title'

In [18]:
df = df.reset_index()

In [19]:
joined_frames = pd.merge(frame, df, on='title')

In [20]:
joined_frames

Unnamed: 0,endYear,initialPubLocation,periodicalTitle,startYear,title,topic,pages
0,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,2642
1,1900,"Oakland, CA",American Sentinel,1886,AmSn,Religious Liberty,6702
2,1919,"Washington, D.C.",Advent Review and Sabbath Herald,1909,ARAI,Denominational,96
3,1920,"Washington, D.C.",Christian Education,1909,CE,Education,3896
4,1920,"Academia, OH",Welcome Visitor (Columbia Union Visitor),1901,CUV,Regional,6250
5,1899,"Battle Creek, MI",Christian Educator,1897,EDU,Education,474
6,1918,"Battle Creek, MI",General Conference Bulletin,1863,GCB,Denominational,4524
7,1920,"Yazoo City, MS",Gospel Herald,1898,GH,Regional,2023
8,1888,"Battle Creek, MI",Gospel Sickle,1886,GS,Missions,560
9,1897,"Battle Creek, MI",Home Missionary,1889,HM,Missions,2698


In [21]:
from bokeh.charts import Bar, show
from bokeh.charts import defaults
from bokeh.palettes import viridis
from bokeh.models import Axis
from bokeh.io import output_notebook

In [22]:
output_notebook()

In [23]:
defaults.width = 850
defaults.height = 950

In [24]:
p = Bar(joined_frames, 
        'topic', 
        values='pages',
        agg='sum',
        stack='title', 
        palette=viridis(30), 
        title="Total Pages by Topic")

In [25]:
p.left[0].formatter.use_scientific = False


In [26]:
p.legend.location = "top_right"

In [27]:
show(p)