In [1]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import os
import pandas as pd
import re

In [2]:
# OAuth scope requested for the service account.
scope = ['https://spreadsheets.google.com/feeds']

# Service-account key file. The GSPREAD_KEYFILE environment variable, when set
# to a non-empty value, overrides the original hard-coded local path so the
# notebook can run on another machine without editing this cell.
secrets = os.environ.get("GSPREAD_KEYFILE") or \
    "/Users/jeriwieringa/Dissertation/drafts/code/secrets/dissertation-881847769b13.json"

# Build credentials from the key file and open an authorized gspread client.
credentials = ServiceAccountCredentials.from_json_keyfile_name(secrets, scope)
gc = gspread.authorize(credentials)

In [3]:
# Open the first worksheet of the 'Topics' spreadsheet ...
dts = gc.open('Topics').sheet1
# ... and load all of its records into a DataFrame (one row per record).
frame = pd.DataFrame(dts.get_all_records())

In [4]:
# Display the metadata loaded from the spreadsheet.
frame

Unnamed: 0,endYear,initialPubLocation,periodicalTitle,startYear,title,topic
0,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education
1,1900,"Oakland, CA",American Sentinel,1886,AmSn,Religious Liberty
2,1919,"Washington, D.C.",Advent Review and Sabbath Herald,1909,ARAI,Denominational
3,1920,"Washington, D.C.",Christian Education,1909,CE,Education
4,1920,"Academia, OH",Welcome Visitor (Columbia Union Visitor),1901,CUV,Regional
5,1899,"Battle Creek, MI",Christian Educator,1897,EDU,Education
6,1918,"Battle Creek, MI",General Conference Bulletin,1863,GCB,Denominational
7,1920,"Yazoo City, MS",Gospel Herald,1898,GH,Regional
8,1899,"Battle Creek, MI",Gospel of Health,1897,GOH,Health
9,1888,"Battle Creek, MI",Gospel Sickle,1886,GS,Missions


In [5]:
# Path to the text file that is read line by line to get the list of
# correction directories to process.
final_dirs = "/Users/jeriwieringa/Dissertation/drafts/data/2017-03-24-final-correction-files.txt"

In [6]:
# Read the list of correction directories, one relative path per line.
# Blank lines are skipped: an empty entry joined onto a root directory with
# os.path.join() yields the root itself, which is never a correction directory.
with open(final_dirs) as f:
    stripped_lines = (line.strip() for line in f)
    content = [entry for entry in stripped_lines if entry]

In [7]:
# Display the directory entries read from the list file.
content

['ADV/correction15',
 'AmSn/correction9',
 'ARAI/correction7',
 'CE/correction9',
 'CUV/correction9',
 'EDU/correction9',
 'GCB/correction9',
 'GH/correction8',
 'GoH/correction9',
 'GS/correction7',
 'HM/correction9',
 'HR/correction9',
 'IR/correction9',
 'LB/correction9',
 'LH/correction9',
 'LibM/correction10',
 'LUH/correction7',
 'NMN/correction8',
 'PHJ/correction9',
 'PTAR/correction7',
 'PUR/correction8',
 'RH1850-1889/correction8',
 'RH1890-1920/correction9',
 'Sligo/correction8',
 'SOL/correction6',
 'ST/correction9',
 'SUW/correction8',
 'TCOG/correction6',
 'TMM/correction6',
 'WMH/correction8',
 'YI/correction10']

In [8]:
# Root directory of the corpus; the relative entries in `content` are joined
# onto this path.
text_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/"

In [9]:
# For every correction directory, list the files it contains.
# `filenames` is a list of lists, in the same order as `content`.
# os.listdir() already returns a list, so no copy is needed, and no
# temporary names (in particular `pages`, which is used for a dict in a
# later cell) are left behind in the notebook namespace.
filenames = [os.listdir(os.path.join(text_dir, each)) for each in content]

In [10]:
def extract_title_info(dictionary, page_list):
    """Record the title abbreviation and year encoded in each page file name.

    Only the part of the name before the first '-' is inspected. Its leading
    run of ASCII letters is the title, and the first four characters of its
    first run of digits are the year, e.g.
    ``ADV18981201-V02-01-page1.txt`` -> ``{'title': 'ADV', 'year': '1898'}``.

    Args:
        dictionary: Mapping updated in place. Each page name becomes a key
            whose value is ``{'title': str, 'year': str}``.
        page_list: Iterable of page file names.

    Returns:
        None. Results are written into ``dictionary``.

    Raises:
        ValueError: If a name does not start with letters, or has no digits
            before its first '-'. Entries stored before the offending name
            remain in ``dictionary``.
    """
    # Compile once instead of re-resolving the patterns for every page.
    title_pattern = re.compile("[A-Za-z]+")
    date_pattern = re.compile(r'[0-9]+')

    for page in page_list:
        prefix = page.split('-')[0]

        title = title_pattern.match(prefix)
        dates = date_pattern.search(prefix)

        # Fail with the offending file name rather than an opaque
        # "'NoneType' object has no attribute 'group'".
        if title is None or dates is None:
            raise ValueError(
                "Cannot extract title and year from file name: {!r}".format(page)
            )

        dictionary[page] = {'title': title.group(), 'year': dates.group()[:4]}

In [11]:
# Accumulator filled by extract_title_info(): page file name -> {'title', 'year'}.
pages = {}

In [12]:
# Fold every directory's file list into the shared `pages` mapping.
for each in filenames:
    extract_title_info(pages, each)

In [13]:
# One row per page: the dict keys (file names) become the index and the
# inner dicts supply the 'title' and 'year' columns.
df = pd.DataFrame.from_dict(pages, orient='index')

In [14]:
# Display the per-page title/year table (indexed by file name).
df

Unnamed: 0,title,year
ADV18981201-V02-01-page1.txt,ADV,1898
ADV18981201-V02-01-page10.txt,ADV,1898
ADV18981201-V02-01-page11.txt,ADV,1898
ADV18981201-V02-01-page12.txt,ADV,1898
ADV18981201-V02-01-page13.txt,ADV,1898
ADV18981201-V02-01-page14.txt,ADV,1898
ADV18981201-V02-01-page15.txt,ADV,1898
ADV18981201-V02-01-page16.txt,ADV,1898
ADV18981201-V02-01-page17.txt,ADV,1898
ADV18981201-V02-01-page18.txt,ADV,1898


In [15]:
# Name the file-name index 'docs' (mutates `df` in place).
df.index.name = 'docs'

In [16]:
# Move the index into a regular column and rebind `df` to the new frame.
df = df.reset_index()

In [17]:
# Display the table again, now with the file names in the 'docs' column.
df

Unnamed: 0,docs,title,year
0,ADV18981201-V02-01-page1.txt,ADV,1898
1,ADV18981201-V02-01-page10.txt,ADV,1898
2,ADV18981201-V02-01-page11.txt,ADV,1898
3,ADV18981201-V02-01-page12.txt,ADV,1898
4,ADV18981201-V02-01-page13.txt,ADV,1898
5,ADV18981201-V02-01-page14.txt,ADV,1898
6,ADV18981201-V02-01-page15.txt,ADV,1898
7,ADV18981201-V02-01-page16.txt,ADV,1898
8,ADV18981201-V02-01-page17.txt,ADV,1898
9,ADV18981201-V02-01-page18.txt,ADV,1898


In [18]:
# Count the pages (entries in 'docs') for every title/year combination,
# keeping 'title' and 'year' as ordinary columns.
df2 = df.groupby(["title", "year"], as_index=False)["docs"].count()

In [19]:
# Display the page counts per title and year.
df2

Unnamed: 0,title,year,docs
0,ADV,1898,26
1,ADV,1899,674
2,ADV,1900,463
3,ADV,1901,389
4,ADV,1902,440
5,ADV,1903,428
6,ADV,1904,202
7,ADV,1905,20
8,ARAI,1909,64
9,ARAI,1919,32


In [20]:
# Spot-check a single year. 'year' holds strings, so compare against '1919'.
df2[df2['year'].eq('1919')]

Unnamed: 0,title,year,docs
9,ARAI,1919,32
35,CE,1919,288
55,CUV,1919,413
117,GH,1919,74
205,LB,1919,432
222,LH,1919,384
235,LUH,1919,624
250,LibM,1919,120
290,PUR,1919,408
361,RH,1919,1664


In [21]:
# Attach each title's spreadsheet metadata to its per-year page counts.
# pd.merge defaults to an inner join, so a title present in only one of the
# two frames (the match is case-sensitive) is dropped from the result.
joined_frames = pd.merge(frame, df2, on='title')

# Report titles that have page counts but no metadata row, instead of letting
# them disappear from the chart silently.
titles_without_metadata = sorted(set(df2['title']) - set(frame['title']))
if titles_without_metadata:
    print("Titles with page counts but no spreadsheet row (dropped by merge):",
          titles_without_metadata)

In [22]:
# Display the merged metadata / page-count table.
joined_frames

Unnamed: 0,endYear,initialPubLocation,periodicalTitle,startYear,title,topic,year,docs
0,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1898,26
1,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1899,674
2,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1900,463
3,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1901,389
4,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1902,440
5,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1903,428
6,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1904,202
7,1905,"Battle Creek, MI",Training School Advocate,1898,ADV,Education,1905,20
8,1900,"Oakland, CA",American Sentinel,1886,AmSn,Religious Liberty,1886,96
9,1900,"Oakland, CA",American Sentinel,1886,AmSn,Religious Liberty,1887,96


In [23]:
from bokeh.charts import Bar, show
from bokeh.charts import defaults
from bokeh.palettes import viridis
from bokeh.io import output_notebook

In [24]:
# Direct bokeh output to the notebook.
output_notebook()

In [25]:
# Set the default width and height on bokeh.charts' `defaults` object.
defaults.width = 750
defaults.height = 1050

In [26]:
# Stacked bar chart: one bar per year, bar height is the summed page count,
# and each bar is split into segments by topic.
# The palette is sized from the data so every topic gets its own colour even
# if the number of topics in the spreadsheet changes (it was hard-coded to 6).
p = Bar(joined_frames,
        'year',
        values='docs',
        agg='sum',
        stack='topic',
        palette=viridis(joined_frames['topic'].nunique()),
        title="Pages per Topic per Year")

In [27]:
# Render the chart.
show(p)