In [1]:
import os
import pandas as pd
import gspread
from oauth2client.service_account import ServiceAccountCredentials
import re

In [2]:
# Text file that lists, one per line, the correction directory to use for each
# title (entries look like "ADV/correction15").
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
final_dirs = "/Users/jeriwieringa/Dissertation/drafts/data/2017-03-24-final-correction-files.txt"

In [3]:
# Read the list of correction directories, one relative path per line.
# Blank lines are dropped: an empty entry would make os.path.join(text_dir, "")
# point back at the corpus root instead of a title's correction directory.
with open(final_dirs) as f:
    content = [line.strip() for line in f if line.strip()]

In [4]:
# Display the parsed directory list as a sanity check.
content

['ADV/correction15',
 'AmSn/correction9',
 'ARAI/correction7',
 'CE/correction9',
 'CUV/correction9',
 'EDU/correction9',
 'GCB/correction9',
 'GH/correction8',
 'GoH/correction9',
 'GS/correction7',
 'HM/correction9',
 'HR/correction9',
 'IR/correction9',
 'LB/correction9',
 'LH/correction9',
 'LibM/correction10',
 'LUH/correction7',
 'NMN/correction8',
 'PHJ/correction9',
 'PTAR/correction7',
 'PUR/correction8',
 'RH1850-1889/correction8',
 'RH1890-1920/correction9',
 'Sligo/correction8',
 'SOL/correction6',
 'ST/correction9',
 'SUW/correction8',
 'TCOG/correction6',
 'TMM/correction6',
 'WMH/correction8',
 'YI/correction10']

In [5]:
# Root directory of the corpus; each entry in `content` is joined onto it to
# locate a title's correction directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
text_dir = "/Users/jeriwieringa/Dissertation/text/text/2017-01-31-corpus-with-utf8-split-into-titles-cleaning/"

In [6]:
# Collect the page filenames found in each listed correction directory.
# `filenames` is a list of lists: one inner list per entry in `content`.
filenames = []
for each in content:
    dir_name = os.path.join(text_dir, each)
    # os.listdir returns entries in arbitrary order; sort so reruns are reproducible.
    filenames.append(sorted(os.listdir(dir_name)))

In [7]:
def extract_title_info(dictionary, page_list):
    """Record the title abbreviation and year parsed from each page filename.

    Only the part of the filename before the first ``-`` is inspected. For a
    name such as ``ADV18981201-V02-01-page1.txt`` the leading run of letters
    (``ADV``) becomes the title and the first four characters of the first
    run of digits (``1898``) become the year.

    Args:
        dictionary: Mapping updated in place. Each parsed filename becomes a
            key whose value is ``{'title': <str>, 'year': <str>}``.
        page_list: Iterable of page filenames to parse.

    Returns:
        None. Results are written into ``dictionary``.

    Filenames that do not start with letters or contain no digits before the
    first ``-`` (for example ``.DS_Store`` or other stray files in a corpus
    directory) are skipped rather than raising ``AttributeError`` on the
    failed regex match.
    """
    for page in page_list:
        # Title and date both live in the segment before the first hyphen.
        prefix = page.split('-')[0]

        title = re.match("[A-Za-z]+", prefix)
        dates = re.search(r'[0-9]+', prefix)

        # Not a page file in the expected naming scheme: leave it out.
        if title is None or dates is None:
            continue

        # The digit run is a full date stamp; keep only its first four characters.
        dictionary[page] = {'title': title.group(), 'year': dates.group()[:4]}

In [8]:
# Filename -> {'title', 'year'} lookup; filled in place by extract_title_info.
pages = {}

In [9]:
# Fold the listing of every correction directory into the single `pages` lookup.
for page_list in filenames:
    extract_title_info(pages, page_list)

In [10]:
# One row per page file: the filename is the index, the parsed fields are columns.
df = pd.DataFrame.from_dict(data=pages, orient="index")

In [11]:
# Preview the first rows only; the frame has one row per page file.
df.head(10)

Unnamed: 0,year,title
ADV18981201-V02-01-page1.txt,1898,ADV
ADV18981201-V02-01-page10.txt,1898,ADV
ADV18981201-V02-01-page11.txt,1898,ADV
ADV18981201-V02-01-page12.txt,1898,ADV
ADV18981201-V02-01-page13.txt,1898,ADV
ADV18981201-V02-01-page14.txt,1898,ADV
ADV18981201-V02-01-page15.txt,1898,ADV
ADV18981201-V02-01-page16.txt,1898,ADV
ADV18981201-V02-01-page17.txt,1898,ADV
ADV18981201-V02-01-page18.txt,1898,ADV


In [12]:
# Label the filename index so that it turns into a "docs" column when the
# index is reset.
df = df.rename_axis('docs')

In [13]:
# Move the filename index into a regular column and fall back to a 0..n-1 index.
df = df.reset_index(drop=False)

In [14]:
# Preview the first rows only to confirm the index was moved into a column.
df.head(10)

Unnamed: 0,docs,year,title
0,ADV18981201-V02-01-page1.txt,1898,ADV
1,ADV18981201-V02-01-page10.txt,1898,ADV
2,ADV18981201-V02-01-page11.txt,1898,ADV
3,ADV18981201-V02-01-page12.txt,1898,ADV
4,ADV18981201-V02-01-page13.txt,1898,ADV
5,ADV18981201-V02-01-page14.txt,1898,ADV
6,ADV18981201-V02-01-page15.txt,1898,ADV
7,ADV18981201-V02-01-page16.txt,1898,ADV
8,ADV18981201-V02-01-page17.txt,1898,ADV
9,ADV18981201-V02-01-page18.txt,1898,ADV


In [15]:
# Count page files for every (title, year) pair; the count lands in "docs".
pages_by_title_year = df.groupby(by=["title", "year"], as_index=False)
df2 = pages_by_title_year["docs"].count()

In [16]:
# Preview the first rows of the per-title, per-year page counts.
df2.head(10)

Unnamed: 0,title,year,docs
0,ADV,1898,26
1,ADV,1899,674
2,ADV,1900,463
3,ADV,1901,389
4,ADV,1902,440
5,ADV,1903,428
6,ADV,1904,202
7,ADV,1905,20
8,ARAI,1909,64
9,ARAI,1919,32


In [17]:
# Spot-check a single year: page counts of every title with pages from 1919.
# "year" holds strings sliced out of the filenames, hence the quoted value.
is_1919 = df2['year'].eq('1919')
df2[is_1919]

Unnamed: 0,title,year,docs
9,ARAI,1919,32
35,CE,1919,288
55,CUV,1919,413
117,GH,1919,74
205,LB,1919,432
222,LH,1919,384
235,LUH,1919,624
250,LibM,1919,120
290,PUR,1919,408
361,RH,1919,1664


In [18]:
from bokeh.charts import Bar, show
from bokeh.charts import defaults
from bokeh.palettes import viridis
from bokeh.io import output_notebook

In [19]:
# Route Bokeh output to the notebook so plots render inline below their cell.
output_notebook()

In [20]:
# Default size for every chart created after this cell.
defaults.width, defaults.height = 950, 1050

In [21]:
# Stacked bar chart: one bar per year, stacked by title, with bar heights
# summing the page counts in "docs".
# The palette is sized from the data instead of a hard-coded 30, so each title
# keeps its own colour if the set of titles changes.
# NOTE(review): bokeh.charts was split out of Bokeh (as bkcharts) in later
# releases — confirm the installed version still provides Bar.
title_count = df2['title'].nunique()
p = Bar(df2,
        'year',
        values='docs',
        agg='sum',
        stack='title',
        palette=viridis(title_count),
        title="Pages per Title per Year")

In [22]:
# Render the stacked bar chart.
show(p)