In [1]:
import pandas as pd
import numpy as np
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
import os
import re

In [2]:
# Import Documents into dataframe

"""
Creating a dictionary with the following structure: 
{type: 
    { doc_id: content,
    }
}
"""

auto_corrected = "/Users/jeriwieringa/Documents/Research/ocr-and-nlp/data/text/auto_corrected/"
ground_truth = "/Users/jeriwieringa/Documents/Research/ocr-and-nlp/data/text/ground_truth/"

def load_docs(folder):
    dictionary = {}
    for each in os.listdir(folder):
        with open(os.path.join(folder, each), "r") as f:
            content = f.read()
        id_ = "{}-{}".format(each, os.path.basename(os.path.normpath(folder)))
        dictionary[id_] = content
    
    return dictionary

auto_corrected_docs = load_docs(auto_corrected)
ground_truth_docs = load_docs(ground_truth)


In [3]:
auto_df = pd.DataFrame.from_dict(auto_corrected_docs, orient="index").reset_index()
auto_df.columns=["doc_id", "text"]
auto_df['type'] = "auto_corrected"

In [4]:
ground_df = pd.DataFrame.from_dict(ground_truth_docs, orient="index").reset_index()
ground_df.columns = ["doc_id", "text"]
ground_df['type'] = "ground_truth"

In [5]:
text_df = auto_df.append(ground_df, ignore_index=True)

In [6]:
text_df

Unnamed: 0,doc_id,text,type
0,HR18660801-V01-01-p8.txt-auto_corrected,""" "" O\nC\n'l'HE HEALTH REFORMER. : ,. r c ,;...",auto_corrected
1,YI18540801-V02-08-p4.txt-auto_corrected,",man , III z onnasalpm twzI 11 11 11 1, 'IAA U...",auto_corrected
2,ST18740813-V01-06-p1.txt-auto_corrected,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...,auto_corrected
3,RH18540815-V06-01-p1.txt-auto_corrected,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...,auto_corrected
4,HR18660801-V01-01-p8.txt-ground_truth,8 THE HEALTH REFORMER\n\nEditorial.\n\nTo ...,ground_truth
5,YI18540801-V02-08-p4.txt-ground_truth,60 YOUTH'S INSTRUCTOR. \n\nhumble man Davi...,ground_truth
6,ST18740813-V01-06-p1.txt-ground_truth,"The Signs of the Times.\n""Behold, I come quick...",ground_truth
7,RH18540815-V06-01-p1.txt-ground_truth,"THE ADVENT REVIEW, \nAND SABBATH HERALD.\n\nHe...",ground_truth


In [7]:
# Do standard cleaning by removing stopwords
stop_words_list = stopwords.words('english')

text_df['text_cleaned'] = text_df.text.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z]',' ',w).lower() not in stop_words_list) )
text_df

Unnamed: 0,doc_id,text,type,text_cleaned
0,HR18660801-V01-01-p8.txt-auto_corrected,""" "" O\nC\n'l'HE HEALTH REFORMER. : ,. r c ,;...",auto_corrected,c l he health reformer r c cy ...
1,YI18540801-V02-08-p4.txt-auto_corrected,",man , III z onnasalpm twzI 11 11 11 1, 'IAA U...",auto_corrected,man iii z onnasalpm twzi iaa u...
2,ST18740813-V01-06-p1.txt-auto_corrected,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...,auto_corrected,kr ffiigno zintro issued weekly oakland calif...
3,RH18540815-V06-01-p1.txt-auto_corrected,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...,auto_corrected,review herald published weekly south st paul...
4,HR18660801-V01-01-p8.txt-ground_truth,8 THE HEALTH REFORMER\n\nEditorial.\n\nTo ...,ground_truth,health reformer editorial reader h s lay...
5,YI18540801-V02-08-p4.txt-ground_truth,60 YOUTH'S INSTRUCTOR. \n\nhumble man Davi...,ground_truth,youth s instructor humble man david was a...
6,ST18740813-V01-06-p1.txt-ground_truth,"The Signs of the Times.\n""Behold, I come quick...",ground_truth,signs times behold come quickly reward me ...
7,RH18540815-V06-01-p1.txt-ground_truth,"THE ADVENT REVIEW, \nAND SABBATH HERALD.\n\nHe...",ground_truth,advent review sabbath herald patience saints...


In [8]:
# Standard cleaning but keeping sentences

text_df['text_cleaned_sent'] = text_df.text.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z.?!]',' ',w).lower() for w in x.split() if re.sub(r'[^a-zA-Z.?!]',' ',w).lower() not in stop_words_list) )
text_df

Unnamed: 0,doc_id,text,type,text_cleaned,text_cleaned_sent
0,HR18660801-V01-01-p8.txt-auto_corrected,""" "" O\nC\n'l'HE HEALTH REFORMER. : ,. r c ,;...",auto_corrected,c l he health reformer r c cy ...,c l he health reformer. . r c cy ...
1,YI18540801-V02-08-p4.txt-auto_corrected,",man , III z onnasalpm twzI 11 11 11 1, 'IAA U...",auto_corrected,man iii z onnasalpm twzi iaa u...,man iii z onnasalpm twzi iaa u...
2,ST18740813-V01-06-p1.txt-auto_corrected,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...,auto_corrected,kr ffiigno zintro issued weekly oakland calif...,kr ffiigno zintro issued weekly oakland calif...
3,RH18540815-V06-01-p1.txt-auto_corrected,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...,auto_corrected,review herald published weekly south st paul...,review herald. published weekly south st. paul...
4,HR18660801-V01-01-p8.txt-ground_truth,8 THE HEALTH REFORMER\n\nEditorial.\n\nTo ...,ground_truth,health reformer editorial reader h s lay...,health reformer editorial. reader. h. s. lay...
5,YI18540801-V02-08-p4.txt-ground_truth,60 YOUTH'S INSTRUCTOR. \n\nhumble man Davi...,ground_truth,youth s instructor humble man david was a...,youth s instructor. humble man david was. a...
6,ST18740813-V01-06-p1.txt-ground_truth,"The Signs of the Times.\n""Behold, I come quick...",ground_truth,signs times behold come quickly reward me ...,signs times. behold come quickly reward me ...
7,RH18540815-V06-01-p1.txt-ground_truth,"THE ADVENT REVIEW, \nAND SABBATH HERALD.\n\nHe...",ground_truth,advent review sabbath herald patience saints...,advent review sabbath herald. patience saints...


In [9]:
# Cleaned but with punctuation and casing

text_df['text_cleaned_nlp'] = text_df.text.apply(lambda x: " ".join(re.sub(r'[^a-zA-Z.?!]', ' ', w) for w in x.split()))
text_df['text_cleaned_nlp'] = text_df.text_cleaned_nlp.apply(lambda x: " ".join(re.sub(r'\s{2,}', ' ', w) for w in x.split()))
text_df

Unnamed: 0,doc_id,text,type,text_cleaned,text_cleaned_sent,text_cleaned_nlp
0,HR18660801-V01-01-p8.txt-auto_corrected,""" "" O\nC\n'l'HE HEALTH REFORMER. : ,. r c ,;...",auto_corrected,c l he health reformer r c cy ...,c l he health reformer. . r c cy ...,O C l HE HEALTH REFORMER. . r c CY c c x . x x...
1,YI18540801-V02-08-p4.txt-auto_corrected,",man , III z onnasalpm twzI 11 11 11 1, 'IAA U...",auto_corrected,man iii z onnasalpm twzi iaa u...,man iii z onnasalpm twzi iaa u...,man III z onnasalpm twzI IAA UM I II . KJ . . ...
2,ST18740813-V01-06-p1.txt-auto_corrected,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...,auto_corrected,kr ffiigno zintro issued weekly oakland calif...,kr ffiigno zintro issued weekly oakland calif...,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...
3,RH18540815-V06-01-p1.txt-auto_corrected,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...,auto_corrected,review herald published weekly south st paul...,review herald. published weekly south st. paul...,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...
4,HR18660801-V01-01-p8.txt-ground_truth,8 THE HEALTH REFORMER\n\nEditorial.\n\nTo ...,ground_truth,health reformer editorial reader h s lay...,health reformer editorial. reader. h. s. lay...,THE HEALTH REFORMER Editorial. To the Reader. ...
5,YI18540801-V02-08-p4.txt-ground_truth,60 YOUTH'S INSTRUCTOR. \n\nhumble man Davi...,ground_truth,youth s instructor humble man david was a...,youth s instructor. humble man david was. a...,YOUTH S INSTRUCTOR. humble man David was. Alth...
6,ST18740813-V01-06-p1.txt-ground_truth,"The Signs of the Times.\n""Behold, I come quick...",ground_truth,signs times behold come quickly reward me ...,signs times. behold come quickly reward me ...,The Signs of the Times. Behold I come quickly ...
7,RH18540815-V06-01-p1.txt-ground_truth,"THE ADVENT REVIEW, \nAND SABBATH HERALD.\n\nHe...",ground_truth,advent review sabbath herald patience saints...,advent review sabbath herald. patience saints...,THE ADVENT REVIEW AND SABBATH HERALD. Here is ...


In [10]:
text_df.to_pickle("../data/all_text.pkl")

In [11]:
import json

In [12]:
text_df['series'] = "not_bible"
text_df

Unnamed: 0,doc_id,text,type,text_cleaned,text_cleaned_sent,text_cleaned_nlp,series
0,HR18660801-V01-01-p8.txt-auto_corrected,""" "" O\nC\n'l'HE HEALTH REFORMER. : ,. r c ,;...",auto_corrected,c l he health reformer r c cy ...,c l he health reformer. . r c cy ...,O C l HE HEALTH REFORMER. . r c CY c c x . x x...,not_bible
1,YI18540801-V02-08-p4.txt-auto_corrected,",man , III z onnasalpm twzI 11 11 11 1, 'IAA U...",auto_corrected,man iii z onnasalpm twzi iaa u...,man iii z onnasalpm twzi iaa u...,man III z onnasalpm twzI IAA UM I II . KJ . . ...,not_bible
2,ST18740813-V01-06-p1.txt-auto_corrected,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...,auto_corrected,kr ffiigno zintro issued weekly oakland calif...,kr ffiigno zintro issued weekly oakland calif...,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...,not_bible
3,RH18540815-V06-01-p1.txt-auto_corrected,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...,auto_corrected,review herald published weekly south st paul...,review herald. published weekly south st. paul...,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...,not_bible
4,HR18660801-V01-01-p8.txt-ground_truth,8 THE HEALTH REFORMER\n\nEditorial.\n\nTo ...,ground_truth,health reformer editorial reader h s lay...,health reformer editorial. reader. h. s. lay...,THE HEALTH REFORMER Editorial. To the Reader. ...,not_bible
5,YI18540801-V02-08-p4.txt-ground_truth,60 YOUTH'S INSTRUCTOR. \n\nhumble man Davi...,ground_truth,youth s instructor humble man david was a...,youth s instructor. humble man david was. a...,YOUTH S INSTRUCTOR. humble man David was. Alth...,not_bible
6,ST18740813-V01-06-p1.txt-ground_truth,"The Signs of the Times.\n""Behold, I come quick...",ground_truth,signs times behold come quickly reward me ...,signs times. behold come quickly reward me ...,The Signs of the Times. Behold I come quickly ...,not_bible
7,RH18540815-V06-01-p1.txt-ground_truth,"THE ADVENT REVIEW, \nAND SABBATH HERALD.\n\nHe...",ground_truth,advent review sabbath herald patience saints...,advent review sabbath herald. patience saints...,THE ADVENT REVIEW AND SABBATH HERALD. Here is ...,not_bible


In [13]:
json_data = text_df[['doc_id', 'series', 'text_cleaned_nlp']].copy()
json_data

Unnamed: 0,doc_id,series,text_cleaned_nlp
0,HR18660801-V01-01-p8.txt-auto_corrected,not_bible,O C l HE HEALTH REFORMER. . r c CY c c x . x x...
1,YI18540801-V02-08-p4.txt-auto_corrected,not_bible,man III z onnasalpm twzI IAA UM I II . KJ . . ...
2,ST18740813-V01-06-p1.txt-auto_corrected,not_bible,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...
3,RH18540815-V06-01-p1.txt-auto_corrected,not_bible,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...
4,HR18660801-V01-01-p8.txt-ground_truth,not_bible,THE HEALTH REFORMER Editorial. To the Reader. ...
5,YI18540801-V02-08-p4.txt-ground_truth,not_bible,YOUTH S INSTRUCTOR. humble man David was. Alth...
6,ST18740813-V01-06-p1.txt-ground_truth,not_bible,The Signs of the Times. Behold I come quickly ...
7,RH18540815-V06-01-p1.txt-ground_truth,not_bible,THE ADVENT REVIEW AND SABBATH HERALD. Here is ...


In [14]:
# Add Bible as text

with open('../software/PH-Passim-tutorial/eebo/ref/king_james.txt') as f:
    bible = f.read()
    
bible_df = pd.DataFrame([['king_james', 'bible', '{}'.format(bible)]], columns=['doc_id', 'series', 'text_cleaned_nlp'])
bible_df

Unnamed: 0,doc_id,series,text_cleaned_nlp
0,king_james,bible,﻿The Old Testament of the King James Version o...


In [15]:
json_data = json_data.append(bible_df)
json_data

Unnamed: 0,doc_id,series,text_cleaned_nlp
0,HR18660801-V01-01-p8.txt-auto_corrected,not_bible,O C l HE HEALTH REFORMER. . r c CY c c x . x x...
1,YI18540801-V02-08-p4.txt-auto_corrected,not_bible,man III z onnasalpm twzI IAA UM I II . KJ . . ...
2,ST18740813-V01-06-p1.txt-auto_corrected,not_bible,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...
3,RH18540815-V06-01-p1.txt-auto_corrected,not_bible,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...
4,HR18660801-V01-01-p8.txt-ground_truth,not_bible,THE HEALTH REFORMER Editorial. To the Reader. ...
5,YI18540801-V02-08-p4.txt-ground_truth,not_bible,YOUTH S INSTRUCTOR. humble man David was. Alth...
6,ST18740813-V01-06-p1.txt-ground_truth,not_bible,The Signs of the Times. Behold I come quickly ...
7,RH18540815-V06-01-p1.txt-ground_truth,not_bible,THE ADVENT REVIEW AND SABBATH HERALD. Here is ...
0,king_james,bible,﻿The Old Testament of the King James Version o...


In [16]:
json_data = json_data.rename(columns={'doc_id':'id', 'text_cleaned_nlp':'text'}).reset_index(drop=True)
json_data

Unnamed: 0,id,series,text
0,HR18660801-V01-01-p8.txt-auto_corrected,not_bible,O C l HE HEALTH REFORMER. . r c CY c c x . x x...
1,YI18540801-V02-08-p4.txt-auto_corrected,not_bible,man III z onnasalpm twzI IAA UM I II . KJ . . ...
2,ST18740813-V01-06-p1.txt-auto_corrected,not_bible,kr ffiigno the Zintro IS ISSUED WEEKLY AT OAKL...
3,RH18540815-V06-01-p1.txt-auto_corrected,not_bible,THE REVIEW AND HERALD. IS PUBLISHED WEEKLY At ...
4,HR18660801-V01-01-p8.txt-ground_truth,not_bible,THE HEALTH REFORMER Editorial. To the Reader. ...
5,YI18540801-V02-08-p4.txt-ground_truth,not_bible,YOUTH S INSTRUCTOR. humble man David was. Alth...
6,ST18740813-V01-06-p1.txt-ground_truth,not_bible,The Signs of the Times. Behold I come quickly ...
7,RH18540815-V06-01-p1.txt-ground_truth,not_bible,THE ADVENT REVIEW AND SABBATH HERALD. Here is ...
8,king_james,bible,﻿The Old Testament of the King James Version o...


In [17]:
json_dict = json_data.to_dict("records")
print(type(json_dict))

<class 'list'>


In [18]:
with open('../data/json_data.json', 'w') as f:
    for text in json_dict:    
        f.write("{}\n".format(text))