In [1]:
import pandas as pd
import pickle
import numpy as np
from collections import Counter
from gensim import corpora
import re
import json

In [2]:
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [3]:
trigramCount_cleaned_dict = pickle.load(open("../data/large_files/trigramCount_cleaned_dict.pickle", "rb"))

In [4]:
collocation_replacements = json.load(open("../data/collocation_replacements.json"))
collocations_keys = list(collocation_replacements.keys())

In [5]:
to_replace = json.load(open("../data/to_replace.json"))
to_replace_keys = list(to_replace.keys())

In [6]:
key = "Church"
re.sub(key, to_replace[key], "Church Jesus")

'church Jesus'

In [7]:
trigram_string = "St station Thess"
trigram_string_split = trigram_string.split()
for key in to_replace_keys:
    if key in trigram_string:
        trigram_string_split = [to_replace[w] if w==key else w for w in trigram_string_split]
trigram_string_split

['saint', 'station', 'Thessalonians']

In [8]:
def clean_trigram_string(trigram_string):
    trigram_string_split = trigram_string.split()
    if re.search("\D\d", trigram_string):
        trigram_string_split = [re.sub("\d+", "", w) for w in trigram_string_split]
    for key in to_replace_keys:
        if key in trigram_string:
            trigram_string_split = [to_replace[w] if w==key else w for w in trigram_string_split]
    trigram_string = " ".join(trigram_string_split)
    for key in collocations_keys:
        key_split = key.split()
        if (key_split[0] in trigram_string) & (key_split[1] in trigram_string):
            trigram_string = re.sub(key_split[0], collocation_replacements[key], trigram_string.replace(key_split[1], ""))
    return trigram_string

In [9]:
trigram_string = "Murphy name112 Rom7 Oconnor"
clean_trigram_string(trigram_string)

'Murphy_Oconnor name Romans '

In [10]:
trigram_string = "Old Testament text"
clean_trigram_string(trigram_string)

'Old_Testament  text'

In [11]:
trigram_string = "Ot text"
clean_trigram_string(trigram_string)

'Old_Testament  text'

# Preprocesing for embeddings etc.


In [12]:
unigrams_merged_cleaned = pickle.load(open("../data/large_files/unigrams_merged_cleaned.pickle", "rb"))

In [13]:
#
types_N = len(unigrams_merged_cleaned)
types_N

871634

In [14]:
tokens_N = sum([tup[1] for tup in unigrams_merged_cleaned.items()])
tokens_N

64389234

In [15]:
threshold = 50
unigrams_merged_thresh = dict([tup for tup in unigrams_merged_cleaned.items() if tup[1] >= threshold])
types_N_thres = len(unigrams_merged_thresh)
types_N_thres

41385

In [16]:
tokens_N_thres = sum([tup[1] for tup in unigrams_merged_thresh.items()])
tokens_N_thres

61468846

In [17]:
print(np.round(types_N_thres / types_N * 100, 2))
print(np.round(tokens_N_thres / tokens_N * 100, 2))

4.75
95.46


In [18]:
unigrams_sorted_tups = sorted(list(unigrams_merged_thresh.items()), key = lambda tup: tup[1], reverse=True)
unigrams_sorted_tups[:10]

[('God', 540049),
 ('Paul', 412645),
 ('church', 384560),
 ('Christian', 322497),
 ('new', 297469),
 ('Jesus', 285583),
 ('Christ', 275021),
 ('work', 210300),
 ('man', 200756),
 ('book', 197991)]

In [19]:
unigrams_sorted_tups = sorted(list(unigrams_merged_thresh.items()), key = lambda tup: tup[1], reverse=True)
unigrams_sorted_tups[:10]

[('God', 540049),
 ('Paul', 412645),
 ('church', 384560),
 ('Christian', 322497),
 ('new', 297469),
 ('Jesus', 285583),
 ('Christ', 275021),
 ('work', 210300),
 ('man', 200756),
 ('book', 197991)]

In [22]:
pickle.dump(unigrams_sorted_tups, open("../data/freqs_tups.pickle", "wb"))

In [22]:
nonewords = ["esus"]
[tup for tup in unigrams_sorted_tups if tup[0] in nonewords]

[('esus', 331)]

In [20]:
vocabulary = [tup[0] for tup in unigrams_sorted_tups]

['God',
 'Paul',
 'church',
 'Christian',
 'new',
 'Jesus',
 'Christ',
 'work',
 'man',
 'book',
 'time',
 'life',
 'use',
 'text',
 'word',
 'come',
 'people',
 'way',
 'world',
 'gospel',
 'study',
 'John',
 'mean',
 'faith',
 'Spirit',
 'write',
 'law',
 'letter',
 'find',
 'know',
 'century',
 'Jewish',
 'history',
 'Testament',
 'place',
 'good',
 'human',
 'understand',
 'woman',
 'theology',
 'great',
 'religious',
 'read',
 'act',
 'lord',
 'saint',
 'point',
 'tradition',
 'live',
 'love',
 'university',
 'community',
 'chapter',
 'follow',
 'Romans',
 'think',
 'religion',
 'press',
 'power',
 'present',
 'thing',
 'question',
 'form',
 'view',
 'holy',
 'biblical',
 'state',
 'son',
 'father',
 'year',
 'order',
 'day',
 'note',
 'old',
 'second',
 'christianity',
 'death',
 'speak',
 'bible',
 'scripture',
 'author',
 'theological',
 'divine',
 'example',
 'sin',
 'term',
 'body',
 'include',
 'end',
 'interpretation',
 'jew',
 'accord',
 'need',
 'social',
 'fact',
 'apost

In [186]:
dictionary = corpora.Dictionary([vocabulary])

In [187]:
dictionary.token2id["Christ"]

2605

In [188]:
len(dictionary)

41385

In [189]:
pickle.dump(dictionary, open("../data/dictionary_main.pickle", "wb"))

In [190]:
dictionary.doc2bow(["Jesus", "Christ", "nonsenseword"])

[(2605, 1), (6506, 1)]

In [191]:
trigram_string = "Christ Jesus Paul"
trigram_string_split = trigram_string.split()
print(trigram_string_split)

['Christ', 'Jesus', 'Paul']


In [192]:
trigram_string = "Christ Jesus rom23"
doc = dictionary.doc2bow(clean_trigram_string(trigram_string).split())
doc

[(2605, 1), (6506, 1), (10963, 1)]

In [193]:
trigram_string = "Jesus12 New Testament"
doc = dictionary.doc2bow(clean_trigram_string(trigram_string).split())
doc

[(6506, 1), (9235, 1)]

In [194]:
[(dictionary[tup[0]], tup[1]) for tup in doc]

[('Jesus', 1), ('New_Testament', 1)]

In [195]:
dictionary.doc2bow(["Jesus", "Jesus", "Christ"])

[(2605, 1), (6506, 2)]

# generate bows data for individual articles and save them one by one

In [196]:
article_ids_dict = dict(zip(trigramCount_cleaned_dict.keys(), range(len(trigramCount_cleaned_dict))))

In [197]:
list(article_ids_dict.items())[:10]

[('ark://27927/phx66812gq6', 0),
 ('ark://27927/pbd6fpf5fh', 1),
 ('ark://27927/phw1kd8s300', 2),
 ('ark://27927/phx64fptrwj', 3),
 ('ark://27927/phx64k1x5c2', 4),
 ('ark://27927/phx64fkrk6m', 5),
 ('http://www.jstor.org/stable/43052718', 6),
 ('ark://27927/phx68d6dm3t', 7),
 ('ark://27927/pbd934r3jr', 8),
 ('ark://27927/phx2t1wjwnt', 9)]

In [198]:
pickle.dump(article_ids_dict, open("../data/article_ids_dict.pickle", "wb"))

In [199]:
%%time
checkers = range(500, len(trigramCount_cleaned_dict), 500)
trigramCount_bows = {}
for id, data in list(trigramCount_cleaned_dict.items()):
    data = dict([(clean_trigram_string(trigram), count) for trigram, count in data.items() if len(trigram.split()) > 1])
    data_bows = []
    for trigram, count in data.items():
        bow = dictionary.doc2bow(trigram.split())
        if len(bow) > 1:
            data_bows.extend([bow] * count)
    simple_id = article_ids_dict[id]
    pickle.dump(data_bows, open("../data/large_files/article_docs/" + str(simple_id) + ".pickle", "wb"))
    if simple_id in checkers:
        print(simple_id)

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
CPU times: user 14min 20s, sys: 1min 18s, total: 15min 39s
Wall time: 16min 35s
