In [34]:
import pandas as pd
import pickle
import numpy as np
from collections import Counter
from gensim import corpora
import re

In [35]:
# Load the dataframe stored in jstor_df_v1.feather.
# NOTE(review): jstor_df is not referenced again in the visible part of this notebook.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [36]:
# Load the per-article trigram counts (article id -> {trigram string: count}).
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that were produced by this project.
with open("../data/large_files/trigramCount_cleaned_dict.pickle", "rb") as pickle_file:
    trigramCount_cleaned_dict = pickle.load(pickle_file)

In [37]:
# Two-word collocations (key: the two words separated by a space) mapped to
# the single underscore-joined token that replaces them during cleaning.
# Insertion order matters: the pairs are applied in exactly this order.
collocation_replacements = {
    # cultural / religious compounds, including spelling variants
    "Greco Roman": "Greco_Roman",
    "Graeco Roman": "Greco_Roman",
    "Acts Luke": "Luke_Acts",
    "Luke act": "Luke_Acts",
    "Christian Jewish": "Jewish_Christian",
    "Judeo Christian": "Judeo_Christian",
    "Judaeo Christian": "Judeo_Christian",
    # possessive residue: "Paul s" collapses to plain "Paul"
    "Paul s": "Paul",
    "Denis Saint": "Saint_Denis",
    "Pauline deutero": "deutero_Pauline",
    "Deutero Pauline": "deutero_Pauline",
    "Saxon anglo": "Anglo_Saxon",
    # double-barrelled surnames
    "Murphy Oconnor": "Murphy_Oconnor",
    "Engberg Pedersen": "Engberg_Pedersen",
}
collocations_keys = list(collocation_replacements)

In [38]:
# Whole-word substitutions applied to individual tokens during cleaning.
to_replace = {
    # case normalisation
    "Century": "century",
    "christian": "Christian",
    "Biblical": "biblical",
    "Church": "church",
    "Faith": "faith",
    "Apostle": "apostle",
    "american": "American",
    "jewish": "Jewish",
    "roman": "Roman",
    "testament": "Testament",
    "Lord": "lord",
    # abbreviations expanded to full words
    "St": "saint",
    "st": "saint",
    "Rom": "Romans",
    "rom": "Romans",
    "Cor": "Corinthians",
    "cor": "Corinthians",
    "Gal": "Galatians",
    "Phil": "Philippians",
    "Thess": "Thessalonians",
}
to_replace_keys = list(to_replace)

In [39]:
# Quick check of a single replacement. Note that re.sub treats `key` as a
# regex pattern and rewrites every substring match, not only whole words.
key = "Church"
re.sub(key, to_replace[key], "Church Jesus")

'church Jesus'

In [89]:
# Prototype of the whole-word replacement. The substring test on the full
# string is only a cheap pre-filter; the actual swap requires w == key, so
# "station" is left untouched even though other keys are substrings of it.
trigram_string = "St station Thess"
trigram_string_split = trigram_string.split()
for key in to_replace_keys:
    if key in trigram_string:
        trigram_string_split = [to_replace[w] if w==key else w for w in trigram_string_split]
trigram_string_split

['saint', 'station', 'Thessalonians']

In [86]:
def clean_trigram_string(trigram_string, replacements=None, collocations=None):
    """Normalise a whitespace-separated n-gram string.

    The steps are applied in this order:

    1. If the string contains a digit directly preceded by a non-digit
       character, every run of digits is stripped from every token
       (``"Rom7"`` -> ``"Rom"``).
    2. Every token that is exactly equal to a key of ``replacements`` is
       swapped for the mapped value.
    3. For every ``"first second"`` key of ``collocations``: when both words
       occur as whole tokens (in any order, adjacent or not), each ``first``
       token becomes the merged value and each ``second`` token is blanked.

    Matching is done on whole tokens only. Substring matching would, for
    example, let the ``"Paul s"`` pair delete every letter "s" from any
    n-gram that mentions Paul.

    Parameters
    ----------
    trigram_string : str
        Tokens separated by whitespace.
    replacements : dict[str, str], optional
        Whole-token substitutions. Defaults to the notebook-level
        ``to_replace`` mapping.
    collocations : dict[str, str], optional
        Two-word keys mapped to their merged token. Defaults to the
        notebook-level ``collocation_replacements`` mapping. Keys must
        contain at least two words; only the first two are used.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. Blanked tokens leave an
        empty slot behind, so the result can contain doubled or trailing
        spaces; callers are expected to ``.split()`` it again.
    """
    if replacements is None:
        replacements = to_replace
    if collocations is None:
        collocations = collocation_replacements

    tokens = trigram_string.split()

    # Strip digits only when a digit is glued to (or follows) a non-digit.
    if re.search(r"\D\d", trigram_string):
        tokens = [re.sub(r"\d+", "", token) for token in tokens]

    # Whole-token substitution; unknown tokens pass through unchanged.
    tokens = [replacements.get(token, token) for token in tokens]

    for pair, merged in collocations.items():
        first, second = pair.split()[:2]
        if first in tokens and second in tokens:
            # Keep an empty placeholder for the dropped word so the spacing of
            # the joined result stays the same as before.
            tokens = [
                merged if token == first else "" if token == second else token
                for token in tokens
            ]
    return " ".join(tokens)

In [87]:
# Smoke test: digits are stripped, "Rom" is expanded to "Romans", and the
# non-adjacent Murphy/Oconnor pair is merged into a single token.
trigram_string = "Murphy name112 Rom7 Oconnor"
clean_trigram_string(trigram_string)

'Murphy_Oconnor name Romans '

# Preprocessing for embeddings etc.


In [90]:
# Load the corpus-wide unigram counts.
# NOTE(review): pickle can execute arbitrary code while loading; only open
# files that were produced by this project.
with open("../data/large_files/unigrams_merged_cleaned.pickle", "rb") as pickle_file:
    unigrams_merged_cleaned = pickle.load(pickle_file)

In [91]:
# Number of distinct unigram types (entries in the unigram count mapping).
types_N = len(unigrams_merged_cleaned)
types_N

973490

In [92]:
# Total number of tokens: the sum of all unigram frequencies.
tokens_N = sum(unigrams_merged_cleaned.values())
tokens_N

64396679

In [93]:
# Keep only the unigrams that occur at least `threshold` times.
threshold = 50
unigrams_merged_thresh = {
    token: freq
    for token, freq in unigrams_merged_cleaned.items()
    if freq >= threshold
}
types_N_thres = len(unigrams_merged_thresh)
types_N_thres

43817

In [94]:
# Tokens still covered once the rare unigrams are removed.
tokens_N_thres = sum(unigrams_merged_thresh.values())
tokens_N_thres

61128623

In [95]:
# Share (in percent) of types, then of tokens, that survive the threshold:
# few types are kept, yet they account for the bulk of the tokens.
print(np.round(types_N_thres / types_N * 100, 2))
print(np.round(tokens_N_thres / tokens_N * 100, 2))

4.5
94.93


In [96]:
# Rank the retained unigrams from most to least frequent.
unigrams_sorted_tups = sorted(
    unigrams_merged_thresh.items(),
    key=lambda item: item[1],
    reverse=True,
)
unigrams_sorted_tups[:10]

[('God', 506299),
 ('Paul', 410813),
 ('church', 384135),
 ('Christian', 319700),
 ('new', 286785),
 ('Jesus', 284717),
 ('Christ', 267051),
 ('work', 210266),
 ('man', 196657),
 ('time', 192015)]

In [97]:
# NOTE(review): this cell repeats the previous cell verbatim and recomputes
# the same frequency-sorted list; one of the two can be removed.
unigrams_sorted_tups = sorted(list(unigrams_merged_thresh.items()), key = lambda tup: tup[1], reverse=True)
unigrams_sorted_tups[:10]

[('God', 506299),
 ('Paul', 410813),
 ('church', 384135),
 ('Christian', 319700),
 ('new', 286785),
 ('Jesus', 284717),
 ('Christ', 267051),
 ('work', 210266),
 ('man', 196657),
 ('time', 192015)]

In [98]:
# Vocabulary words only, in descending frequency order.
vocabulary = [token for token, _freq in unigrams_sorted_tups]

In [99]:
# Build the gensim Dictionary from one pseudo-document holding the whole
# vocabulary, so every retained word receives an integer id. The ids do not
# follow the frequency order of `vocabulary` ('Christ' is the 7th most
# frequent word but is assigned id 2911).
dictionary = corpora.Dictionary([vocabulary])

In [100]:
# Look up the integer id assigned to a single token.
dictionary.token2id["Christ"]

2911

In [101]:
# Sanity check: the dictionary size should equal types_N_thres.
len(dictionary)

43817

In [102]:
# Persist the dictionary; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("../data/dictionary_main.pickle", "wb") as pickle_file:
    pickle.dump(dictionary, pickle_file)

In [103]:
# Tokens missing from the dictionary ("nonsenseword") are silently dropped
# from the bag-of-words result.
dictionary.doc2bow(["Jesus", "Christ", "nonsenseword"])

[(2911, 1), (7521, 1)]

In [104]:
# Split an example trigram into its tokens on whitespace.
trigram_string = "Christ Jesus Paul"
trigram_string_split = trigram_string.split()
print(trigram_string_split)

['Christ', 'Jesus', 'Paul']


In [116]:
# End-to-end check: clean an example trigram, then convert its tokens to a
# bag-of-words list of (token id, count) pairs.
trigram_string = "Christ Jesus rom23"
doc = dictionary.doc2bow(clean_trigram_string(trigram_string).split())
doc

[(2911, 1), (7521, 1), (12651, 1)]

In [117]:
# Map the ids back to tokens to confirm that "rom23" ended up as "Romans".
[(dictionary[tup[0]], tup[1]) for tup in doc]

[('Christ', 1), ('Jesus', 1), ('Romans', 1)]

In [118]:
# Repeated tokens are merged into one (id, count) entry: "Jesus" gets count 2.
dictionary.doc2bow(["Jesus", "Jesus", "Christ"])

[(2911, 1), (7521, 2)]

# Generate bag-of-words (BoW) data for each article and save the articles one by one

In [119]:
# Give every article key a running integer id, following the dict's own order.
article_ids_dict = {
    article_key: position
    for position, article_key in enumerate(trigramCount_cleaned_dict)
}

In [120]:
# Preview the first ten (article key, integer id) pairs.
list(article_ids_dict.items())[:10]

[('ark://27927/phx66812gq6', 0),
 ('ark://27927/pbd6fpf5fh', 1),
 ('ark://27927/phw1kd8s300', 2),
 ('ark://27927/phx64fptrwj', 3),
 ('ark://27927/phx64k1x5c2', 4),
 ('ark://27927/phx64fkrk6m', 5),
 ('http://www.jstor.org/stable/43052718', 6),
 ('ark://27927/phx68d6dm3t', 7),
 ('ark://27927/pbd934r3jr', 8),
 ('ark://27927/phx2t1wjwnt', 9)]

In [121]:
# Persist the article-key -> integer-id mapping; the context manager
# guarantees the file is flushed and closed even if pickling fails.
with open("../data/article_ids_dict.pickle", "wb") as pickle_file:
    pickle.dump(article_ids_dict, pickle_file)

In [122]:
%%time
trigramCount_bows = {}
for id, data in list(trigramCount_cleaned_dict.items()):
    data = dict([(clean_trigram_string(trigram), count) for trigram, count in data.items() if len(trigram.split()) > 1])
    data_bows = []
    for trigram, count in data.items():
        bow = dictionary.doc2bow(trigram.split())
        if len(bow) > 1:
            data_bows.extend([bow] * count)
    pickle.dump(data_bows, open("../data/large_files/article_docs/" + str(article_ids_dict[id]) + ".pickle", "wb"))

CPU times: user 13min 50s, sys: 4min 3s, total: 17min 54s
Wall time: 19min 54s
