In [1]:
import pandas as pd
import pickle
import numpy as np
from collections import Counter
from gensim import corpora
import re



In [2]:
# Article-level table stored as Feather.
# NOTE(review): `jstor_df` is not referenced again in the visible cells — confirm it is still needed.
jstor_df = pd.read_feather(path="../data/large_files/jstor_df_v1.feather")

In [3]:
# Load the per-article trigram counts. Later cells iterate it as
# {article id: {trigram string: count}}.
# The context manager closes the file handle as soon as loading finishes.
# SECURITY: pickle.load executes arbitrary code from the file — only load pickles produced by this project.
with open("../data/large_files/trigramCount_cleaned_dict.pickle", "rb") as pickle_file:
    trigramCount_cleaned_dict = pickle.load(pickle_file)

In [4]:
# Two-word collocations and the single underscore-joined token that stands in for them.
# Each key is split on whitespace by `clean_trigram_string`, which only checks that both
# words occur in a trigram, so the order of the two words inside a key is irrelevant.
collocation_replacements = dict([
    ("Greco Roman", "Greco_Roman"),
    ("Graeco Roman", "Greco_Roman"),
    ("Acts Luke", "Luke_Acts"),
    ("Luke act", "Luke_Acts"),
    ("Christian Jewish", "Jewish_Christian"),
    ("Judeo Christian", "Jewish_Christian"),
    ("Judaeo Christian", "Jewish_Christian"),
    ("Paul s", "Paul"),
    ("Denis Saint", "Saint_Denis"),
    ("Pauline deutero", "deutero_Pauline"),
    ("Deutero Pauline", "deutero_Pauline"),
    ("Saxon anglo", "Anglo_Saxon"),
    ("Murphy Oconnor", "Murphy_Oconnor"),
    ("Engberg Pedersen", "Engberg_Pedersen"),
])
# Keys in insertion order; this is the order in which the collocations are applied.
collocations_keys = list(collocation_replacements)

In [5]:
# Case normalisation table: spelling found in the trigrams -> spelling to use instead.
to_replace = {
    "Century": "century",
    "christian": "Christian",
    "Biblical": "biblical",
    "Church": "church",
    "Faith": "faith",
    "Apostle": "apostle",
    "american": "American",
    "jewish": "Jewish",
    "roman": "Roman",
    "testament": "Testament",
    "Lord": "lord",
}
# Keys in insertion order; this is the order in which the replacements are applied.
to_replace_keys = list(to_replace)

In [6]:
# Quick check of a single case-normalisation rule on a two-word sample.
key = "Church"
re.sub(pattern=key, repl=to_replace[key], string="Church Jesus")

'church Jesus'

In [7]:
def clean_trigram_string(trigram_string, replacements=None, collocations=None):
    """Normalise word casing and merge known collocations in a trigram string.

    Parameters
    ----------
    trigram_string : str
        Whitespace-separated words of one trigram.
    replacements : dict, optional
        Maps a spelling to the spelling that replaces it. Defaults to the
        module-level ``to_replace`` table.
    collocations : dict, optional
        Maps a two-word key (``"first second"``) to the merged token. Defaults
        to the module-level ``collocation_replacements`` table.

    Returns
    -------
    str
        The cleaned string. When a collocation fires, the second word is
        deleted and the first is replaced by the merged token; the whitespace
        around the deleted word is left in place, so callers should tokenise
        the result with ``str.split()``.

    Notes
    -----
    A collocation only fires when BOTH of its words occur as whole words.
    Matching on raw substrings made the key ``"Paul s"`` fire on any trigram
    containing "Paul" and the letter "s", deleting every "s" in it
    ("Paul Jesus Christ" -> "Paul Jeu Chrit"); ``"Luke act"`` similarly cut
    "act" out of "character".
    """
    if replacements is None:
        replacements = to_replace
    if collocations is None:
        collocations = collocation_replacements

    def whole_word(word):
        # Lookarounds instead of \b so the pattern is also correct for words that
        # start or end with a non-word character. `re` caches compiled patterns,
        # so building them on every call is cheap.
        return re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)")

    # Step 1: case normalisation.
    # NOTE(review): this step is deliberately left substring-based (it also rewrites
    # the key inside longer words, e.g. "Churches" -> "churches") — confirm that is intended.
    for old, new in replacements.items():
        if old in trigram_string:
            trigram_string = re.sub(old, new, trigram_string)

    # Step 2: collocation merging, on whole words only.
    for pair, merged in collocations.items():
        parts = pair.split()
        first, second = parts[0], parts[1]
        # Cheap substring pre-filter before running the regexes.
        if first not in trigram_string or second not in trigram_string:
            continue
        first_re, second_re = whole_word(first), whole_word(second)
        if first_re.search(trigram_string) and second_re.search(trigram_string):
            without_second = second_re.sub("", trigram_string)
            # A function replacement keeps `merged` literal (no backslash escapes).
            trigram_string = first_re.sub(lambda _match, merged=merged: merged, without_second)
    return trigram_string

In [8]:
# Check collocation merging when the two words are not adjacent: the second word is
# removed and the first is replaced by the merged token, which leaves a trailing space.
trigram_string = "Murphy name Oconnor"
clean_trigram_string(trigram_string)

'Murphy_Oconnor name '

# Preprocessing for embeddings etc.


In [9]:
# Load the corpus-wide unigram counts; later cells use it as a {word: count} mapping.
# The context manager closes the file handle as soon as loading finishes.
# SECURITY: pickle.load executes arbitrary code from the file — only load pickles produced by this project.
with open("../data/large_files/unigrams_merged_cleaned.pickle", "rb") as pickle_file:
    unigrams_merged_cleaned = pickle.load(pickle_file)

In [10]:
# Number of entries (distinct unigrams, i.e. types) in the unigram count mapping.
types_N = len(unigrams_merged_cleaned)
types_N

1161580

In [11]:
# Total number of tokens: the sum of all unigram counts.
tokens_N = sum(unigrams_merged_cleaned.values())
tokens_N

66248485

In [12]:
# Keep only the unigrams that occur at least `threshold` times.
threshold = 50
unigrams_merged_thresh = {
    word: freq
    for word, freq in unigrams_merged_cleaned.items()
    if freq >= threshold
}
types_N_thres = len(unigrams_merged_thresh)
types_N_thres

44592

In [13]:
# Number of tokens still covered after applying the frequency threshold.
tokens_N_thres = sum(unigrams_merged_thresh.values())
tokens_N_thres

62698922

In [14]:
# Percentage of types, then of tokens, that survive the threshold (one value per line).
print(
    np.round(types_N_thres / types_N * 100, 2),
    np.round(tokens_N_thres / tokens_N * 100, 2),
    sep="\n",
)

3.84
94.64


In [15]:
# (word, count) pairs ordered from most to least frequent; ties keep their original order.
unigrams_sorted_tups = sorted(
    unigrams_merged_thresh.items(),
    key=lambda item: item[1],
    reverse=True,
)
unigrams_sorted_tups[:10]

[('God', 504403),
 ('Paul', 410214),
 ('church', 382062),
 ('Christian', 322291),
 ('new', 286753),
 ('Jesus', 284019),
 ('Christ', 271838),
 ('work', 209831),
 ('man', 196322),
 ('time', 191191)]

In [16]:
# `unigrams_sorted_tups` is already built by the previous cell with the exact same
# expression, so the redundant re-sort of the whole vocabulary is dropped here and
# the cell only displays the ten most frequent unigrams.
unigrams_sorted_tups[:10]

[('God', 504403),
 ('Paul', 410214),
 ('church', 382062),
 ('Christian', 322291),
 ('new', 286753),
 ('Jesus', 284019),
 ('Christ', 271838),
 ('work', 209831),
 ('man', 196322),
 ('time', 191191)]

In [17]:
# Words only, still ordered by descending frequency.
vocabulary = [word for word, _count in unigrams_sorted_tups]

In [18]:
# Build the gensim token <-> id mapping from the vocabulary, passed as one single "document".
dictionary = corpora.Dictionary(documents=[vocabulary])

In [19]:
# Look up the integer id the dictionary assigned to a known token.
dictionary.token2id["Christ"]

3052

In [20]:
# Round-trip check id -> token. The id is looked up rather than hard-coded, because
# the ids change whenever the vocabulary (data or threshold) changes.
dictionary[dictionary.token2id["Christ"]]

'Christ'

In [21]:
# Number of tokens in the dictionary.
len(dictionary)

44592

In [22]:
# Persist the dictionary. The context manager guarantees the file is flushed and closed.
with open("../data/dictionary_main.pickle", "wb") as pickle_file:
    pickle.dump(dictionary, pickle_file)

In [23]:
# Convert a token list to (token id, count) pairs; per the recorded output, the token
# that is not in the dictionary ("nonsenseword") produces no entry.
dictionary.doc2bow(["Jesus", "Christ", "nonsenseword"])

[(3052, 1), (7884, 1)]

# Generate bag-of-words (BoW) data for individual articles and save them one by one

In [24]:
# Give every article id a consecutive integer (0, 1, 2, ...) in the mapping's iteration order.
article_ids_dict = {
    article_key: position
    for position, article_key in enumerate(trigramCount_cleaned_dict.keys())
}

In [25]:
# Preview the first ten (article id, integer index) pairs.
list(article_ids_dict.items())[:10]

[('ark://27927/phx66812gq6', 0),
 ('ark://27927/pbd6fpf5fh', 1),
 ('ark://27927/phw1kd8s300', 2),
 ('ark://27927/phx64fptrwj', 3),
 ('ark://27927/phx64k1x5c2', 4),
 ('ark://27927/phx64fkrk6m', 5),
 ('http://www.jstor.org/stable/43052718', 6),
 ('ark://27927/phx68d6dm3t', 7),
 ('ark://27927/pbd934r3jr', 8),
 ('ark://27927/phx2t1wjwnt', 9)]

In [26]:
# Persist the article id -> integer index mapping. The context manager guarantees the
# file is flushed and closed.
with open("../data/article_ids_dict.pickle", "wb") as pickle_file:
    pickle.dump(article_ids_dict, pickle_file)

In [27]:
%%time
# For every article: clean its trigrams, convert each one to a bag-of-words and write
# the resulting list to "<article index>.pickle" in the article_docs directory.
# NOTE(review): `trigramCount_bows` is never filled in this cell; kept in case later cells use it.
trigramCount_bows = {}
for article_id, trigram_counts in trigramCount_cleaned_dict.items():
    # Different raw trigrams can clean to the same string (e.g. after case
    # normalisation), so their counts are summed. Building a plain dict from the
    # pairs kept only the last count and silently dropped the others.
    cleaned_counts = Counter()
    for trigram, count in trigram_counts.items():
        if len(trigram.split()) > 1:  # skip entries with fewer than two words
            cleaned_counts[clean_trigram_string(trigram)] += count
    data_bows = []
    for trigram, count in cleaned_counts.items():
        bow = dictionary.doc2bow(trigram.split())
        # Keep a trigram only if at least two distinct dictionary tokens remain.
        if len(bow) > 1:
            data_bows.extend([bow] * count)
    out_path = "../data/large_files/article_docs/" + str(article_ids_dict[article_id]) + ".pickle"
    # One file per article: close each handle right away instead of leaking one per iteration.
    with open(out_path, "wb") as out_file:
        pickle.dump(data_bows, out_file)

CPU times: user 12min 36s, sys: 3min 29s, total: 16min 5s
Wall time: 18min 10s
