In [14]:
import pandas as pd
import pickle
import numpy as np
from collections import Counter
from gensim import corpora
import re

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Open a session against the sciencedata.dk cloud storage.
s = sddk.cloudSession("sciencedata.dk")
# Establish the connection with Google Sheets: read the service-account key as a dict...
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
# ...turn the key data into service-account credentials...
credentials = service_account.Credentials.from_service_account_info(file_data)
# ...and authorise a gspread client with the spreadsheet-feeds and Drive scopes.
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Open the target spreadsheet by its URL.
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# Read the `jstor_df_v1` feather file into a DataFrame (path is relative to the notebook's directory).
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [15]:
# Load the pickled trigram counts. Later cells iterate this object as
# {article identifier: {trigram string: count}}.
# A `with` block guarantees the file handle is closed once loading finishes.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
with open("../data/large_files/trigramCount_cleaned_dict.pickle", "rb") as trigram_file:
    trigramCount_cleaned_dict = pickle.load(trigram_file)

In [16]:
# Two-word collocations (in the word orders listed here) mapped to the single
# underscore-joined token that replaces them. Variants are grouped by the
# merged token they share.
collocation_replacements = {
    variant: merged_token
    for merged_token, variants in [
        ("Greco_Roman", ("Greco Roman", "Graeco Roman")),
        ("Luke_Acts", ("Acts Luke", "Luke act")),
        ("Jewish_Christian", ("Christian Jewish", "Judeo Christian", "Judaeo Christian")),
        ("Paul", ("Paul s",)),
        ("Saint_Denis", ("Denis Saint",)),
        ("deutero_Pauline", ("Pauline deutero", "Deutero Pauline")),
        ("Anglo_Saxon", ("Saxon anglo",)),
        ("Murphy_Oconnor", ("Murphy Oconnor",)),
        ("Engberg_Pedersen", ("Engberg Pedersen",)),
    ]
    for variant in variants
}
# Collocation keys in insertion order.
collocations_keys = [*collocation_replacements]

In [17]:
# Case normalisation rules: each keyword is the spelling to look for,
# each value is the spelling that replaces it.
to_replace = dict(
    Century="century",
    christian="Christian",
    Biblical="biblical",
    Church="church",
    Faith="faith",
    Apostle="apostle",
    american="American",
    jewish="Jewish",
    roman="Roman",
    testament="Testament",
    Lord="lord",
)
# Rule keys in insertion order.
to_replace_keys = [*to_replace]

In [18]:
# Sanity check: apply one rule from `to_replace` to a sample string with re.sub.
key = "Church"
re.sub(key, to_replace[key], "Church Jesus")

'church Jesus'

In [19]:
def clean_trigram_string(trigram_string, replacements=None, collocations=None):
    """Normalise the spelling of a trigram string and merge known collocations.

    Two passes are applied, in this order:

    1. Every whole word equal to a key of ``replacements`` is replaced by the
       corresponding value (e.g. ``"Church"`` -> ``"church"``).
    2. For every two-word key of ``collocations``: if both words occur in the
       string as whole words (in any order, not necessarily adjacent), each
       occurrence of the second word is deleted and each occurrence of the
       first word is replaced by the merged token (e.g. ``"Murphy name
       Oconnor"`` -> ``"Murphy_Oconnor name "``). The whitespace around the
       deleted word is left in place, so callers should tokenise the result
       with ``str.split()``.

    Matching is done on whole words (regex ``\\b`` boundaries), not on raw
    substrings. With substring matching the ``"Paul s"`` rule removed every
    letter "s" from any trigram containing "Paul", ``"roman"`` rewrote
    "romance", and ``"Church"`` rewrote "Churchill". Because ``_`` counts as a
    word character, already merged tokens such as ``Greco_Roman`` are not
    matched again by later rules.

    Parameters
    ----------
    trigram_string : str
        Whitespace-separated words.
    replacements : dict, optional
        Word -> normalised word. Defaults to the module-level ``to_replace``.
    collocations : dict, optional
        "first second" -> merged token. Defaults to the module-level
        ``collocation_replacements``.

    Returns
    -------
    str
        The cleaned string.
    """
    if replacements is None:
        replacements = to_replace
    if collocations is None:
        collocations = collocation_replacements

    def as_whole_word(word):
        # re.escape keeps keys literal even if they contain regex metacharacters.
        return re.compile(r"\b" + re.escape(word) + r"\b")

    for key, normalized in replacements.items():
        # The cheap substring test is a necessary condition for a whole-word
        # match, so it filters out most keys before any regex work is done.
        if key in trigram_string:
            # A callable replacement inserts the value literally (no backslash escapes).
            trigram_string = as_whole_word(key).sub(lambda _match: normalized, trigram_string)

    for key, merged in collocations.items():
        key_split = key.split()
        first, second = key_split[0], key_split[1]
        if first in trigram_string and second in trigram_string:
            first_re, second_re = as_whole_word(first), as_whole_word(second)
            if first_re.search(trigram_string) and second_re.search(trigram_string):
                without_second = second_re.sub("", trigram_string)
                trigram_string = first_re.sub(lambda _match: merged, without_second)
    return trigram_string

In [20]:
# Example: the two words of a collocation are merged even when another word sits between them.
trigram_string = "Murphy name Oconnor"
clean_trigram_string(trigram_string)

'Murphy_Oconnor name '

# Preprocessing for embeddings etc.


In [21]:
# Load the pickled unigram table. Later cells use it as a mapping whose values
# are summed and compared against a frequency threshold.
# A `with` block guarantees the file handle is closed once loading finishes.
# NOTE: unpickling can execute arbitrary code, so only load files produced by this project.
with open("../data/large_files/unigrams_merged_cleaned.pickle", "rb") as unigrams_file:
    unigrams_merged_cleaned = pickle.load(unigrams_file)

In [22]:
# Number of entries (distinct unigram types) in the cleaned unigram table.
types_N = len(unigrams_merged_cleaned)
types_N

1161882

In [23]:
# Total number of tokens: the sum of the per-type counts.
tokens_N = sum(unigrams_merged_cleaned.values())
tokens_N

71944547

In [24]:
# Keep only the unigrams that occur at least `threshold` times.
threshold = 50
unigrams_merged_thresh = {
    unigram: frequency
    for unigram, frequency in unigrams_merged_cleaned.items()
    if frequency >= threshold
}
types_N_thres = len(unigrams_merged_thresh)
types_N_thres

44761

In [25]:
# Number of tokens still covered by the thresholded vocabulary.
tokens_N_thres = sum(unigrams_merged_thresh.values())
tokens_N_thres

68393530

In [26]:
# Percentage of types (first line) and of tokens (second line) retained by the threshold.
print(np.round(types_N_thres / types_N * 100, 2))
print(np.round(tokens_N_thres / tokens_N * 100, 2))

3.85
95.06


In [28]:
# Rank the thresholded unigrams from most to least frequent and show the top ten.
unigrams_sorted_tups = sorted(
    unigrams_merged_thresh.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
unigrams_sorted_tups[:10]

[('have', 1105466),
 ('God', 504403),
 ('do', 465122),
 ('Paul', 410214),
 ('church', 382062),
 ('see', 373909),
 ('Christian', 322291),
 ('other', 295237),
 ('new', 286753),
 ('Jesus', 284019)]

In [29]:
# NOTE(review): this cell repeats the preceding cell verbatim (same code, same output);
# one of the two can be removed.
unigrams_sorted_tups = sorted(list(unigrams_merged_thresh.items()), key = lambda tup: tup[1], reverse=True)
unigrams_sorted_tups[:10]

[('have', 1105466),
 ('God', 504403),
 ('do', 465122),
 ('Paul', 410214),
 ('church', 382062),
 ('see', 373909),
 ('Christian', 322291),
 ('other', 295237),
 ('new', 286753),
 ('Jesus', 284019)]

In [30]:
# Vocabulary: the unigram strings only, in descending frequency order.
vocabulary = [unigram for unigram, _frequency in unigrams_sorted_tups]

In [31]:
# Build a gensim Dictionary from the vocabulary, passed in as a single "document".
dictionary = corpora.Dictionary([vocabulary])

In [32]:
# Look up the integer id assigned to the token "Christ".
dictionary.token2id["Christ"]

3052

In [34]:
# Reverse lookup (id -> token); 3052 is the id shown for "Christ" in the previous output.
dictionary[3052]

'Christ'

In [35]:
# Number of tokens in the dictionary.
len(dictionary)

44761

In [36]:
# Persist the dictionary. The `with` block flushes and closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open("../data/dictionary_main.pickle", "wb") as dictionary_file:
    pickle.dump(dictionary, dictionary_file)

In [38]:
# doc2bow returns (token id, count) pairs; as the output shows, a token that is
# not in the dictionary ("nonsenseword") is dropped.
dictionary.doc2bow(["Jesus", "Christ", "nonsenseword"])

[(3052, 1), (7884, 1)]

# Generate bag-of-words (BoW) data for individual articles and save them one by one

In [40]:
# Give every article identifier a consecutive integer index (0, 1, 2, ...) in dict order.
article_ids_dict = {
    article_key: position
    for position, article_key in enumerate(trigramCount_cleaned_dict)
}

In [41]:
# Peek at the first ten (article identifier, integer index) pairs.
list(article_ids_dict.items())[:10]

[('ark://27927/phx66812gq6', 0),
 ('ark://27927/pbd6fpf5fh', 1),
 ('ark://27927/phw1kd8s300', 2),
 ('ark://27927/phx64fptrwj', 3),
 ('ark://27927/phx64k1x5c2', 4),
 ('ark://27927/phx64fkrk6m', 5),
 ('http://www.jstor.org/stable/43052718', 6),
 ('ark://27927/phx68d6dm3t', 7),
 ('ark://27927/pbd934r3jr', 8),
 ('ark://27927/phx2t1wjwnt', 9)]

In [42]:
# Persist the identifier -> index mapping. The `with` block flushes and closes
# the file deterministically instead of leaving the handle to the garbage collector.
with open("../data/article_ids_dict.pickle", "wb") as article_ids_file:
    pickle.dump(article_ids_dict, article_ids_file)

In [44]:
%%time
trigramCount_bows = {}
for id, data in list(trigramCount_cleaned_dict.items()):
    data = dict([(clean_trigram_string(trigram), count) for trigram, count in data.items() if len(trigram.split()) > 1])
    data_bows = []
    for trigram, count in data.items():
        bow = dictionary.doc2bow(trigram.split())
        if len(bow) > 1:
            data_bows.extend([bow] * count)
    pickle.dump(data_bows, open("../data/large_files/article_docs/" + str(article_ids_dict[id]) + ".pickle", "wb"))

CPU times: user 15min 8s, sys: 11min 36s, total: 26min 45s
Wall time: 31min 55s
