In [125]:
import pandas as pd
import pickle
import random
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy
import re

In [13]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# open a session against the sciencedata.dk cloud storage
s = sddk.cloudSession("sciencedata.dk")
# establish connection with Google Sheets:
# 1) read the service-account key (as a dict) from sciencedata.dk
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
# 2) build credentials from the key and authorize a gspread client with the Sheets feed and Drive scopes
credentials = service_account.Credentials.from_service_account_info(file_data)
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# 3) open the project spreadsheet by its URL
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [20]:
# load the per-document unigram counts; the context manager closes the file handle
# NOTE: pickle can execute arbitrary code on load - only load files produced by this project
with open("../data/large_files/unigramCount_dict.pickle", "rb") as f:
    unigramCount_dict = pickle.load(f)

In [136]:
# load the per-document bigram counts; the context manager closes the file handle
# NOTE: pickle can execute arbitrary code on load - only load files produced by this project
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [16]:
# load the per-document trigram counts; the context manager closes the file handle
# NOTE: pickle can execute arbitrary code on load - only load files produced by this project
with open("../data/large_files/trigramCount_dict.pickle", "rb") as f:
    trigramCount_dict = pickle.load(f)

# Generating most frequent ngrams for preprocessing

In [17]:
def merge_data_from_ids(ids, datadict):
    """Add up the count mappings stored under the given ids.

    Parameters:
        ids: iterable of keys present in `datadict`.
        datadict: mapping from id to a count mapping (anything `Counter.update` accepts).

    Returns:
        A `Counter` with the summed counts over all requested ids.

    Raises:
        KeyError: if an id is missing from `datadict`.
    """
    merged = Counter()
    for doc_id in ids:
        merged.update(datadict[doc_id])
    return merged


In [200]:
# draw a reproducible (seeded) random sample of 5000 document ids;
# the most frequent ngrams to preprocess are computed from this sample
random.seed(0)
ids = random.sample(list(unigramCount_dict), 5000)

In [None]:
# sum the per-document counts over the sampled ids, separately for each ngram size
# (each result is a Counter mapping ngram string -> total count in the sample)
data_unigrams = merge_data_from_ids(ids, unigramCount_dict)
data_bigrams = merge_data_from_ids(ids, bigramCount_dict)
data_trigrams = merge_data_from_ids(ids, trigramCount_dict)

# Most frequent unigrams to nlp docs

In [138]:
# load the `en_core_web_lg` spaCy pipeline and take its default English stop words
nlp = spacy.load('en_core_web_lg')
stop_words = nlp.Defaults.stop_words

In [139]:
# check what the raw unigrams look like, ordered from most to least frequent
data_unigrams_sorted = data_unigrams.most_common()
data_unigrams_sorted[:100]

[('the', 685375),
 ('of', 469302),
 ('and', 301990),
 ('to', 255776),
 ('in', 236293),
 ('a', 162034),
 ('is', 130518),
 ('that', 128544),
 ('as', 87930),
 ('for', 71446),
 ('The', 65300),
 ('with', 61132),
 ('his', 59222),
 ('by', 56837),
 ('was', 56632),
 ('not', 56560),
 ('be', 51834),
 ('on', 51290),
 ('this', 48676),
 ('he', 46988),
 ('it', 46724),
 ('from', 45904),
 ('are', 39980),
 ('which', 38964),
 ('or', 34802),
 ('have', 32562),
 ('an', 31941),
 ('but', 30530),
 ('their', 30193),
 ('who', 30147),
 ('at', 28609),
 ('they', 26319),
 ('I', 25802),
 ('one', 24324),
 ('In', 24115),
 ('were', 24076),
 ('.', 23377),
 ('has', 23247),
 ('had', 22137),
 ('we', 21984),
 ('all', 21824),
 ('God', 20702),
 ('also', 19900),
 ('its', 18719),
 ('more', 17119),
 ('will', 16627),
 ('been', 16608),
 ('Christian', 16249),
 ('would', 15640),
 ('Paul', 15371),
 ('This', 15191),
 ('these', 15057),
 ('what', 14940),
 ('other', 14579),
 ('only', 14382),
 ('into', 13888),
 ('He', 13273),
 ('so', 13237

In [140]:
# how many of the 100 most frequent unigrams are not stopwords (case-insensitive)?
sum(1 for unigram, _count in data_unigrams_sorted[:100] if unigram.lower() not in stop_words)

10

In [141]:
# how many unigrams appear more than 10 times?
sum(1 for _unigram, count in data_unigrams_sorted if count > 10)

51694

In [142]:
%%time
# parse every unigram that appears more than 10 times into a spaCy "Doc"
# and keep it together with its count (slow - see the timing below)
data_unigrams_sorted_nlp = [
    (unigram, {"doc" : nlp(unigram), "count" : count})
    for unigram, count in data_unigrams_sorted
    if count > 10
]

CPU times: user 2min 11s, sys: 2.27 s, total: 2min 13s
Wall time: 2min 15s


In [143]:
# peek at the first five (unigram, {"doc", "count"}) entries
data_unigrams_sorted_nlp[:5]

[('the', {'doc': the, 'count': 685375}),
 ('of', {'doc': of, 'count': 469302}),
 ('and', {'doc': and, 'count': 301990}),
 ('to', {'doc': to, 'count': 255776}),
 ('in', {'doc': in, 'count': 236293})]

In [144]:
# index the parsed unigrams by their surface string for direct lookup
data_unigrams_nlp_dict = dict(data_unigrams_sorted_nlp)

In [145]:
# save the unigram -> {"doc", "count"} mapping for future use,
# so the spaCy parsing above does not have to be repeated
with open("../data/large_files/data_unigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_unigrams_nlp_dict, f)

In [146]:
# remove intermediate data to free memory...
del data_unigrams
del data_unigrams_sorted
del data_unigrams_sorted_nlp

# Bigrams to nlp docs

In [147]:
# order the sampled bigrams from most to least frequent
# and count how many of them appear more than 10 times
data_bigrams_sorted = data_bigrams.most_common()
sum(1 for _bigram, count in data_bigrams_sorted if count > 10)

100053

In [148]:
# the ten most frequent bigrams in the sample
data_bigrams_sorted[:10]

[('of the', 129296),
 ('in the', 63799),
 ('to the', 43688),
 ('and the', 29429),
 ('that the', 18916),
 ('on the', 18678),
 ('to be', 16996),
 ('for the', 16229),
 ('with the', 15980),
 ('from the', 14888)]

In [149]:
%%time
# parse every bigram that appears more than 10 times into a spaCy "Doc"
data_bigrams_sorted_nlp = [
    (bigram, {"doc" : nlp(bigram), "count" : count})
    for bigram, count in data_bigrams_sorted
    if count > 10
]

CPU times: user 4min 4s, sys: 3.19 s, total: 4min 7s
Wall time: 4min 9s


In [152]:
# index the parsed bigrams by their surface string for direct lookup
data_bigrams_nlp_dict = dict(data_bigrams_sorted_nlp)

In [153]:
# save the bigram -> {"doc", "count"} mapping for future use
with open("../data/large_files/data_bigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_bigrams_nlp_dict, f)

In [154]:
# remove intermediate data to free memory
# NOTE(review): unlike the unigram cleanup cell, `data_bigrams_sorted` is kept here - confirm this is intentional
del data_bigrams
del data_bigrams_sorted_nlp

# Trigrams to nlp docs

In [155]:
# order the sampled trigrams from most to least frequent
# and count how many of them appear more than 10 times
data_trigrams_sorted = data_trigrams.most_common()
sum(1 for _trigram, count in data_trigrams_sorted if count > 10)

56376

In [156]:
%%time
# parse every trigram that appears more than 10 times into a spaCy "Doc"
data_trigrams_sorted_nlp = [
    (trigram, {"doc" : nlp(trigram), "count" : count})
    for trigram, count in data_trigrams_sorted
    if count > 10
]

CPU times: user 2min 21s, sys: 2.15 s, total: 2min 23s
Wall time: 2min 25s


In [157]:
# index the parsed trigrams by their surface string for direct lookup
data_trigrams_nlp_dict = dict(data_trigrams_sorted_nlp)

In [158]:
# save the trigram -> {"doc", "count"} mapping for future use
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams_nlp_dict, f)

# Applying preprocessing to individual documents

In [183]:
# part-of-speech tags (compared against spaCy's `token.pos_`) of the tokens kept during lemmatization
postags = ["PROPN", "NOUN", "VERB", "ADJ"]

def clean_token(token):
    """Normalize a single token string.

    Removes every non-word character (anything matched by the regex `\\W`)
    and lower-cases all characters except the first one, whose case is kept.

    Parameters:
        token: the string to clean.

    Returns:
        The cleaned string; an empty string when `token` contains no word
        characters at all (e.g. "--" or "").
    """
    # raw string: "\W" in a plain literal is an invalid escape sequence
    token = re.sub(r"\W", "", token)
    # slice instead of indexing so an empty result does not raise IndexError
    return token[:1] + token[1:].lower()

def doc_to_lemmata(doc, postags):
    """Turn a parsed document into a normalized, order-independent lemma string.

    Parameters:
        doc: iterable of tokens exposing `lemma_` and `pos_` (e.g. a spaCy Doc).
        postags: collection of `pos_` values; tokens with other tags are dropped.

    Returns:
        The cleaned lemmata of the kept tokens, sorted and joined by single
        spaces; "" when no token qualifies.
    """
    lemmata = [t.lemma_ for t in doc if t.pos_ in postags]
    # Keep only lemmata with at least one word character. Checking this
    # *before* cleaning means a punctuation-only lemma (e.g. "--") is skipped
    # instead of being cleaned to an empty string, which could crash
    # clean_token or leave stray empty items in the joined result.
    cleaned = [clean_token(lemma) for lemma in lemmata if re.search(r"\w", lemma)]
    return " ".join(sorted(cleaned))


# lookup table: ngram type name -> dict of pre-parsed ngrams
# (each maps an ngram string to {"doc": spaCy Doc, "count": int})
ngram_nlp_dicts = {
    "unigram" : data_unigrams_nlp_dict,
    "bigram" : data_bigrams_nlp_dict,
    "trigram" : data_trigrams_nlp_dict
}

def ngram_to_lemmata(ngram, ngram_type="trigram"):
    """Return the normalized lemma string for one ngram.

    The pre-parsed Doc from `ngram_nlp_dicts[ngram_type]` is used when the
    ngram is found there; otherwise the ngram is parsed with `nlp` on the fly.

    Parameters:
        ngram: the ngram string.
        ngram_type: key into `ngram_nlp_dicts` ("unigram", "bigram" or "trigram").

    Returns:
        The output of `doc_to_lemmata` for the ngram, or "" when the ngram
        cannot be processed (best effort).

    Raises:
        KeyError: if `ngram_type` is not a key of `ngram_nlp_dicts`.
    """
    ngram_nlp_dict = ngram_nlp_dicts[ngram_type]
    try:
        cached = ngram_nlp_dict.get(ngram)
        # cache hit -> reuse the stored Doc; cache miss -> parse now
        doc = cached["doc"] if cached is not None else nlp(ngram)
        return doc_to_lemmata(doc, postags)
    except Exception:
        # Best effort: an ngram that cannot be processed contributes nothing.
        # `Exception` (not a bare `except`) so that KeyboardInterrupt and
        # SystemExit still stop a long-running loop.
        return ""

def article_data_to_lemmata(ngrams_dict, ngram_type="trigram"):
    """Aggregate the ngram counts of one article by their lemma string.

    Parameters:
        ngrams_dict: mapping from ngram string to its count.
        ngram_type: passed through to `ngram_to_lemmata`.

    Returns:
        A `Counter` mapping each lemma string to the summed counts of all
        ngrams reduced to it. Lemma strings of length 0 or 1 are dropped.
    """
    lemmata_counts = Counter()
    for ngram, count in ngrams_dict.items():
        lemmata = ngram_to_lemmata(ngram, ngram_type=ngram_type)
        if len(lemmata) > 1:
            lemmata_counts[lemmata] += count
    return lemmata_counts


In [187]:
%%time
# try the preprocessing on the trigram counts of a single article (position 4000)
# and show its 100 most frequent lemma strings
test_ngrams_dict = list(trigramCount_dict.values())[4000]
test_output = article_data_to_lemmata(test_ngrams_dict, ngram_type="trigram")
test_output.most_common()[:100]

CPU times: user 2.73 s, sys: 117 ms, total: 2.85 s
Wall time: 2.99 s


[('Paul', 11),
 ('paraphrase', 9),
 ('deal', 9),
 ('debate', 8),
 ('text', 8),
 ('biblical text', 7),
 ('meaning', 7),
 ('gap', 7),
 ('give', 6),
 ('section', 6),
 ('argument', 6),
 ('New Testament', 6),
 ('make', 5),
 ('historical', 5),
 ('translation', 5),
 ('constitute', 5),
 ('relevant', 5),
 ('chapter', 5),
 ('Pauline', 5),
 ('occasion', 5),
 ('letter', 5),
 ('center', 5),
 ('book', 4),
 ('context', 4),
 ('biblical material', 4),
 ('Pauline epistle', 4),
 ('guide', 4),
 ('relevance', 4),
 ('clear make', 4),
 ('hand', 4),
 ('article', 4),
 ('nature', 4),
 ('work', 4),
 ('number', 4),
 ('pp', 4),
 ('relationship', 4),
 ('follow', 4),
 ('New', 4),
 ('wish', 4),
 ('address', 4),
 ('serve', 4),
 ('know', 4),
 ('situation', 4),
 ('deliver', 3),
 ('treatment', 3),
 ('treat', 3),
 ('difficult', 3),
 ('act', 3),
 ('literal', 3),
 ('discussion', 3),
 ('translate', 3),
 ('Paul thought', 3),
 ('add', 3),
 ('expand', 3),
 ('write', 3),
 ('Bruce work', 3),
 ('understanding', 3),
 ('expansion', 

In [188]:
%%time
# small test...


# lemmatize the trigram counts of the first 10 entries -> list of (id, Counter) tuples
# (the loop variables `k` and `v` remain defined in the notebook namespace afterwards)
cleanedTrigrams_tups = []
for k, v in list(trigramCount_dict.items())[:10]:
    cleanedTrigrams_tups.append((k, article_data_to_lemmata(v)))

In [191]:
# inspect the first (id, lemma Counter) pair
cleanedTrigrams_tups[0]

('ark://27927/phx66812gq6',
 Counter({'F H mathews': 1,
          'Jesus': 24,
          'life vicissitude': 2,
          'preparation': 3,
          'age': 10,
          'death man': 2,
          'ambition uncleanness': 1,
          'lord': 2,
          'Bd Crieff Hepburn': 1,
          'knotty point': 2,
          'easy': 10,
          'act': 12,
          'story': 8,
          'compose': 2,
          'do thing': 2,
          'Dr Hunter point': 1,
          'Somme': 4,
          'consequence obedience': 1,
          'human': 7,
          'field': 3,
          'allow take': 1,
          'Sir speaker': 1,
          'hear': 15,
          'Ba Brown Robert': 1,
          'Kingdom supernatural': 2,
          'sentry': 3,
          'share truth': 1,
          'God': 62,
          'go': 27,
          'take': 12,
          'frightening world': 2,
          'world': 18,
          'clean': 1,
          'Jesus react': 1,
          'convenient moment': 1,
          'generous': 3,
          'avoid

In [193]:
# for each of the ten processed articles, print its ten most frequent
# lemma strings that consist of more than one word
for n in range(10):
    print(
        sorted(
            (
                (lemmata, count)
                for lemmata, count in cleanedTrigrams_tups[n][1].items()
                if len(lemmata.split()) > 1
            ),
            key=lambda pair: pair[1],
            reverse=True,
        )[:10]
    )

[('reverent silence', 8), ('ask question', 8), ('New Testament', 7), ('God will', 6), ('do man', 6), ('man ordinary', 5), ('only way', 5), ('God Lord', 5), ('half inch', 4), ('man wise', 4)]
[('Jesus tradition', 31), ('Press University', 26), ('Books Cambridge Online', 26), ('Cambridge Press University', 26), ('Jesus accord', 22), ('Jesus saying', 20), ('Paul letter', 20), ('Jesus say', 16), ('saying traditional', 15), ('Jesus Paul', 15)]
[('Judaism rabbinic', 18), ('Paul genius', 16), ('Sandmel genius', 15), ('Paul Schoeps', 15), ('Judaism palestinian', 15), ('Paul have', 13), ('Genius Sandmel', 13), ('Paul Sanders', 12), ('Davies Paul', 12), ('Paul Pharisaism', 11)]
[('Church true', 14), ('Development Historical', 14), ('Apocalypse Winter', 12), ('Bible Scofield', 11), ('Christ Jesus', 10), ('Dispensationalism Premillennial', 10), ('Holy Spirit', 7), ('dispensational system', 7), ('Premillennial development', 7), ('dichotomous entity', 6)]
[('Holy Spirit', 43), ('baptism infant', 21)

In [195]:
%%time
# small test: the same as the explicit loop, written as a list comprehension.
# (Calling `.append` inside the comprehension would rebind the name to a list
# of `None` values, because `list.append` returns None.)
cleanedTrigrams_tups = [(k, article_data_to_lemmata(v)) for k, v in list(trigramCount_dict.items())[:3]]

CPU times: user 46.3 s, sys: 315 ms, total: 46.6 s
Wall time: 47.2 s


In [199]:
%%time
# timing run on the first 3 entries -> list of (id, lemma Counter) tuples
# (the loop variables `k` and `v` remain defined in the notebook namespace afterwards)
cleanedTrigrams_tups = []
for k, v in list(trigramCount_dict.items())[:3]:
    cleanedTrigrams_tups.append((k, article_data_to_lemmata(v)))

CPU times: user 52.6 s, sys: 1.24 s, total: 53.9 s
Wall time: 55.1 s


In [None]:

# scratch checks of the lemmatization helpers on a single trigram
doc = nlp("our Lord Jesus")
doc_to_lemmata(doc, postags)
# the helper is defined as `ngram_to_lemmata` (its default ngram_type is "trigram")
ngram_to_lemmata("our Lord Jesus")
# NOTE(review): relies on `v`, the loop variable left over from a previous cell, and raises
# KeyError for trigrams missing from the preprocessed dict - confirm the intent
[data_trigrams_nlp_dict[el[0]]["doc"] for el in v.items()]

In [39]:
# number of entries (one per document id) in the trigram data
len(trigramCount_dict)

14103

In [40]:
# inspect one raw (id, {trigram: count}) entry
list(trigramCount_dict.items())[1000]

('http://www.jstor.org/stable/1453640',
 {'who maintain the': 1,
  'un- translatable. The': 1,
  'des personnes plus': 1,
  'same caves at': 1,
  'the Rule, "But': 1,
  'The archaeologists are': 1,
  'Zion." The publicity': 1,
  'written by Bar': 3,
  'pp. I96-2II; Jan.': 1,
  'I96I, pp. 269-272.': 1,
  'teacher of righteousness.': 1,
  'stand regarding the': 1,
  'by Claudius Lysis': 1,
  'are ancient does': 1,
  'The Jewish Chronicle,': 1,
  'First I shall': 1,
  'were righteous but': 1,
  'nfl:3 ` I': 1,
  'Kokba discovered in': 1,
  'time sought to': 1,
  'is a very': 1,
  'discovery of this': 1,
  'the Devil is': 1,
  'tlhe existence of': 1,
  'The sages who': 1,
  'of this movement': 1,
  'The members of': 1,
  'and placed the': 1,
  'copied texts, which': 1,
  'men. Were there': 1,
  'that it is': 1,
  'Israel Exploration Society,': 1,
  'independent state. Some': 1,
  'refute my arguments-they': 1,
  'translates the phrase': 1,
  'sections in the': 2,
  'said, "If the': 1,
  'n

In [None]:
# the first 10 (id, trigram counts) pairs as a list of tuples
trigramCount_tups_of_tups = list(trigramCount_dict.items())[:10]

# Explorations

In [None]:
def get_tops(col, matchstring, n=10):
    """Return the `n` most frequent non-stopword unigrams for a selection of documents.

    Parameters:
        col, matchstring: passed unchanged to `ids_from_colvals`, which selects
            the document ids (that helper is defined outside this section).
        n: number of (unigram, count) pairs to return.

    Returns:
        A list of at most `n` (unigram, count) tuples, most frequent first.
        Stop words (compared case-insensitively) and the tokens ".", "-" and
        '"' are excluded.
    """
    ids = ids_from_colvals(col, matchstring)
    c = merge_data_from_ids(ids, unigramCount_dict)
    # build the exclusion set once; rebuilding a list for every unigram and
    # scanning it linearly is needlessly slow on large vocabularies
    excluded = set(stop_words) | {".", "-", "\""}
    c_tups = [el for el in c.items() if el[0].lower() not in excluded]
    return sorted(c_tups, key=lambda kv: kv[1], reverse=True)[:n]

In [None]:
# top 20 unigrams for the selection ("publicationYear", "==1951")
# presumably documents published in 1951 - depends on how ids_from_colvals interprets the match string
get_tops("publicationYear", "==1951", 20)

In [None]:
# top 10 (default n) unigrams for the selection ("publicationYear", ".between(1950,1959)")
# presumably the 1950s - depends on how ids_from_colvals interprets the match string
get_tops("publicationYear", ".between(1950,1959)")

In [None]:
# preview the "start,end" year strings for each decade of the 20th century (1900,1909 ... 1990,1999)
for decade_n in range(0,10):
    print("19{0}0,19{0}9".format(str(decade_n)))

In [None]:
# print the top 20 unigrams for every decade of the 20th century
for decade_n in range(0,10):
    # "start,end" years of the decade, inserted into a ".between(start,end)" match string
    decade = "19{0}0,19{0}9".format(str(decade_n))
    print(decade, get_tops("publicationYear", ".between({0})".format(decade), 20))


In [None]:
# (re)load the per-document bigram counts; the context manager closes the file handle
# NOTE: pickle can execute arbitrary code on load - only load files produced by this project
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [None]:
def get_top_bigrams(col, matchstring, n=10):
    """Return the `n` most frequent bigrams for a selection of documents.

    Parameters:
        col, matchstring: passed unchanged to `ids_from_colvals`, which selects
            the document ids (that helper is defined outside this section).
        n: number of (bigram, count) pairs to return.

    Returns:
        A list of at most `n` (bigram, count) tuples, most frequent first.
        No stop-word filtering is applied.
    """
    matching_ids = ids_from_colvals(col, matchstring)
    merged = merge_data_from_ids(matching_ids, bigramCount_dict)
    return merged.most_common()[:n]

In [None]:
# print the top 10 bigrams for every decade of the 20th century
for decade_n in range(0,10):
    # "start,end" years of the decade, inserted into a ".between(start,end)" match string
    decade = "19{0}0,19{0}9".format(str(decade_n))
    print(decade, get_top_bigrams("publicationYear", ".between({0})".format(decade), 10))