In [12]:
import pandas as pd
import pickle
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy

In [13]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Open a session against the sciencedata.dk cloud storage.
s = sddk.cloudSession("sciencedata.dk")
# Establish the connection with Google Sheets.
# The service-account key is read from sciencedata.dk as a dict; alternatively it
# can be loaded from local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict")
# Build google-auth credentials from the service-account info.
credentials = service_account.Credentials.from_service_account_info(file_data)
# Authorise a gspread client with the Sheets feed and Drive scopes.
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Open the project spreadsheet by its URL.
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [14]:
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [15]:
jstor_df.head(5)

Unnamed: 0,creator,datePublished,docType,doi,id,identifier,isPartOf,issueNumber,keyphrase,language,outputFormat,pageCount,pageEnd,pageStart,pagination,provider,publicationYear,publisher,sequence,tdmCategory,title,url,volumeNumber,wordCount,docSubType,sourceCategory,subTitle,hasPartTitle
0,[],1959-10-01,article,10.1177/001452465907100107,ark://27927/phx66812gq6,"[{'name': 'doi', 'value': '10.1177/00145246590...",Expository Times,1.0,"[omnipotent reigneth, god omnipotent, silence,...",[eng],"[unigram, bigram, trigram]",8.0,31,24,pp. 24-31,portico,1959,SAGE Publications,7.0,"[Religion - Theology, Religion - Spiritual bel...",In the Study,http://doi.org/10.1177/001452465907100107,71.0,7382,,,,
1,[Victor Paul Furnish],2009-01-01,chapter,10.1017/CBO9780511621321.003,ark://27927/pbd6fpf5fh,"[{'name': 'isbn', 'value': '9780511621321'}, {...",Jesus According to PaulJesus According to Paul,,"[saying, books online, jesus tradition, pauls ...",[eng],"[unigram, bigram, trigram]",26.0,65,40,40-65,portico,2009,Cambridge University Press,5.0,"[History - Historical methodology, Religion - ...",3 Sayings of Jesus in Paul's Letters,https://doi.org/10.1017/CBO9780511621321.003,,8577,,,,
2,[Leander E. Keck],2015-01-01,chapter,,ark://27927/phw1kd8s300,[],Christ&#39;s First Theologian,,"[pharisaism, sandmel genius, judaism, rabbinic...",[eng],"[unigram, bigram, trigram]",14.0,42,29,29-42,portico,2015,Baylor University Press,8.0,"[History - Historical methodology, Religion - ...",3. The Quest for Pauls Pharisaism,,,5354,,,,
3,[LeAnn Snow Flesher],2009-02-01,article,10.1177/003463730910600105,ark://27927/phx64fptrwj,"[{'name': 'doi', 'value': '10.1177/00346373091...",Review & Expositor: An International Baptist J...,1.0,"[scofield, premillennial, premillennial dispen...",[eng],"[unigram, bigram, trigram]",11.0,45,35,pp. 35-45,portico,2009,SAGE Publications,5.0,"[Religion - Theology, Religion - Spiritual bel...",The Historical Development of Premillennial Di...,http://doi.org/10.1177/003463730910600105,106.0,3614,,,,
4,[A. Daunton-Fear],1995-07-01,article,10.1177/0040571X9509800404,ark://27927/phx64k1x5c2,"[{'name': 'doi', 'value': '10.1177/0040571X950...",Theology,784.0,"[baptism, holy spirit, communion, infant bapti...",[eng],"[unigram, bigram, trigram]",10.0,282,273,273-282,portico,1995,SAGE Publications,4.0,[Religion - Spiritual belief systems],Resisting the Tide Christian Initiation and Co...,http://doi.org/10.1177/0040571X9509800404,98.0,4323,,,,


In [20]:
# Load the per-document unigram counts (keyed by document id).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load pickles you trust.
with open("../data/large_files/unigramCount_dict.pickle", "rb") as f:
    unigramCount_dict = pickle.load(f)

In [None]:
# Load the per-document bigram counts (keyed by document id).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load pickles you trust.
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [16]:
# Load the per-document trigram counts (document id -> {trigram: count}).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load pickles you trust.
with open("../data/large_files/trigramCount_dict.pickle", "rb") as f:
    trigramCount_dict = pickle.load(f)

In [17]:
def ids_from_colvals(col, matchstring):
    """Return the ``id`` values of the ``jstor_df`` rows selected by a condition.

    Parameters
    ----------
    col : str
        Name of the ``jstor_df`` column the condition is applied to.
    matchstring : str
        Python source appended directly to the column expression, e.g.
        ``"==1951"`` or ``".between(1950,1959)"``. The result is used as the
        row selector for ``jstor_df`` (typically a boolean mask).

    Returns
    -------
    list
        Values of the ``id`` column for the selected rows.
    """
    # SECURITY: `matchstring` is executed with eval(); never pass untrusted input.
    # Only the condition is evaluated as code. The column itself is looked up by
    # name, so column names containing quotes no longer break the expression.
    mask = eval("_column" + matchstring, globals(), {"_column": jstor_df[col]})
    return jstor_df[mask]["id"].tolist()

def merge_data_from_ids(ids, datadict):
    """Sum the per-document count mappings of the given documents.

    Parameters
    ----------
    ids : iterable
        Document ids; each must be a key of ``datadict`` (a missing key raises
        ``KeyError``).
    datadict : dict
        Maps a document id to a mapping of item -> count.

    Returns
    -------
    collections.Counter
        Counts accumulated over all listed documents.
    """
    merged = Counter()
    for doc_id in ids:
        merged.update(datadict[doc_id])
    return merged


# Cleaning the texts

In [18]:
ids = jstor_df.sample(1000, random_state=0)["id"].tolist()

In [21]:
data_unigrams = merge_data_from_ids(ids, unigramCount_dict)
data_trigrams = merge_data_from_ids(ids, trigramCount_dict)

# Unigrams to nlp docs

In [25]:
#english stopwords...
nlp = spacy.load('en_core_web_lg')
stop_words = nlp.Defaults.stop_words

In [26]:
# Rank the merged unigram counts from most to least frequent and show the top 100.
data_unigrams_sorted = data_unigrams.most_common()
data_unigrams_sorted[:100]

[('the', 646933),
 ('of', 439108),
 ('and', 285408),
 ('to', 241632),
 ('in', 222439),
 ('a', 151320),
 ('is', 122360),
 ('that', 121717),
 ('as', 81155),
 ('for', 68025),
 ('The', 60693),
 ('with', 57359),
 ('by', 54623),
 ('not', 54283),
 ('his', 53612),
 ('was', 52093),
 ('be', 49716),
 ('on', 49153),
 ('this', 45342),
 ('it', 45089),
 ('from', 44563),
 ('he', 43151),
 ('are', 37823),
 ('which', 35341),
 ('or', 34513),
 ('have', 30836),
 ('an', 30718),
 ('their', 29473),
 ('but', 29468),
 ('who', 29056),
 ('at', 28020),
 ('they', 25664),
 ('I', 23505),
 ('were', 22907),
 ('one', 22361),
 ('In', 22322),
 ('we', 22190),
 ('.', 21743),
 ('all', 21097),
 ('has', 20842),
 ('had', 20628),
 ('also', 20608),
 ('God', 19791),
 ('its', 16607),
 ('Christian', 15804),
 ('will', 15758),
 ('been', 15531),
 ('more', 15288),
 ('Paul', 15171),
 ('would', 15040),
 ('what', 14032),
 ('other', 13642),
 ('only', 13577),
 ('This', 13570),
 ('about', 13395),
 ('these', 13373),
 ('so', 13096),
 ('our', 129

In [27]:
# How many of the 100 most frequent unigrams are not (case-insensitively) stopwords?
sum(1 for token, count in data_unigrams_sorted[:100] if token.lower() not in stop_words)

10

In [28]:
# Number of distinct unigrams occurring more than 10 times in the sample.
sum(1 for token, count in data_unigrams_sorted if count > 10)

50927

In [29]:
%%time
# Parse every unigram seen more than 10 times with spaCy, keeping its count alongside the doc.
data_unigrams_sorted_nlp = [
    (token, {"doc": nlp(token), "count": count})
    for token, count in data_unigrams_sorted
    if count > 10
]

CPU times: user 2min 6s, sys: 2.1 s, total: 2min 8s
Wall time: 2min 8s


In [30]:
data_unigrams_sorted_nlp[:5]

[('the', {'doc': the, 'count': 646933}),
 ('of', {'doc': of, 'count': 439108}),
 ('and', {'doc': and, 'count': 285408}),
 ('to', {'doc': to, 'count': 241632}),
 ('in', {'doc': in, 'count': 222439})]

In [31]:
data_unigrams_nlp_dict = dict(data_unigrams_sorted_nlp)

In [32]:
with open("../data/large_files/data_unigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_unigrams_nlp_dict, f)

# Trigrams to nlp docs

In [33]:
# Rank the merged trigram counts from most to least frequent,
# then count how many trigrams occur more than 10 times.
data_trigrams_sorted = data_trigrams.most_common()
sum(1 for trigram, count in data_trigrams_sorted if count > 10)

58612

In [34]:
%%time
# Parse every trigram seen more than 10 times with spaCy, keeping its count alongside the doc.
data_trigrams_sorted_nlp = [
    (trigram, {"doc": nlp(trigram), "count": count})
    for trigram, count in data_trigrams_sorted
    if count > 10
]

CPU times: user 2min 30s, sys: 1.33 s, total: 2min 31s
Wall time: 2min 32s


In [35]:
data_trigrams_nlp_dict = dict(data_trigrams_sorted_nlp)

In [36]:
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams_nlp_dict, f)

# Cleaning trigrams for individual documents

In [37]:
# Collect the first 10 (document id, trigram-count dict) pairs for inspection.
trigramCount_tups = []
# NOTE: the loop variables `k` and `v` stay defined after this cell runs; a later
# scratch cell iterates over `v.items()`, so it depends on this loop having run.
for k, v in list(trigramCount_dict.items())[:10]:
    trigramCount_tups.append((k, v))

In [38]:
trigramCount_tups

[('ark://27927/phx66812gq6',
  {'H. F. MATHEWS,': 1,
   'of Jesus. For': 1,
   'through life’s vicissitudes': 1,
   'their preparations the': 1,
   'age of twelve': 1,
   'in death, man': 1,
   'ambition or uncleanness.': 1,
   'then, your lord': 1,
   'HEPBURN, B.D., CRIEFF': 1,
   'knotty points of': 1,
   'not so easy': 1,
   'His acts of': 1,
   'the story is': 2,
   'Him was composed': 1,
   'thing to do.': 1,
   'Dr. Hunter points': 1,
   'the Somme recently': 1,
   'consequences. This obedience': 1,
   'down into human': 1,
   'fields and in': 1,
   'allowed to take': 1,
   'and speaker, Sir': 1,
   'heard of the': 1,
   'ROBERT BROWN, B.A.,': 1,
   'supernatural Kingdom will': 1,
   'sentry at his': 1,
   'shared this truth.': 1,
   'This is God’s': 1,
   'twenty and went': 1,
   'nor take away.': 1,
   'frightening world. How': 1,
   'world in which': 1,
   'me clean.’ And,': 1,
   'Jesus, react positively': 1,
   'a convenient moment,': 1,
   'is generous, the': 1,
   'for av

In [None]:

# Part-of-speech tags (compared against each token's `pos_`) whose lemmata are kept.
postags = ["PROPN", "NOUN", "VERB", "ADJ"]

def doc_to_lemmata(doc, postags):
    """Return the alphabetically sorted lemmata of the selected tokens as one string.

    Parameters
    ----------
    doc : iterable
        Tokens exposing ``lemma_`` and ``pos_`` attributes (e.g. a spaCy doc).
    postags : container of str
        Part-of-speech tags to keep; tokens with any other ``pos_`` are dropped.

    Returns
    -------
    str
        The kept lemmata, sorted and joined by single spaces ("" if none are kept).
    """
    kept_lemmata = []
    for token in doc:
        if token.pos_ in postags:
            kept_lemmata.append(token.lemma_)
    kept_lemmata.sort()
    return " ".join(kept_lemmata)


def trigram_to_lemmata(trigram):
    """Return the sorted lemmata of a trigram's selected tokens as one string.

    The pre-parsed doc stored in ``data_trigrams_nlp_dict`` is used when it is
    available; otherwise the trigram is parsed on the fly with ``nlp``. This is
    best effort: if both attempts fail, an empty string is returned.

    Parameters
    ----------
    trigram : str
        The trigram text.

    Returns
    -------
    str
        Output of ``doc_to_lemmata`` for the trigram, or "" on failure.
    """
    # `except Exception` (not a bare `except:`) keeps the best-effort fallback
    # while letting KeyboardInterrupt / SystemExit propagate, so a long loop
    # over many trigrams can still be interrupted from the kernel.
    try:
        lemmata_sorted_str = doc_to_lemmata(data_trigrams_nlp_dict[trigram]["doc"], postags)
    except Exception:
        # Cached doc missing or unusable: parse the trigram directly.
        try:
            lemmata_sorted_str = doc_to_lemmata(nlp(trigram), postags)
        except Exception:
            lemmata_sorted_str = ""
    return lemmata_sorted_str


In [None]:

# Scratch checks of the lemmatisation helpers on a sample trigram.
doc = nlp("our Lord Jesus")
doc_to_lemmata(doc, postags)
trigram_to_lemmata("our Lord Jesus")
# `v` is the trigram-count dict left over from the loop that builds
# `trigramCount_tups`. The closing bracket was missing here, which made the
# whole cell a SyntaxError.
[data_trigrams_nlp_dict[trigram]["doc"] for trigram in v]
# NOTE(review): ad-hoc arithmetic left over from exploration; its purpose is not evident.
237 * 24.5

In [None]:
len(trigramCount_dict)

In [11]:
# Reload the parsed-trigram dictionary written earlier in this notebook.
# The context manager closes the file handle even when loading fails.
# An EOFError ("Ran out of input") here means the pickle file is empty or truncated.
# NOTE: pickle.load can execute arbitrary code - only load pickles you trust.
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "rb") as f:
    data_trigrams_nlp_dict = pickle.load(f)

EOFError: Ran out of input

# Explorations

In [None]:
def get_tops(col, matchstring, n=10):
    """Return the ``n`` most frequent non-stopword unigrams of the selected documents.

    Parameters
    ----------
    col : str
        ``jstor_df`` column used to select documents (passed to ``ids_from_colvals``).
    matchstring : str
        Condition applied to that column, e.g. ``"==1951"``.
    n : int, default 10
        Number of (unigram, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(unigram, count)`` pairs sorted by descending count. Unigrams whose
        lowercase form is a stopword or one of ``.``, ``-``, ``"`` are excluded.
    """
    ids = ids_from_colvals(col, matchstring)
    counts = merge_data_from_ids(ids, unigramCount_dict)
    # Build the exclusion set once; previously a new list was concatenated and
    # scanned linearly for every single unigram.
    excluded = set(stop_words) | {".", "-", "\""}
    kept = [(token, count) for token, count in counts.items() if token.lower() not in excluded]
    return sorted(kept, key=lambda kv: kv[1], reverse=True)[:n]

In [None]:
get_tops("publicationYear", "==1951", 20)

In [None]:
get_tops("publicationYear", ".between(1950,1959)")

In [None]:
# Preview the "start,end" year strings generated for each decade of the 1900s.
for decade_n in range(10):
    print("19{0}0,19{0}9".format(decade_n))

In [None]:
# Print the 20 most frequent non-stopword unigrams for each decade of the 1900s.
for decade_n in range(10):
    decade = "19{0}0,19{0}9".format(decade_n)
    print(decade, get_tops("publicationYear", ".between({0})".format(decade), 20))


In [None]:
# Load the per-document bigram counts (the same file is also loaded in an earlier cell).
# The context manager closes the file handle as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load pickles you trust.
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [None]:
def get_top_bigrams(col, matchstring, n=10):
    """Return the ``n`` most frequent bigrams of the selected documents.

    Parameters
    ----------
    col : str
        ``jstor_df`` column used to select documents (passed to ``ids_from_colvals``).
    matchstring : str
        Condition applied to that column, e.g. ``".between(1950,1959)"``.
    n : int, default 10
        Number of (bigram, count) pairs to return.

    Returns
    -------
    list of tuple
        ``(bigram, count)`` pairs sorted by descending count; no stopword
        filtering is applied.
    """
    matching_ids = ids_from_colvals(col, matchstring)
    bigram_counts = merge_data_from_ids(matching_ids, bigramCount_dict)
    ranked = list(bigram_counts.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
# Print the 10 most frequent bigrams for each decade of the 1900s.
for decade_n in range(10):
    decade = "19{0}0,19{0}9".format(decade_n)
    print(decade, get_top_bigrams("publicationYear", ".between({0})".format(decade), 10))