In [1]:
import pickle
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
import spacy
from gensim.models.ldamulticore import LdaMulticore
from gensim.test.utils import datapath
import pyLDAvis.gensim_models as gensimvis

  from scipy.sparse.base import spmatrix
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1
  from scipy.optimize.linesearch import line_search_wolfe2, line_search_wolfe1


In [146]:
# to communicate with Google Sheets...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Open a sciencedata.dk session; the service-account key file is read from there.
s = sddk.cloudSession("sciencedata.dk")

# Establish the connection with Google Sheets.
# The key could instead be loaded from local storage:
#   json.load(open("../../ServiceAccountsKey.json", "r"))
file_data = s.read_file(
    "https://sciencedata.dk/files/ServiceAccountsKey.json",
    "dict",
)
credentials = service_account.Credentials.from_service_account_info(file_data)
scoped_credentials = credentials.with_scopes(
    [
        'https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive',
    ]
)
gc = gspread.Client(auth=scoped_credentials)

# Spreadsheet that receives the result tables written at the end of the notebook.
mops_data = gc.open_by_url(
    "https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing"
)

endpoint variable has been configured to: https://sciencedata.dk/files/


In [2]:
import multiprocessing
# Show how many CPUs this machine reports.
multiprocessing.cpu_count()

8

In [3]:
# Load the JSTOR record table (one row per article/chapter) from a local Feather file.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [4]:
# Number of rows in the record table.
len(jstor_df)

14103

In [5]:
# Preview the first five rows.
jstor_df.head(5)

Unnamed: 0,creator,datePublished,docType,doi,id,identifier,isPartOf,issueNumber,keyphrase,language,...,sequence,tdmCategory,title,url,volumeNumber,wordCount,docSubType,sourceCategory,subTitle,hasPartTitle
0,[],1959-10-01,article,10.1177/001452465907100107,ark://27927/phx66812gq6,"[{'name': 'doi', 'value': '10.1177/00145246590...",Expository Times,1.0,"[omnipotent reigneth, god omnipotent, silence,...",[eng],...,7.0,"[Religion - Theology, Religion - Spiritual bel...",In the Study,http://doi.org/10.1177/001452465907100107,71.0,7382,,,,
1,[Victor Paul Furnish],2009-01-01,chapter,10.1017/CBO9780511621321.003,ark://27927/pbd6fpf5fh,"[{'name': 'isbn', 'value': '9780511621321'}, {...",Jesus According to PaulJesus According to Paul,,"[saying, books online, jesus tradition, pauls ...",[eng],...,5.0,"[History - Historical methodology, Religion - ...",3 Sayings of Jesus in Paul's Letters,https://doi.org/10.1017/CBO9780511621321.003,,8577,,,,
2,[Leander E. Keck],2015-01-01,chapter,,ark://27927/phw1kd8s300,[],Christ&#39;s First Theologian,,"[pharisaism, sandmel genius, judaism, rabbinic...",[eng],...,8.0,"[History - Historical methodology, Religion - ...",3. The Quest for Pauls Pharisaism,,,5354,,,,
3,[LeAnn Snow Flesher],2009-02-01,article,10.1177/003463730910600105,ark://27927/phx64fptrwj,"[{'name': 'doi', 'value': '10.1177/00346373091...",Review & Expositor: An International Baptist J...,1.0,"[scofield, premillennial, premillennial dispen...",[eng],...,5.0,"[Religion - Theology, Religion - Spiritual bel...",The Historical Development of Premillennial Di...,http://doi.org/10.1177/003463730910600105,106.0,3614,,,,
4,[A. Daunton-Fear],1995-07-01,article,10.1177/0040571X9509800404,ark://27927/phx64k1x5c2,"[{'name': 'doi', 'value': '10.1177/0040571X950...",Theology,784.0,"[baptism, holy spirit, communion, infant bapti...",[eng],...,4.0,[Religion - Spiritual belief systems],Resisting the Tide Christian Initiation and Co...,http://doi.org/10.1177/0040571X9509800404,98.0,4323,,,,


In [6]:
# in english only?
def _is_english_only(langs):
    """True when "eng" is present and is the single entry of the language list."""
    return ("eng" in langs) & (len(langs) == 1)


jstor_df["language"].apply(_is_english_only).sum()

14096

In [7]:
# Mapping from a record's full id to the short id used in the per-article
# pickle file names (see FriendlyCorpus).
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open("../data/article_ids_dict.pickle", "rb") as article_ids_file:
    article_ids_dict = pickle.load(article_ids_file)

In [8]:
# Pre-built dictionary object; it is passed later as the id -> word mapping.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle, which the bare open() call leaked.
with open("../data/dictionary_main.pickle", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)
len(dictionary)

41385

In [31]:
class FriendlyCorpus:
    """Stream documents from per-article pickle files without holding them in memory.

    Every pass over the corpus re-opens the files under
    ``../data/large_files/article_docs/`` one article at a time and yields the
    documents stored in each file.

    Parameters
    ----------
    ids : iterable, optional
        Article ids (keys of ``article_ids_dict``) to stream. When omitted, the
        notebook-level ``ids`` variable is looked up each time iteration
        starts, so ``FriendlyCorpus()`` behaves as it always did.

    Notes
    -----
    ``len(corpus)`` is known after one complete pass. If it is requested
    earlier, the documents are counted with an extra pass instead of raising
    ``AttributeError``.
    """

    def __init__(self, ids=None):
        # Materialise so that a one-shot iterable can be streamed repeatedly.
        self._ids = None if ids is None else list(ids)
        # Document count; None until a complete pass has finished.
        self.length = None

    def __iter__(self):
        # Fall back to the notebook-level `ids` when none were given explicitly.
        article_ids = ids if self._ids is None else self._ids
        docsn = 0
        for article_id in article_ids:
            short_id_str = str(article_ids_dict[article_id])
            id_filepath = "../data/large_files/article_docs/{}.pickle".format(short_id_str)
            # Close each file as soon as its documents are loaded.
            # NOTE: pickle.load can execute arbitrary code; only read project files.
            with open(id_filepath, "rb") as doc_file:
                docs = pickle.load(doc_file)
            for doc in docs:
                yield doc
                docsn += 1
        # Only reached when the pass ran to completion.
        self.length = docsn

    def __len__(self):
        if self.length is None:
            # No complete pass yet: count the documents with one streaming pass.
            self.length = sum(1 for _ in self)
        return self.length

In [32]:
# Select every record of the table.
# NOTE: FriendlyCorpus() picks up this notebook-level `ids` when it is iterated,
# so `ids` must be assigned before the corpus is consumed.
ids = jstor_df["id"].tolist()
corpus = FriendlyCorpus()  # doesn't load the corpus into memory!
print(corpus)

<__main__.FriendlyCorpus object at 0x297de75e0>


In [36]:
# Build a gensim Dictionary from the streamed corpus, passing the pre-loaded
# `dictionary` as the id -> word mapping.
dct_corpus = Dictionary.from_corpus(corpus, id2word=dictionary)

In [37]:
# Integer token id of "New_Testament" in the corpus dictionary.
nt_id = dct_corpus.token2id["New_Testament"]
nt_id

9235

In [38]:
# `dfs` entry for that token (gensim documents `dfs` as per-token document frequencies).
dct_corpus.dfs[nt_id]

51371

In [39]:
# Values of the dictionary as a plain list (presumably the token strings — TODO confirm).
vocabulary = list(dct_corpus.values())

In [40]:
# Number of entries in the corpus dictionary.
len(dct_corpus)

41385

In [73]:
# Integer token id of "Paul".
dct_corpus.token2id["Paul"]

9932

In [74]:
# Bag-of-words encoding of a small three-token example document.
dct_corpus.doc2bow(["Paul", "New_Testament", "Old_Testament"])

[(9235, 1), (9603, 1), (9932, 1)]

In [126]:
# Number of topics requested from the LDA model.
ntopics = 5

In [82]:
%%time
lda = LdaMulticore(corpus, num_topics=ntopics, id2word=dct_corpus)



CPU times: user 2min 6s, sys: 33.2 s, total: 2min 39s
Wall time: 4min 4s


In [89]:
# Persist the trained model to disk.
lda.save("../data/large_files/lda_global_v1")

In [83]:
# Highest-weighted words of topic 0, with their weights.
lda.show_topic(0)

[('early', 0.014643986),
 ('work', 0.013115274),
 ('biblical', 0.012963274),
 ('text', 0.012277658),
 ('divine', 0.011409644),
 ('spiritual', 0.009946071),
 ('faith', 0.009898072),
 ('social', 0.009851946),
 ('Christians', 0.009469147),
 ('order', 0.008436895)]

In [147]:
ntopics = 5
# One comma-separated string per topic, built from its ten highest-weighted words.
topic_words_joined = [
    ", ".join(pair[0] for pair in lda.show_topic(topic_id, topn=10))
    for topic_id in range(ntopics)
]
topic_words_joined

['early, work, biblical, text, divine, spiritual, faith, social, Christians, order',
 'Christian, Jesus, church, Christ, new, Paul, way, theology, tradition, theological',
 'God, great, history, historical, Paul, religious, love, Israel, power, letter',
 'Jewish, human, world, good, use, second, century, death, study, Paul',
 'life, John, man, like, time, people, Roman, Paul, law, point']

In [95]:
# Terms whose per-topic probabilities we want to inspect.
probe_words = [
    "Christ",
    "Jesus",
    "Paul",
    "New_Testament",
    "Old_Testament",
    "Romans",
    "Corinthians",
    "Galatians",
    "law",
    "Jewish",
    "Jews",
    "Judea",
]
for word in probe_words:
    token_id = dct_corpus.token2id[word]
    print(word, lda.get_term_topics(token_id, minimum_probability=0))

Christ [(1, 0.025210822)]
Jesus [(1, 0.031531658)]
Paul [(0, 0.006974078), (1, 0.011069962), (2, 0.0088760415), (3, 0.007267367), (4, 0.007393098)]
New_Testament [(0, 0.00074381643), (1, 0.0008302843), (2, 0.00048067883), (3, 0.0008008713), (4, 0.00057056826)]
Old_Testament [(1, 0.0014251649)]
Romans [(1, 0.0023071175), (3, 0.0066799927)]
Corinthians [(0, 0.0007317764), (1, 0.0011300398), (2, 0.00075825327), (3, 0.0008429485), (4, 0.0007668329)]
Galatians [(1, 0.0041462937)]
law [(4, 0.007385621)]
Jewish [(3, 0.017416224)]
Jews [(0, 0.0013330865), (1, 0.003820091), (4, 0.0013761432)]
Judea [(2, 0.00031325355)]


In [102]:
# Number of records per `isPartOf` value (journal or book title), largest first.
jstor_df.groupby("isPartOf").size().sort_values(ascending=False)

isPartOf
Religious Studies Review                                544
Journal of Biblical Literature                          442
The Catholic Biblical Quarterly                         332
Expository Times                                        330
Review & Expositor: An International Baptist Journal    265
                                                       ... 
Kenyan, Christian, Queer                                  1
Karl BarthKarl Barth                                      1
Jung on Christianity                                      1
Julian of Norwich's "Showings"                            1
Milton&#39;s Scriptural Theology                          1
Length: 3047, dtype: int64

In [104]:
# Journals whose mean topic probabilities are compared below.
journals = [
    "The Catholic Biblical Quarterly",
    "Interpretation: A Journal of Bible and Theology",
    "Neotestamentica",
    "The Harvard Theological Review",
    "New Testament Studies",
    "Novum Testamentum",
    "Journal for the Study of the New Testament",
    "The Journal of Theological Studies",
    "The Heythrop Journal",
    "Church History",
]
len(journals)

10

In [110]:
# Restrict the notebook-level `ids` to one journal, then materialise its documents.
# NOTE: FriendlyCorpus() reads `ids` when iterated, so this reassignment is what
# selects the subset.
ids = jstor_df[jstor_df["isPartOf"]=="Journal of Biblical Literature"]["id"].tolist()
corpus_friendly = FriendlyCorpus()
bows = [doc for doc in corpus_friendly]

In [116]:
# Word/weight summary string for each topic of the model.
lda.show_topics()

[(0,
  '0.015*"early" + 0.013*"work" + 0.013*"biblical" + 0.012*"text" + 0.011*"divine" + 0.010*"spiritual" + 0.010*"faith" + 0.010*"social" + 0.009*"Christians" + 0.008*"order"'),
 (1,
  '0.033*"Christian" + 0.032*"Jesus" + 0.027*"church" + 0.025*"Christ" + 0.018*"new" + 0.011*"Paul" + 0.011*"way" + 0.009*"theology" + 0.008*"tradition" + 0.008*"theological"'),
 (2,
  '0.067*"God" + 0.011*"great" + 0.010*"history" + 0.009*"historical" + 0.009*"Paul" + 0.009*"religious" + 0.009*"love" + 0.008*"Israel" + 0.008*"power" + 0.007*"letter"'),
 (3,
  '0.017*"Jewish" + 0.015*"human" + 0.011*"world" + 0.011*"good" + 0.010*"use" + 0.009*"second" + 0.009*"century" + 0.008*"death" + 0.008*"study" + 0.007*"Paul"'),
 (4,
  '0.019*"life" + 0.019*"John" + 0.017*"man" + 0.011*"like" + 0.011*"time" + 0.011*"people" + 0.009*"Roman" + 0.007*"Paul" + 0.007*"law" + 0.007*"point"')]

In [115]:
# Top ten words per topic together with a per-topic score computed on `bows`
# (gensim documents this score as topic coherence).
lda.top_topics(bows,dictionary=dct_corpus, topn=10)

[([(0.033020873, 'Christian'),
   (0.03152251, 'Jesus'),
   (0.026798924, 'church'),
   (0.025203507, 'Christ'),
   (0.018387958, 'new'),
   (0.011066769, 'Paul'),
   (0.010710236, 'way'),
   (0.008522751, 'theology'),
   (0.008237446, 'tradition'),
   (0.008100662, 'theological')],
  -9.149542267193706),
 ([(0.019393314, 'life'),
   (0.018929198, 'John'),
   (0.01744557, 'man'),
   (0.011083867, 'like'),
   (0.010942507, 'time'),
   (0.01059002, 'people'),
   (0.008658209, 'Roman'),
   (0.007391173, 'Paul'),
   (0.0073836986, 'law'),
   (0.007327001, 'point')],
  -9.361288051605175),
 ([(0.014643988, 'early'),
   (0.013115276, 'work'),
   (0.012963275, 'biblical'),
   (0.01227766, 'text'),
   (0.011409645, 'divine'),
   (0.009946072, 'spiritual'),
   (0.009898073, 'faith'),
   (0.0098519465, 'social'),
   (0.009469148, 'Christians'),
   (0.008436896, 'order')],
  -11.2419232116576),
 ([(0.01741168, 'Jewish'),
   (0.015278202, 'human'),
   (0.010943653, 'world'),
   (0.010819748, 'good

In [121]:
# NOTE(review): `bows_probs` is not assigned in any visible cell, so this fails
# on a fresh kernel. Presumably it was built as
# `[lda.get_document_topics(bow) for bow in bows]` — TODO confirm and restore that cell.
len(bows_probs)

535018

In [124]:
# For every document keep only the second element of each entry
# (the probability of a (topic, probability) pair).
bows_probs_list = [
    [pair[1] for pair in doc_topics]
    for doc_topics in bows_probs
]

In [129]:
# Mean probability of each topic over all documents in `bows_probs`.
# Each entry of `bows_probs` is a sparse list of (topic_id, probability) pairs:
# topics below the model's probability threshold are left out. Looking the
# probability up by topic id (missing -> 0.0) avoids reading the wrong topic,
# or hitting an IndexError, which positional indexing did for such documents.
doc_topic_maps = [dict(doc_topics) for doc_topics in bows_probs]
means = [
    np.mean([topic_map.get(topic_id, 0.0) for topic_map in doc_topic_maps])
    for topic_id in range(ntopics)
]

In [132]:
# Mean probability per topic.
means

[0.19203253, 0.21457747, 0.19176441, 0.20533733, 0.1962883]

In [140]:
def get_subcorpus_topic_probs(subcorpus_bows, model=None, num_topics=None):
    """Return the mean probability of each topic over a set of documents.

    Parameters
    ----------
    subcorpus_bows : iterable
        Documents in the form accepted by ``model.get_document_topics``.
        May be a generator; it is consumed once.
    model : optional
        Topic model to query. Defaults to the notebook-level ``lda``.
    num_topics : int, optional
        Number of topics to report. Defaults to the notebook-level ``ntopics``.

    Returns
    -------
    list of float
        ``num_topics`` means, indexed by topic id. Every entry is ``nan`` when
        ``subcorpus_bows`` is empty.
    """
    if model is None:
        model = lda
    if num_topics is None:
        num_topics = ntopics

    totals = np.zeros(num_topics, dtype=float)
    n_docs = 0
    for bow in subcorpus_bows:
        # The model returns a sparse list of (topic_id, probability) pairs, so
        # accumulate by topic id rather than by list position. Asking for
        # minimum_probability=0 keeps low-probability topics from being dropped;
        # any topic still missing simply contributes 0.
        for topic_id, prob in model.get_document_topics(bow, minimum_probability=0):
            if topic_id < num_topics:
                totals[topic_id] += prob
        n_docs += 1

    if n_docs == 0:
        # Nothing to average; mirror the nan result of averaging an empty list.
        return [float("nan")] * num_topics
    return (totals / n_docs).tolist()

def topic_probs_from_ids_subset(ids):
    """Return mean topic probabilities over the documents of the given articles.

    Parameters
    ----------
    ids : iterable
        Article ids (keys of ``article_ids_dict``) whose documents are averaged.

    Returns
    -------
    The per-topic means produced by ``get_subcorpus_topic_probs``.

    Notes
    -----
    The documents are read here directly from the per-article pickle files.
    Going through ``FriendlyCorpus()`` silently ignored the ``ids`` argument,
    because that class reads the notebook-level ``ids`` variable instead.
    """
    def _iter_docs():
        # Stream the documents of each requested article, one file at a time.
        for article_id in ids:
            short_id_str = str(article_ids_dict[article_id])
            id_filepath = "../data/large_files/article_docs/{}.pickle".format(short_id_str)
            # NOTE: pickle.load can execute arbitrary code; only read project files.
            with open(id_filepath, "rb") as doc_file:
                docs = pickle.load(doc_file)
            yield from docs

    return get_subcorpus_topic_probs(_iter_docs())

In [143]:
# One row per journal: the journal name followed by its mean topic probabilities.
journals_means = []
for journal in journals:
    # `ids` is assigned at notebook level as well as being passed to the helper.
    ids = jstor_df.loc[jstor_df["isPartOf"] == journal, "id"].tolist()
    means = topic_probs_from_ids_subset(ids)
    journal_data = [journal, *means]
    journals_means.append(journal_data)

In [148]:
# A single row (list of lists): a label cell followed by the topic word strings.
[["topics (top 10 words)"] + topic_words_joined]

['early, work, biblical, text, divine, spiritual, faith, social, Christians, order',
 'Christian, Jesus, church, Christ, new, Paul, way, theology, tradition, theological',
 'God, great, history, historical, Paul, religious, love, Israel, power, letter',
 'Jewish, human, world, good, use, second, century, death, study, Paul',
 'life, John, man, like, time, people, Roman, Paul, law, point']

In [161]:
# Tabulate the per-journal means; each topic column is labelled with its top words.
column_labels = ["journal", *topic_words_joined]
journals_means_df = pd.DataFrame(journals_means, columns=column_labels).round(2)
journals_means_df

Unnamed: 0,journal,"early, work, biblical, text, divine, spiritual, faith, social, Christians, order","Christian, Jesus, church, Christ, new, Paul, way, theology, tradition, theological","God, great, history, historical, Paul, religious, love, Israel, power, letter","Jewish, human, world, good, use, second, century, death, study, Paul","life, John, man, like, time, people, Roman, Paul, law, point"
0,The Catholic Biblical Quarterly,0.19,0.22,0.19,0.2,0.19
1,Interpretation: A Journal of Bible and Theology,0.19,0.22,0.21,0.2,0.19
2,Neotestamentica,0.19,0.22,0.2,0.21,0.18
3,The Harvard Theological Review,0.2,0.21,0.19,0.2,0.2
4,New Testament Studies,0.19,0.22,0.19,0.2,0.19
5,Novum Testamentum,0.19,0.22,0.19,0.2,0.2
6,Journal for the Study of the New Testament,0.19,0.22,0.2,0.2,0.19
7,The Journal of Theological Studies,0.19,0.21,0.19,0.2,0.2
8,The Heythrop Journal,0.2,0.22,0.2,0.2,0.18
9,Church History,0.2,0.21,0.19,0.2,0.2


In [163]:
# Display the per-journal table again.
journals_means_df

Unnamed: 0,journal,"early, work, biblical, text, divine, spiritual, faith, social, Christians, order","Christian, Jesus, church, Christ, new, Paul, way, theology, tradition, theological","God, great, history, historical, Paul, religious, love, Israel, power, letter","Jewish, human, world, good, use, second, century, death, study, Paul","life, John, man, like, time, people, Roman, Paul, law, point"
0,The Catholic Biblical Quarterly,0.19,0.22,0.19,0.2,0.19
1,Interpretation: A Journal of Bible and Theology,0.19,0.22,0.21,0.2,0.19
2,Neotestamentica,0.19,0.22,0.2,0.21,0.18
3,The Harvard Theological Review,0.2,0.21,0.19,0.2,0.2
4,New Testament Studies,0.19,0.22,0.19,0.2,0.19
5,Novum Testamentum,0.19,0.22,0.19,0.2,0.2
6,Journal for the Study of the New Testament,0.19,0.22,0.2,0.2,0.19
7,The Journal of Theological Studies,0.19,0.21,0.19,0.2,0.2
8,The Heythrop Journal,0.2,0.22,0.2,0.2,0.18
9,Church History,0.2,0.21,0.19,0.2,0.2


In [165]:
# Add a new worksheet to the spreadsheet and write the rounded per-journal table into it.
topic_probs_sheet = mops_data.add_worksheet("journals_topic_probs_v2", 1, 1)
set_with_dataframe(topic_probs_sheet, journals_means_df.round(2))

In [156]:
# Add a new worksheet and write the topic word strings into it, one topic per row.
topic_words_sheet = mops_data.add_worksheet("topic_words", 1, 1)
topic_words_df = pd.DataFrame(topic_words_joined)
set_with_dataframe(topic_words_sheet, topic_words_df)