In [39]:
import pickle
import numpy as np
import pandas as pd
from gensim.matutils import corpus2csc
from gensim.matutils import corpus2dense
from gensim.corpora import Dictionary
from datetime import datetime

In [40]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Create the sddk cloud session object for sciencedata.dk.
s = sddk.cloudSession("sciencedata.dk")
# Establish the connection with Google Sheets.
# The service-account key is read from sciencedata.dk as a dict; alternatively it can be loaded
# from local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict")
credentials = service_account.Credentials.from_service_account_info(file_data)
# Build a gspread client whose credentials carry the spreadsheet-feed and Drive scopes.
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Spreadsheet handle; its "ppmi_progress" worksheet receives the progress log of the per-decade loop.
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [2]:
# Load the jstor_df metadata table (one row per document) from its feather file.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [3]:
# Number of rows in the metadata table.
len(jstor_df)

14103

In [4]:
# Preview the first five rows to inspect the available columns.
jstor_df.head(5)

Unnamed: 0,creator,datePublished,docType,doi,id,identifier,isPartOf,issueNumber,keyphrase,language,...,sequence,tdmCategory,title,url,volumeNumber,wordCount,docSubType,sourceCategory,subTitle,hasPartTitle
0,[],1959-10-01,article,10.1177/001452465907100107,ark://27927/phx66812gq6,"[{'name': 'doi', 'value': '10.1177/00145246590...",Expository Times,1.0,"[omnipotent reigneth, god omnipotent, silence,...",[eng],...,7.0,"[Religion - Theology, Religion - Spiritual bel...",In the Study,http://doi.org/10.1177/001452465907100107,71.0,7382,,,,
1,[Victor Paul Furnish],2009-01-01,chapter,10.1017/CBO9780511621321.003,ark://27927/pbd6fpf5fh,"[{'name': 'isbn', 'value': '9780511621321'}, {...",Jesus According to PaulJesus According to Paul,,"[saying, books online, jesus tradition, pauls ...",[eng],...,5.0,"[History - Historical methodology, Religion - ...",3 Sayings of Jesus in Paul's Letters,https://doi.org/10.1017/CBO9780511621321.003,,8577,,,,
2,[Leander E. Keck],2015-01-01,chapter,,ark://27927/phw1kd8s300,[],Christ&#39;s First Theologian,,"[pharisaism, sandmel genius, judaism, rabbinic...",[eng],...,8.0,"[History - Historical methodology, Religion - ...",3. The Quest for Pauls Pharisaism,,,5354,,,,
3,[LeAnn Snow Flesher],2009-02-01,article,10.1177/003463730910600105,ark://27927/phx64fptrwj,"[{'name': 'doi', 'value': '10.1177/00346373091...",Review & Expositor: An International Baptist J...,1.0,"[scofield, premillennial, premillennial dispen...",[eng],...,5.0,"[Religion - Theology, Religion - Spiritual bel...",The Historical Development of Premillennial Di...,http://doi.org/10.1177/003463730910600105,106.0,3614,,,,
4,[A. Daunton-Fear],1995-07-01,article,10.1177/0040571X9509800404,ark://27927/phx64k1x5c2,"[{'name': 'doi', 'value': '10.1177/0040571X950...",Theology,784.0,"[baptism, holy spirit, communion, infant bapti...",[eng],...,4.0,[Religion - Spiritual belief systems],Resisting the Tide Christian Initiation and Co...,http://doi.org/10.1177/0040571X9509800404,98.0,4323,,,,


In [5]:
# How many documents are tagged with English as their one and only language?
is_english_only = jstor_df["language"].apply(lambda langs: ("eng" in langs) and (len(langs) == 1))
is_english_only.sum()

14096

In [6]:
# Mapping from a full article id to the short id used in the per-article pickle file names
# (see corpus_from_ids / MyCorpus).
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
# The context manager closes the file handle, which the bare open() call never did.
with open("../data/article_ids_dict.pickle", "rb") as ids_file:
    article_ids_dict = pickle.load(ids_file)

In [7]:
# Main dictionary; it is passed as `id2word` when the corpus-specific dictionaries are built.
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
# The context manager closes the file handle, which the bare open() call never did.
with open("../data/dictionary_main.pickle", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)
len(dictionary)

44592

In [8]:
# Decade boundaries as "first_year,last_year" strings, ready to be dropped into a `.between(...)` call.
twentieth_century = ["19{0}0,19{0}9".format(tens_digit) for tens_digit in range(10)]
decades = twentieth_century + ["2000,2009", "2010,2019"]
decades

['1900,1909',
 '1910,1919',
 '1920,1929',
 '1930,1939',
 '1940,1949',
 '1950,1959',
 '1960,1969',
 '1970,1979',
 '1980,1989',
 '1990,1999',
 '2000,2009',
 '2010,2019']

In [9]:
def ids_from_colvals(df_name, col, matchstring):
    """Return the "id" values of the rows of a dataframe that satisfy a column condition.

    The selection is assembled as source text, ``<df_name>[<df_name>["<col>"]<matchstring>]``,
    and evaluated, so `df_name` must be the *name* of a dataframe visible from this
    function and `matchstring` a code fragment that turns the column into a boolean
    mask (for instance ``".between(1920,1929)"``).

    NOTE(review): the expression goes through `eval`; never feed it strings that
    come from outside the notebook.

    Parameters:
        df_name (str): name of the dataframe variable.
        col (str): column the condition is applied to.
        matchstring (str): code appended to the column selection.

    Returns:
        list: values of the "id" column for the matching rows.
    """
    expression = '{0}[{0}["{1}"]{2}]'.format(df_name, col, matchstring)
    matching_rows = eval(expression)
    return matching_rows["id"].tolist()

In [10]:
# Try the selection on a single decade first: index 2 is "1920,1929".
decade = decades[2]
between_clause = ".between({})".format(decade)
ids = ids_from_colvals("jstor_df", "publicationYear", between_clause)
len(ids)

132

In [34]:
def corpus_from_ids(ids, ids_dict=None, docs_dir="../data/large_files/article_docs"):
    """Load the pickled documents of the given articles into one in-memory list.

    Each article is stored as ``<docs_dir>/<short id>.pickle`` and unpickles to a
    sequence of documents; the sequences are concatenated in the order of `ids`.

    NOTE: pickle.load can execute arbitrary code - only point this at files
    produced by this project.

    Parameters:
        ids (iterable): full article ids to load.
        ids_dict (mapping, optional): full article id -> short id. Defaults to the
            notebook-level `article_ids_dict`.
        docs_dir (str, optional): directory holding the per-article pickle files.

    Returns:
        list: all documents of all requested articles.

    Raises:
        KeyError: if an id is missing from the mapping.
        FileNotFoundError: if an article file does not exist.
    """
    if ids_dict is None:
        # Keep the original behaviour: fall back to the notebook-level mapping.
        ids_dict = article_ids_dict
    corpus = []
    for article_id in ids:
        short_id_str = str(ids_dict[article_id])
        id_filepath = "{}/{}.pickle".format(docs_dir, short_id_str)
        # Close every file right after reading it instead of leaking one handle per article.
        with open(id_filepath, "rb") as article_file:
            corpus.extend(pickle.load(article_file))
    return corpus

class MyCorpus:
    """Re-iterable corpus that streams documents from the per-article pickle files.

    Only one article file is held in memory at a time, and every call to
    ``iter()`` starts again from the first article, so the object can be scanned
    several times (e.g. once to build a dictionary and once to build a matrix).

    NOTE: pickle.load can execute arbitrary code - only point this at files
    produced by this project.

    Parameters:
        ids (iterable, optional): full article ids to stream. When omitted, the
            notebook-level `ids` variable is read each time iteration starts,
            which is how the class behaved before it accepted arguments.
        ids_dict (mapping, optional): full article id -> short id. Defaults to the
            notebook-level `article_ids_dict`.
        docs_dir (str, optional): directory holding the per-article pickle files.
    """

    def __init__(self, ids=None, ids_dict=None, docs_dir="../data/large_files/article_docs"):
        self.ids = ids
        self.ids_dict = ids_dict
        self.docs_dir = docs_dir

    def __iter__(self):
        # Resolve the fallbacks lazily so that MyCorpus() keeps following the
        # current value of the notebook-level variables.
        article_ids = ids if self.ids is None else self.ids
        mapping = article_ids_dict if self.ids_dict is None else self.ids_dict
        for article_id in article_ids:
            short_id_str = str(mapping[article_id])
            id_filepath = "{}/{}.pickle".format(self.docs_dir, short_id_str)
            # Read and close the file before yielding, so no handle stays open
            # while the consumer is working on the documents.
            with open(id_filepath, "rb") as article_file:
                article_docs = pickle.load(article_file)
            yield from article_docs

In [12]:
# Load every document of the selected articles into one in-memory list.
corpus = corpus_from_ids(ids)

In [13]:
# NOTE(review): only the last expression of a cell is displayed, so this len() result is never shown.
len(corpus)
# First ten documents; each one is a list of (token id, count) pairs.
corpus[:10]

[[(15054, 1), (25971, 1)],
 [(2802, 1), (13164, 1), (20706, 1)],
 [(231, 1), (41987, 1)],
 [(7884, 1), (10898, 1)],
 [(19525, 1), (29542, 1)],
 [(27686, 1), (36201, 1)],
 [(21394, 1), (28249, 1)],
 [(18414, 1), (19037, 1)],
 [(23985, 1), (30919, 1)],
 [(29860, 1), (33322, 1)]]

In [14]:
# Scan the corpus to build a Dictionary for it, passing the main dictionary as id2word.
dct_corpus = Dictionary.from_corpus(corpus, id2word=dictionary)

In [15]:
# Row/column labels for the PPMI dataframe built by get_ppmi_df.
# NOTE(review): get_ppmi_df labels matrix row i with vocabulary[i], so this list must be
# ordered by token id - confirm that dct_corpus.values() yields tokens in that order.
vocabulary = list(dct_corpus.values())

In [16]:
# Size of the corpus dictionary (to compare with len(dictionary) above).
len(dct_corpus)

44592

In [17]:
 # Count the dictionary entries whose document frequency in this corpus is above zero.
 sum(1 for doc_freq in dct_corpus.dfs.values() if doc_freq > 0)

17724

In [18]:
# Number of documents counted while the corpus was scanned.
dct_corpus.num_docs

249295

In [19]:
# Integer id assigned to the token "Christ".
dct_corpus.token2id["Christ"]

3052

In [20]:
# Integer id assigned to the token "Harnack".
dct_corpus.token2id["Harnack"]

6553

In [21]:
# Document frequency recorded for "Harnack" in this corpus.
dct_corpus.dfs[dct_corpus.token2id["Harnack"]]

75

In [22]:
# Sparse term-document matrix: one row per dictionary entry, one column per document.
term_doc_mat = corpus2csc(corpus, num_terms=len(dct_corpus))

In [23]:
# Check which sparse format corpus2csc returned.
type(term_doc_mat)

scipy.sparse._csc.csc_matrix

In [24]:
# (number of terms, number of documents)
term_doc_mat.shape

(44592, 249295)

In [25]:
# Term-term co-occurrence matrix: (terms x documents) times (documents x terms).
# The sparse matrix's own product is used here; np.dot is not sparse-aware and the scipy
# documentation warns against applying it to sparse matrices.
cooc = term_doc_mat.dot(term_doc_mat.T)

In [26]:
# Dense copy of the co-occurrence matrix.
# NOTE(review): cooc is (number of terms) x (number of terms), i.e. 44592 x 44592 for this
# dictionary - roughly 16 GB if the values are float64 - and no cell visible in this notebook
# reads cooc_dense afterwards; confirm it is actually needed.
cooc_dense = cooc.todense()

In [27]:
# Target term frequency in the co-occurrence matrix (i.e. the value on the diagonal).
# NOTE(review): the diagonal of A * A.T is the sum of squared counts, which equals the number
# of documents containing the term only when per-document counts are 0/1 - confirm.
target = "Christ"
cooc[dct_corpus.token2id[target],dct_corpus.token2id[target]]

1383.0

In [28]:
# Co-occurrence lookup: the co-occurrence matrix entry for a pair of terms.
target1, target2 = "Christ", "Jesus"
cooc[dct_corpus.token2id[target1],dct_corpus.token2id[target2]]

364.0

In [30]:
def normalize_ppmi3_matrix(pmi_matrix_df):
    """Rescale a PMI dataframe linearly and replace its missing values.

    Missing cells are first given a floor value that lies one full value range
    below the observed minimum (``min - |max - min|``). The frame is then mapped
    linearly so that the floor becomes 0 and the observed maximum becomes 1;
    the observed minimum therefore lands on 0.5.

    The input dataframe is left untouched (the previous implementation filled its
    missing values in place, silently changing the caller's object).

    Parameters:
        pmi_matrix_df (pandas.DataFrame): numeric frame, NaN marking missing scores.

    Returns:
        pandas.DataFrame: new frame with the same labels and no missing values.
        If all observed values are equal (or there are none) the scaling divides
        by zero and the result consists of NaN.
    """
    minval, maxval = pmi_matrix_df.min().min(), pmi_matrix_df.max().max()
    diff = abs(maxval - minval)
    # Floor assigned to missing cells: as far below the minimum as the maximum is above it.
    minval_doubled = minval - diff
    filled_df = pmi_matrix_df.fillna(minval_doubled)
    pmi_matrix_norm_df = (filled_df - minval_doubled) / (maxval - minval_doubled)
    return pmi_matrix_norm_df

def get_ppmi_df(cooc, vocabulary, normalize=True, exp=2):
    """Turn a term-term co-occurrence matrix into a labelled PMI dataframe.

    For every pair of terms (i, j) the score is
    ``log(cooc[i, j] ** exp / (cooc[i, i] * cooc[j, j]))``.
    Pairs for which that ratio is not positive (no co-occurrence, or a zero on
    the diagonal) have no score and are stored as NaN.

    Parameters:
        cooc (scipy sparse matrix): square co-occurrence matrix; the diagonal
            holds each term's own value.
        vocabulary (list): labels; vocabulary[i] names row i and column i.
        normalize (bool): when true, rescale the scores with
            normalize_ppmi3_matrix and set the diagonal to 1.
        exp (number): exponent applied to the co-occurrence value.

    Returns:
        pandas.DataFrame: float frame indexed and labelled by `vocabulary`.
    """
    n_terms = cooc.shape[1]
    # Row slicing is what the loop does, and CSR is the sparse format built for it.
    cooc_rows = cooc.tocsr()
    # The diagonal does not change between rows: read it once instead of
    # looking every element up again for every row.
    diagonal = np.asarray(cooc.diagonal(), dtype=float)
    # NaN marks "no score"; writing into one preallocated float array keeps the
    # result homogeneous instead of mixing floats with None placeholders.
    pmi_values = np.full((n_terms, n_terms), np.nan)
    for i in range(n_terms):
        ab = cooc_rows[i].toarray()[0].astype(float)
        ab_exp = np.power(ab, exp)
        axb = diagonal * diagonal[i]
        ratio = np.divide(ab_exp, axb, out=np.zeros_like(ab_exp), where=axb != 0)
        has_score = ratio > 0
        pmi_values[i, has_score] = np.log(ratio[has_score])
    pmi_matrix_df = pd.DataFrame(pmi_values, columns=vocabulary, index=vocabulary)
    if normalize:
        pmi_matrix_df = normalize_ppmi3_matrix(pmi_matrix_df)
        # DataFrame.to_numpy() may hand back a copy (or a read-only view), so filling
        # its diagonal is not guaranteed to reach the frame: edit an explicit copy
        # and rebuild the frame from it.
        normalized_values = pmi_matrix_df.to_numpy(dtype=float, copy=True)
        np.fill_diagonal(normalized_values, 1)
        pmi_matrix_df = pd.DataFrame(normalized_values, columns=vocabulary, index=vocabulary)
    return pmi_matrix_df

In [None]:
# Compute the un-normalised PMI dataframe for the co-occurrence matrix built above.
# NOTE(review): the timed cell further down repeats this exact call.
ppmi_matrix = get_ppmi_df(cooc, vocabulary, normalize=False)

In [31]:
%%time
# Test run: time one full get_ppmi_df call on the single-decade co-occurrence matrix.
ppmi_matrix = get_ppmi_df(cooc, vocabulary, normalize=False)

CPU times: user 16 s, sys: 99.1 ms, total: 16.1 s
Wall time: 16.4 s


In [None]:
# Store the test matrix. The context manager guarantees that the file is flushed and
# closed; the bare open() call left that to the garbage collector.
with open("../data/large_files/ppmi_matrix_test.pickle", "wb") as ppmi_file:
    pickle.dump(ppmi_matrix, ppmi_file)

In [None]:
# Compute and store one PMI dataframe per decade; after each decade a time-stamped entry
# is written to the "ppmi_progress" worksheet (one row per decade, starting at row 1).
for gspread_cell_row, decade in enumerate(decades, start=1):
    # MyCorpus() streams the articles listed in the notebook-level `ids`,
    # so `ids` has to be rebound before the corpus is iterated.
    ids = ids_from_colvals("jstor_df", "publicationYear", ".between({0})".format(decade))
    corpus = MyCorpus()
    dct_corpus = Dictionary.from_corpus(corpus, id2word=dictionary)
    # Labels in token-id order, derived from this decade's dictionary: row i of the
    # co-occurrence matrix belongs to token id i, and the loop no longer depends on a
    # `vocabulary` left behind by an earlier exploratory cell.
    vocabulary = [dct_corpus[token_id] for token_id in range(len(dct_corpus))]
    term_doc_mat = corpus2csc(corpus, num_terms=len(dct_corpus))
    # Sparse product (np.dot is not sparse-aware).
    cooc = term_doc_mat.dot(term_doc_mat.T)
    ppmi_matrix = get_ppmi_df(cooc, vocabulary, normalize=False)
    # Close (and thereby flush) every output file as soon as it has been written.
    with open("../data/large_files/ppmi_matrix_{}.pickle".format(decade.replace(",", "_")), "wb") as ppmi_file:
        pickle.dump(ppmi_matrix, ppmi_file)
    mops_data.worksheet("ppmi_progress").update_cell(gspread_cell_row, 1, decade + " ({0})".format(datetime.now().strftime("%H:%M:%S")))