In [1]:
import pandas as pd
import pickle
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Open a cloud session against sciencedata.dk through the sddk package.
s = sddk.cloudSession("sciencedata.dk")
# establish connection with Google Sheets...
# The service-account key is read from sciencedata.dk as a dict rather than kept in the notebook.
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
credentials = service_account.Credentials.from_service_account_info(file_data)
# Authorise a gspread client with the spreadsheet-feeds and Drive scopes.
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Spreadsheet handle, opened by URL. NOTE(review): not used in the cells visible here.
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# Load the document metadata table from a local Feather file.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [4]:
# Preview the first five rows (all columns are shown because display.max_columns is None).
jstor_df.head(5)

Unnamed: 0,creator,datePublished,docType,doi,id,identifier,isPartOf,issueNumber,keyphrase,language,outputFormat,pageCount,pageEnd,pageStart,pagination,provider,publicationYear,publisher,sequence,tdmCategory,title,url,volumeNumber,wordCount,docSubType,sourceCategory,subTitle,hasPartTitle
0,[],1959-10-01,article,10.1177/001452465907100107,ark://27927/phx66812gq6,"[{'name': 'doi', 'value': '10.1177/00145246590...",Expository Times,1.0,"[omnipotent reigneth, god omnipotent, silence,...",[eng],"[unigram, bigram, trigram]",8.0,31,24,pp. 24-31,portico,1959,SAGE Publications,7.0,"[Religion - Theology, Religion - Spiritual bel...",In the Study,http://doi.org/10.1177/001452465907100107,71.0,7382,,,,
1,[Victor Paul Furnish],2009-01-01,chapter,10.1017/CBO9780511621321.003,ark://27927/pbd6fpf5fh,"[{'name': 'isbn', 'value': '9780511621321'}, {...",Jesus According to PaulJesus According to Paul,,"[saying, books online, jesus tradition, pauls ...",[eng],"[unigram, bigram, trigram]",26.0,65,40,40-65,portico,2009,Cambridge University Press,5.0,"[History - Historical methodology, Religion - ...",3 Sayings of Jesus in Paul's Letters,https://doi.org/10.1017/CBO9780511621321.003,,8577,,,,
2,[Leander E. Keck],2015-01-01,chapter,,ark://27927/phw1kd8s300,[],Christ&#39;s First Theologian,,"[pharisaism, sandmel genius, judaism, rabbinic...",[eng],"[unigram, bigram, trigram]",14.0,42,29,29-42,portico,2015,Baylor University Press,8.0,"[History - Historical methodology, Religion - ...",3. The Quest for Pauls Pharisaism,,,5354,,,,
3,[LeAnn Snow Flesher],2009-02-01,article,10.1177/003463730910600105,ark://27927/phx64fptrwj,"[{'name': 'doi', 'value': '10.1177/00346373091...",Review & Expositor: An International Baptist J...,1.0,"[scofield, premillennial, premillennial dispen...",[eng],"[unigram, bigram, trigram]",11.0,45,35,pp. 35-45,portico,2009,SAGE Publications,5.0,"[Religion - Theology, Religion - Spiritual bel...",The Historical Development of Premillennial Di...,http://doi.org/10.1177/003463730910600105,106.0,3614,,,,
4,[A. Daunton-Fear],1995-07-01,article,10.1177/0040571X9509800404,ark://27927/phx64k1x5c2,"[{'name': 'doi', 'value': '10.1177/0040571X950...",Theology,784.0,"[baptism, holy spirit, communion, infant bapti...",[eng],"[unigram, bigram, trigram]",10.0,282,273,273-282,portico,1995,SAGE Publications,4.0,[Religion - Spiritual belief systems],Resisting the Tide Christian Initiation and Co...,http://doi.org/10.1177/0040571X9509800404,98.0,4323,,,,


In [5]:
# Unigram counts, later indexed by values of jstor_df["id"] (see merge_data_from_ids).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("../data/large_files/unigramCount_dict.pickle", "rb") as f:
    unigramCount_dict = pickle.load(f)

In [None]:
# Bigram counts, consumed by get_top_bigrams further down.
# NOTE(review): this cell shows no execution count; the same file is loaded again in a later cell.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [6]:
# Trigram counts, later indexed by values of jstor_df["id"] (see merge_data_from_ids).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("../data/large_files/trigramCount_dict.pickle", "rb") as f:
    trigramCount_dict = pickle.load(f)

In [9]:
def ids_from_colvals(col, matchstring):
    """Return the ``id`` values of the ``jstor_df`` rows selected by a condition on one column.

    Parameters
    ----------
    col : str
        Name of the ``jstor_df`` column the condition is applied to.
    matchstring : str
        Python source appended directly after the column expression, for
        example ``"==1951"`` or ``".between(1950,1959)"``. The resulting
        expression must evaluate to something usable as a row mask.

    Returns
    -------
    list
        Values of the ``id`` column for the selected rows.

    Notes
    -----
    Reads the notebook-level ``jstor_df`` frame.
    """
    # SECURITY: matchstring is executed with eval(); never pass untrusted text here.
    # The column is handed to eval() as an object instead of being spliced into
    # the source text, so column names containing quotes or backslashes can no
    # longer break (or alter) the evaluated expression.
    mask = eval("column_values" + matchstring, globals(), {"column_values": jstor_df[col]})
    return jstor_df[mask]["id"].tolist()

def merge_data_from_ids(ids, datadict):
    """Add up the count mappings stored in ``datadict`` for every id in ``ids``.

    Each id is looked up with ``datadict[id]`` (a missing id raises ``KeyError``)
    and the looked-up value is folded into a single ``Counter``. An id that
    occurs several times in ``ids`` is counted several times.

    Returns the merged ``Counter``; it is empty when ``ids`` is empty.
    """
    merged = Counter()
    for doc_id in ids:
        merged.update(datadict[doc_id])
    return merged


In [8]:
from userfunctions import *

# Cleaning the texts

In [19]:
# Draw 500 rows at random (fixed random_state, so the draw is repeatable) and keep their ids.
ids = jstor_df.sample(500, random_state=0)["id"].tolist()

In [20]:
# Merge the per-id unigram and trigram counts over the sampled ids (bigrams are not merged here).
data_unigrams = merge_data_from_ids(ids, unigramCount_dict)
data_trigrams = merge_data_from_ids(ids, trigramCount_dict)

In [27]:
# Number of distinct unigrams in the sample.
# NOTE(review): data_unigrams_sorted is only assigned in a later cell of this notebook, so on a
# fresh top-to-bottom run this raises NameError unless `userfunctions` happens to provide it.
len(data_unigrams_sorted)

383643

In [29]:
# How many unigrams occur more than 100 times in the sample.
sum(1 for _token, count in data_unigrams_sorted if count > 100)

4910

In [33]:
# English stop words: load the spaCy pipeline 'en_core_web_lg' and take its default stop-word set.
# `nlp` is also used in later cells to parse individual n-grams.
nlp = spacy.load('en_core_web_lg')
stop_words = nlp.Defaults.stop_words

# Unigrams to nlp docs

In [21]:
# Rank the merged unigram counts, most frequent first, as (token, count) pairs.
data_unigrams_sorted = sorted(
    data_unigrams.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
# Show the head of the ranking.
data_unigrams_sorted[:100]

[('the', 336493),
 ('of', 225089),
 ('and', 148677),
 ('to', 126990),
 ('in', 114966),
 ('a', 78570),
 ('that', 65269),
 ('is', 62699),
 ('as', 42797),
 ('for', 35255),
 ('The', 31223),
 ('with', 30096),
 ('not', 29514),
 ('by', 28451),
 ('his', 27668),
 ('was', 26984),
 ('be', 26606),
 ('on', 25270),
 ('it', 23724),
 ('this', 23606),
 ('from', 23214),
 ('he', 22876),
 ('are', 19724),
 ('which', 18696),
 ('or', 18425),
 ('have', 16821),
 ('their', 16045),
 ('an', 15817),
 ('but', 15780),
 ('who', 15725),
 ('they', 14424),
 ('at', 13556),
 ('I', 12627),
 ('.', 12309),
 ('were', 12115),
 ('we', 11799),
 ('one', 11640),
 ('all', 11409),
 ('In', 11151),
 ('has', 11101),
 ('had', 10916),
 ('God', 10602),
 ('also', 10216),
 ('will', 8742),
 ('its', 8556),
 ('more', 8359),
 ('Christian', 8215),
 ('been', 8139),
 ('would', 7957),
 ('Paul', 7668),
 ('what', 7518),
 ('so', 7216),
 ('only', 7186),
 ('other', 7140),
 ('these', 6983),
 ('This', 6917),
 ('no', 6784),
 ('our', 6780),
 ('about', 6772)

In [34]:
# How many of the 100 most frequent unigrams are not stop words (case-insensitive check).
sum(1 for token, _count in data_unigrams_sorted[:100] if token.lower() not in stop_words)

9

In [58]:
# How many unigrams occur more than 10 times (the cut-off used for spaCy parsing in the next cell).
sum(1 for _token, count in data_unigrams_sorted if count > 10)

30897

In [59]:
%%time
# Parse every unigram seen more than 10 times with spaCy and keep the parsed doc next to its count.
data_unigrams_sorted_nlp = [
    (token, {"doc" : nlp(token), "count" : count})
    for token, count in data_unigrams_sorted
    if count > 10
]

CPU times: user 1min 15s, sys: 1.64 s, total: 1min 17s
Wall time: 1min 18s


In [60]:
# Inspect the first five (token, {"doc", "count"}) entries.
data_unigrams_sorted_nlp[:5]

[('the', {'doc': the, 'count': 336493}),
 ('of', {'doc': of, 'count': 225089}),
 ('and', {'doc': and, 'count': 148677}),
 ('to', {'doc': to, 'count': 126990}),
 ('in', {'doc': in, 'count': 114966})]

In [71]:
# Turn the (token, info) pairs into a lookup keyed by token.
data_unigrams_nlp_dict = dict(data_unigrams_sorted_nlp)

In [62]:
# Persist the token -> {"doc", "count"} lookup so the parsing step does not have to be repeated.
with open("../data/large_files/data_unigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_unigrams_nlp_dict, f)

In [63]:
#data_unigrams_nlp_dict = pickle.load(open("../data/large_files/data_unigrams_nlp_dict.pickle", "rb"))

In [74]:
# Spot check: lemma(s) spaCy assigned to the tokens of the stored doc for "goes".
[t.lemma_ for t in data_unigrams_nlp_dict["goes"]["doc"]]

['go']

# Trigrams to nlp docs

In [51]:
# Rank the merged trigram counts, most frequent first, as (trigram, count) pairs.
data_trigrams_sorted = sorted(
    data_trigrams.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
# Trial pass over the 1000 most frequent trigrams, stored as (parsed doc, count) tuples.
# NOTE(review): a later cell reassigns this name with a different element layout
# ((trigram, {"doc", "count"})), so this result appears to be superseded.
data_trigrams_sorted_nlp = [
    (nlp(trigram), count) for trigram, count in data_trigrams_sorted[:1000]
]

In [77]:
# How many trigrams occur more than 10 times (the cut-off used for spaCy parsing in the next cell).
sum(1 for _trigram, count in data_trigrams_sorted if count > 10)

22972

In [78]:
%%time
# Parse every trigram seen more than 10 times with spaCy and keep the parsed doc next to its count.
data_trigrams_sorted_nlp = [
    (trigram, {"doc" : nlp(trigram), "count" : count})
    for trigram, count in data_trigrams_sorted
    if count > 10
]

CPU times: user 58.7 s, sys: 660 ms, total: 59.4 s
Wall time: 59.8 s


In [79]:
# Turn the (trigram, info) pairs into a lookup keyed by the trigram string.
data_trigrams_nlp_dict = dict(data_trigrams_sorted_nlp)

In [80]:
# Persist the trigram -> {"doc", "count"} lookup so the parsing step does not have to be repeated.
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams_nlp_dict, f)

# Explorations

In [109]:
def get_tops(col, matchstring, n=10):
    """Return the ``n`` most frequent non-stop-word unigrams for a subset of documents.

    Parameters
    ----------
    col : str
        Column name, forwarded to ``ids_from_colvals``.
    matchstring : str
        Condition source, forwarded to ``ids_from_colvals``
        (e.g. ``"==1951"`` or ``".between(1950,1959)"``).
    n : int, default 10
        Number of entries to keep.

    Returns
    -------
    list of (str, int)
        ``(token, count)`` pairs sorted by count, highest first. Tokens whose
        lower-cased form is in ``stop_words``, or is one of ``.``, ``-``, ``"``,
        are left out.

    Notes
    -----
    Reads the notebook-level ``unigramCount_dict`` and ``stop_words``.
    """
    ids = ids_from_colvals(col, matchstring)
    counts = merge_data_from_ids(ids, unigramCount_dict)
    # Build the exclusion set once. The previous version rebuilt a list for
    # every token and then scanned it linearly.
    excluded = set(stop_words) | {".", "-", "\""}
    kept = [item for item in counts.items() if item[0].lower() not in excluded]
    return sorted(kept, key=lambda kv: kv[1], reverse=True)[:n]

In [110]:
# Top 20 non-stop-word unigrams for documents whose publicationYear equals 1951.
get_tops("publicationYear", "==1951", 20)

[('Paul', 409),
 ('God', 404),
 ('Testament', 339),
 ('Christian', 331),
 ('New', 281),
 ('Old', 279),
 ('p.', 260),
 ('Jesus', 248),
 ('man', 238),
 ('Christ', 237),
 ('Gospel', 233),
 ('life', 225),
 ('law', 208),
 ('love', 192),
 ('new', 173),
 ('Church', 170),
 ('St.', 168),
 (';', 167),
 ('men', 161),
 ("God's", 156)]

In [111]:
# Same query for publicationYear between 1950 and 1959, with the default n=10.
get_tops("publicationYear", ".between(1950,1959)")

[('God', 5002),
 ('Paul', 3773),
 ('Christian', 3543),
 ('Christ', 3098),
 ('New', 2981),
 ('Church', 2966),
 ('Jesus', 2862),
 ('p.', 2506),
 ('life', 2377),
 ('St.', 2364)]

In [112]:
# Dry run: print the "start,end" year pair built for each decade of the 1900s.
for decade_n in range(10):
    print("19{0}0,19{0}9".format(decade_n))

1900,1909
1910,1919
1920,1929
1930,1939
1940,1949
1950,1959
1960,1969
1970,1979
1980,1989
1990,1999


In [113]:
# Top 20 non-stop-word unigrams for every decade of the 1900s.
for decade_n in range(10):
    # "start,end" pair that is dropped into a `.between(...)` condition.
    decade = "19{0}0,19{0}9".format(decade_n)
    print(decade, get_tops("publicationYear", ".between({0})".format(decade), n=20))


1900,1909 [('Proc.', 3172), ('d.', 2347), (':', 2342), ('2', 2219), ('de', 2113), ('Jesus', 2013), ('u.', 1737), ('&', 1650), ('God', 1647), ('New', 1622), ('Christian', 1493), ('der', 1478), ('Paul', 1440), ('3', 1371), ('des', 1366), ('Testament', 1330), ('Christ', 1290), ('?', 1251), ('et', 1209), ('1', 1179)]
1910,1919 [('Jesus', 1776), ('God', 1736), ('Christian', 1542), ('great', 1405), ('church', 1200), ('life', 1182), ('time', 1177), ('p.', 1171), ('man', 1128), ('Paul', 1126), ('New', 1096), ('religious', 950), ('Christ', 905), ('men', 893), ('shall', 878), ('?', 858), (';', 845), ('found', 812), ('new', 794), ('work', 787)]
1920,1929 [('|', 7527), ('p.', 2363), ('pp.', 1682), ('New', 1568), (':', 1110), ('Christian', 1054), ('1', 900), ('work', 871), ('Paul', 852), ('St.', 845), (';', 811), ('great', 788), ('time', 764), ('God', 754), ('religious', 725), ('A.', 702), ('Jewish', 692), ('American', 683), ("'", 635), ('new', 629)]
1930,1939 [('et', 2512), ('p.', 1790), ('God', 1

In [102]:
# Bigram counts, consumed by get_top_bigrams below.
# NOTE(review): an earlier cell loads the same file but shows no execution count.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("../data/large_files/bigramCount_dict.pickle", "rb") as f:
    bigramCount_dict = pickle.load(f)

In [107]:
def get_top_bigrams(col, matchstring, n=10):
    """Return the ``n`` most frequent bigrams for the documents matching a column condition.

    ``col`` and ``matchstring`` are forwarded unchanged to ``ids_from_colvals``;
    the counts for the selected ids are merged from the notebook-level
    ``bigramCount_dict``. No stop-word filtering is applied.

    Returns a list of ``(bigram, count)`` pairs, highest count first.
    """
    matching_ids = ids_from_colvals(col, matchstring)
    ranked = list(merge_data_from_ids(matching_ids, bigramCount_dict).items())
    # Stable in-place sort, descending by count.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [108]:
# Top 10 bigrams for every decade of the 1900s.
for decade_n in range(10):
    # "start,end" pair that is dropped into a `.between(...)` condition.
    decade = "19{0}0,19{0}9".format(decade_n)
    print(decade, get_top_bigrams("publicationYear", ".between({0})".format(decade), n=10))

1900,1909 [('of the', 21104), ('in the', 8596), ('to the', 5662), ('and the', 3227), ('- -', 3015), ('. .', 2882), ('that the', 2608), ('on the', 2244), ('by the', 2183), ('to be', 2122)]
1910,1919 [('of the', 18350), ('in the', 8101), ('to the', 5355), ('and the', 3300), ('that the', 2540), ('. .', 2416), ('on the', 2058), ('to be', 2054), ('from the', 2042), ('with the', 1988)]
1920,1929 [('of the', 13452), ('in the', 5457), ('. .', 4018), ('to the', 3686), ('and the', 2241), ('on the', 1742), ('for the', 1564), ('with the', 1452), ('from the', 1426), ('by the', 1419)]
1930,1939 [('of the', 13587), ('in the', 5959), ('to the', 4491), ('and the', 2441), ('that the', 1689), ('for the', 1599), ('by the', 1516), ('to be', 1460), ('with the', 1453), ('on the', 1420)]
1940,1949 [('of the', 21661), ('in the', 9715), ('to the', 6489), ('and the', 3567), ('that the', 3104), ('to be', 2730), ('on the', 2425), ('for the', 2410), ('with the', 2384), ('by the', 2219)]
1950,1959 [('of the', 37166)