In [90]:
import pandas as pd
import pickle
import numpy as np
from collections import Counter
from gensim import corpora

In [2]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# open a session against the sciencedata.dk storage (the cell output reports the configured endpoint)
s = sddk.cloudSession("sciencedata.dk")
# establish connection with Google Sheets...
# fetch the service-account key from sciencedata.dk; the "dict" argument presumably
# makes read_file return the parsed JSON as a dict -- TODO confirm against sddk
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
# build google-auth credentials from the key data
credentials = service_account.Credentials.from_service_account_info(file_data)
# authorise a gspread client with the spreadsheets-feeds and Drive scopes
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# open the target spreadsheet by its URL
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

endpoint variable has been configured to: https://sciencedata.dk/files/


In [3]:
# Load the jstor_df dataframe from its feather file; later cells read its "id" and "publicationYear" columns.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [4]:
# Load the per-document n-gram counts: a mapping from document id to a mapping of n-gram -> count.
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle as soon as loading is done
# (a bare open() inside pickle.load() would leave it open).
with open("../data/large_files/trigramCount_cleaned_dict.pickle", "rb") as pickle_file:
    trigramCount_cleaned_dict = pickle.load(pickle_file)

In [5]:
# Use the first key of the dictionary as a sample document id.
# NOTE: the name `id` shadows the builtin id(); it is kept because the next cell reads it.
id = list(trigramCount_cleaned_dict)[0]

In [6]:
# Ten most frequent entries of the sample document, highest count first.
sorted(
    trigramCount_cleaned_dict[id].items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

[('God', 40),
 ('have', 38),
 ('be', 28),
 ('life', 28),
 ('man', 27),
 ('know', 25),
 ('do', 24),
 ('see', 24),
 ('way', 19),
 ('come', 18)]

# Exploring all trigrams together

In [7]:
def merge_data_from_ids(ids, datadict):
    """Add up the count mappings of several entries into one Counter.

    ids      -- iterable of keys to look up in `datadict`
    datadict -- mapping from key to a count mapping (anything Counter.update accepts)

    Returns a Counter holding the summed counts of all selected entries.
    A key that is missing from `datadict` raises KeyError.
    """
    merged = Counter()
    for doc_id in ids:
        merged.update(datadict[doc_id])
    return merged

In [8]:
# NOTE(review): looks like a leftover kernel sanity check -- nothing else in the visible notebook depends on it.
print("hello")

hello


In [9]:
# Every value of the "id" column, as a plain Python list.
all_ids = jstor_df.loc[:, "id"].tolist()

In [10]:
# Merge the n-gram counts of all documents into one corpus-wide Counter
# and show how many distinct n-grams it contains.
data_trigrams = merge_data_from_ids(ids=all_ids, datadict=trigramCount_cleaned_dict)
len(data_trigrams)

55806863

In [13]:
# Number of distinct n-grams that occur more than 1000 times.
sum(1 for count in data_trigrams.values() if count > 1000)

4184

In [14]:
# Number of distinct n-grams that occur more than 100 times.
sum(1 for count in data_trigrams.values() if count > 100)

53686

In [15]:
# Number of distinct n-grams that occur more than 10 times.
sum(1 for count in data_trigrams.values() if count > 10)

965066

In [16]:
# Number of distinct n-grams that occur more than once.
sum(1 for count in data_trigrams.values() if count > 1)

14519930

In [17]:
# How many entries really consist of three or more whitespace-separated words?
sum(1 for ngram in data_trigrams if len(ngram.split()) > 2)

54627127

In [23]:
# How many entries consist of at least two whitespace-separated words (bigrams or trigrams)?
sum(1 for ngram in data_trigrams if len(ngram.split()) >= 2)

55742943

In [19]:
# How many "trigram" entries are in fact single words (after preprocessing)?
sum(1 for ngram in data_trigrams if len(ngram.split()) == 1)

63918

In [21]:
# Total number of n-gram instances, i.e. the sum of all counts.
sum(data_trigrams.values())

143358401

In [22]:
# Number of instances covered by single-word entries, i.e. useless for our analysis.
sum(count for ngram, count in data_trigrams.items() if len(ngram.split()) == 1)

31706486

# Exploring trigrams over time

In [24]:
def ids_from_colvals(df_name, col, matchstring):
    """Return the "id" values of the rows whose `col` value satisfies a condition.

    df_name     -- a DataFrame, or (as before) a string that evaluates to one in
                   the global namespace, e.g. "jstor_df"
    col         -- label of the column the condition is applied to
    matchstring -- Python source appended to the column to build the row mask,
                   e.g. ".between(1920, 1929)" or " > 1950"

    Returns a list with the "id" of every matching row.

    NOTE(security): `matchstring` (and `df_name` when it is a string) are passed
    to eval(); never feed them text from an untrusted source.
    """
    # Resolve the frame once. Accepting the object itself makes the helper usable
    # outside the namespace that happens to hold the variable.
    df = eval(df_name, globals()) if isinstance(df_name, str) else df_name
    # Look the column up directly instead of pasting its label into the evaluated
    # source, so labels containing quotes or backslashes work.
    mask = eval("_column" + matchstring, globals(), {"_column": df[col]})
    return df[mask]["id"].tolist()

def merge_data_from_ids(ids, datadict):
    """Combine the per-key count mappings selected by `ids` into a single Counter.

    Each id is looked up in `datadict` (KeyError if absent) and its counts are
    added to the running total; the total is returned.
    """
    total = Counter()
    for counts in (datadict[key] for key in ids):
        total.update(counts)
    return total

def get_tops(df_name, col, matchstring, n=10):
    """Return the `n` most frequent non-empty entries among the selected rows.

    The rows are chosen with ids_from_colvals(df_name, col, matchstring), their
    counts from trigramCount_cleaned_dict are merged, the empty-string entry is
    dropped, and the result is a list of (entry, count) tuples sorted by
    descending count.

    Note: a later cell of this notebook defines get_tops again with a stricter filter.
    """
    selected_ids = ids_from_colvals(df_name, col, matchstring)
    merged = merge_data_from_ids(selected_ids, trigramCount_cleaned_dict)
    ranked = sorted(
        ((ngram, count) for ngram, count in merged.items() if ngram != ""),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Year ranges, one per decade from the 1900s to the 2010s, each written as
# "first,last" so it can be dropped between the parentheses of `.between(...)`.
decades = ["19{0}0,19{0}9".format(str(decade_n)) for decade_n in range(0, 10)]
decades += ["2000,2009", "2010,2019"]
decades

['1900,1909',
 '1910,1919',
 '1920,1929',
 '1930,1939',
 '1940,1949',
 '1950,1959',
 '1960,1969',
 '1970,1979',
 '1980,1989',
 '1990,1999',
 '2000,2009',
 '2010,2019']

In [25]:
# Try the selector on one decade: ids of the rows whose publicationYear is between 1920 and 1929.
ids = ids_from_colvals(
    df_name="jstor_df",
    col="publicationYear",
    matchstring=".between(1920, 1929)",
)
len(ids)

132

In [26]:
# Merged n-gram counts of the documents selected in the previous cell.
c = merge_data_from_ids(ids=ids, datadict=trigramCount_cleaned_dict)

In [27]:
# Ten highest counts of the merged Counter (ties keep first-encountered order,
# the same order a stable descending sort would give).
c.most_common(10)

[('have', 3081),
 ('do', 1380),
 ('be', 1229),
 ('time', 1207),
 ('work', 1206),
 ('make', 1180),
 ('say', 1105),
 ('find', 1025),
 ('God', 999),
 ('give', 966)]

In [28]:
def get_tops(df_name, col, matchstring, n=10):
    """Return the `n` most frequent multi-word n-grams among the selected rows.

    The rows are chosen with ids_from_colvals(df_name, col, matchstring) and their
    counts from trigramCount_cleaned_dict are merged. Entries with fewer than two
    whitespace-separated words are discarded. The result is a list of
    (n-gram, count) tuples sorted by descending count.
    """
    selected_ids = ids_from_colvals(df_name, col, matchstring)
    merged = merge_data_from_ids(selected_ids, trigramCount_cleaned_dict)
    multiword = []
    for ngram, count in merged.items():
        if len(ngram.split()) > 1:
            multiword.append((ngram, count))
    # list.sort is stable, so ties keep their original relative order
    multiword.sort(key=lambda pair: pair[1], reverse=True)
    return multiword[:n]

In [29]:
# Print the 30 most frequent multi-word n-grams of each decade, one list per line.
for decade in decades:
    decade_filter = ".between({0})".format(decade)
    top_ngrams = get_tops("jstor_df", "publicationYear", decade_filter, n=30)
    print(top_ngrams)

[('New Testament', 1097), ('Old Testament', 783), ('American Journal', 764), ('Christ Jesus', 515), ('Journal Theology', 363), ('hand other', 348), ('same time', 260), ('Epic Indian', 224), ('Biblical World', 217), ('Paul St', 202), ('Paul Saint', 176), ('Fourth Gospel', 172), ('century second', 168), ('School Sunday', 159), ('point view', 157), ('Holy Spirit', 151), ('Man Son', 142), ('States United', 141), ('place take', 134), ('birth virgin', 130), ('New York', 124), ('Jesus life', 120), ('X X', 120), ('Paul apostle', 114), ('J P Peters', 114), ('have seem', 111), ('century first', 107), ('Christ resurrection', 104), ('Apostle Paul', 103), ('Biblical Literature', 102)]
[('New Testament', 821), ('American Journal', 534), ('Old Testament', 313), ('Christ Jesus', 308), ('Paul St', 232), ('Journal Theology', 228), ('same time', 224), ('hand other', 223), ('Biblical World', 216), ('point view', 165), ('place take', 162), ('Apostle Paul', 160), ('Holy Spirit', 156), ('Fourth Gospel', 151)

In [30]:
# Spelling/capitalisation replacements, listed as (variant, replacement) pairs.
# NOTE(review): nothing in the visible notebook applies this table yet.
to_replace = dict(
    (
        ("christian", "Christian"),
        ("Biblical", "biblical"),
        ("Church", "church"),
        ("Faith", "faith"),
        ("Apostle", "apostle"),
        ("american", "American"),
        ("jewish", "Jewish"),
        ("old Testament", "Old Testament"),
        ("Paul St", "Paul saint"),
        ("Lord", "lord"),
    )
)

# Preprocessing for embeddings etc.


In [103]:
# Load the merged unigram counts (a mapping from word to its count).
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
# The context manager closes the file handle as soon as loading is done
# (a bare open() inside pickle.load() would leave it open).
with open("../data/large_files/unigrams_merged_cleaned.pickle", "rb") as pickle_file:
    unigrams_merged_cleaned = pickle.load(pickle_file)

In [104]:
# number of types, i.e. distinct entries in the merged unigram mapping
types_N = len(unigrams_merged_cleaned)
types_N

1349906

In [105]:
# number of tokens, i.e. the sum of all unigram counts
tokens_N = sum(unigrams_merged_cleaned.values())
tokens_N

71637349

In [118]:
# Keep only the unigrams that occur at least `threshold` times and count the surviving types.
threshold = 50
unigrams_merged_thresh = {
    word: count
    for word, count in unigrams_merged_cleaned.items()
    if count >= threshold
}
types_N_thres = len(unigrams_merged_thresh)
types_N_thres

45412

In [119]:
# number of tokens covered by the unigrams that passed the threshold
tokens_N_thres = sum(unigrams_merged_thresh.values())
tokens_N_thres

67711862

In [120]:
# Percentage of types (first line) and of tokens (second line) kept by the threshold.
print(
    np.round(types_N_thres / types_N * 100, 2),
    np.round(tokens_N_thres / tokens_N * 100, 2),
    sep="\n",
)

3.36
94.52


In [123]:
# (word, count) pairs of the thresholded unigrams, most frequent first; show the top ten.
unigrams_sorted_tups = sorted(
    unigrams_merged_thresh.items(),
    key=lambda item: item[1],
    reverse=True,
)
unigrams_sorted_tups[:10]

[('have', 1104349),
 ('God', 497732),
 ('do', 464566),
 ('Paul', 407736),
 ('church', 378646),
 ('see', 373526),
 ('Christian', 319494),
 ('other', 294611),
 ('new', 286333),
 ('Jesus', 281420)]

In [124]:
# the words alone, in descending order of frequency
vocabulary = [word for word, _count in unigrams_sorted_tups]

In [125]:
%%time
# Build a gensim Dictionary from the thresholded vocabulary, passed in as a single "document".
dictionary = corpora.Dictionary([vocabulary])

CPU times: user 85.1 ms, sys: 26.6 ms, total: 112 ms
Wall time: 117 ms


In [129]:
# integer id the dictionary assigned to the token "Christ"
dictionary.token2id["Christ"]

3026

In [130]:
# Bag-of-words for a tiny example document, as (token id, count) pairs.
# The output has two entries for three tokens, so the token missing from the dictionary is left out.
dictionary.doc2bow(["Jesus", "Christ", "Vojtěch"])

[(3026, 1), (7872, 1)]

In [68]:
%%time
# For every word, add up the counts of the distinct n-grams it occurs in.
words_counter = Counter()
# Iterate over the items view directly: copying ~55M pairs into a list first only costs memory.
for ngram, ngram_count in data_trigrams.items():
    # dict.fromkeys de-duplicates, so a word repeated inside one n-gram (e.g. "X X")
    # is credited with that n-gram's count only once, as before.
    words_counter.update(dict.fromkeys(ngram.split(), ngram_count))

CPU times: user 1min 30s, sys: 5.68 s, total: 1min 35s
Wall time: 1min 38s


In [75]:
# The 100 words with the highest accumulated counts (ties keep first-encountered order).
words_counter.most_common(100)

[('the', 11069425),
 ('of', 7794989),
 ('and', 7658222),
 ('in', 5068121),
 ('to', 4724140),
 ('a', 3412881),
 ('that', 2592173),
 ('is', 2544188),
 ('The', 2117179),
 ('as', 1926657),
 ('for', 1719904),
 ('God', 1466690),
 ('with', 1465979),
 ('have', 1436410),
 ('by', 1430733),
 ('his', 1366978),
 ('was', 1306292),
 ('this', 1159116),
 ('on', 1147087),
 ('from', 1130107),
 ('or', 1124321),
 ('be', 1098792),
 ('it', 1095173),
 ('not', 1054076),
 ('he', 1009057),
 ('Paul', 989362),
 ('are', 981070),
 ('which', 938571),
 ('I', 856019),
 ('but', 854734),
 ('their', 823829),
 ('Jesus', 818779),
 ('an', 775872),
 ('at', 720394),
 ('Christ', 717573),
 ('other', 713187),
 ('In', 709537),
 ('who', 690035),
 ('do', 674599),
 ('see', 673994),
 ('one', 665063),
 ('were', 647401),
 ('they', 600823),
 ('all', 573193),
 ('church', 565475),
 ('also', 551245),
 ('Christian', 548838),
 ('1', 544121),
 ('A', 537050),
 ('has', 530712),
 ('more', 525323),
 ('had', 521816),
 ('life', 511836),
 ('we', 5114

In [49]:
# number of entries in `types`
# NOTE(review): `types` is only assigned in a later cell of the visible notebook, and that cell has no execution count.
len(types)

2115985

In [52]:
# peek at the first ten entries of `types`
types[:10]

['REGEMORTER',
 'patenand',
 'nonetheless6',
 'Albergatis',
 'TsGALI',
 '利瑪竇',
 'agesperhaps',
 'dimness',
 'Bane',
 'stereotypes40']

In [53]:
# Build a gensim Dictionary from `types`, passed in as a single "document".
# NOTE(review): on a top-to-bottom run this rebinds `dictionary`, replacing the one built
# from `vocabulary` in an earlier cell -- confirm which of the two is meant to be kept.
dictionary = corpora.Dictionary([types])

In [54]:
# check what kind of object `dictionary` is
type(dictionary)

gensim.corpora.dictionary.Dictionary

In [60]:
# NOTE(review): the key here is the string "100000", not an integer -- confirm what
# `dictionary` held when this cell ran and whether an integer token id was intended.
dictionary["100000"]

'19918788'

In [41]:
# number of entries in `tokens`
# NOTE(review): `tokens` is not assigned anywhere in the visible part of the notebook.
len(tokens)

166178449

In [None]:
# Distinct token types; the order is arbitrary because it comes from a set.
types = list(set(tokens))

In [36]:
# Write `dictionary` to disk.
# NOTE(review): `dictionary` is rebound several times in this notebook and the execution
# counts are out of order -- confirm which version ends up in this file.
dictionary.save("../data/dictionary_v1.dict")

In [37]:
# first ten (id, entry) pairs of `dictionary`
# NOTE(review): the stored output lists document identifiers (ark:// and jstor URLs), not words --
# it looks like `dictionary` was built from something else when this cell ran; confirm.
list(dictionary.items())[:10]

[(0, 'ark://27927/phx66812gq6'),
 (1, 'ark://27927/pbd6fpf5fh'),
 (2, 'ark://27927/phw1kd8s300'),
 (3, 'ark://27927/phx64fptrwj'),
 (4, 'ark://27927/phx64k1x5c2'),
 (5, 'ark://27927/phx64fkrk6m'),
 (6, 'http://www.jstor.org/stable/43052718'),
 (7, 'ark://27927/phx68d6dm3t'),
 (8, 'ark://27927/pbd934r3jr'),
 (9, 'ark://27927/phx2t1wjwnt')]