In [None]:
import pandas as pd
import pickle
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy

In [None]:
# to communicate with google spreadsheet...
import gspread
from gspread_dataframe import get_as_dataframe
from gspread_dataframe import set_with_dataframe
from google.oauth2 import service_account # based on google-auth library
import sddk

# Open a session against sciencedata.dk through sddk.
# NOTE(review): presumably this prompts for credentials interactively — confirm.
s = sddk.cloudSession("sciencedata.dk")
# establish connection with Google Sheets...
# Read the service-account key file; the "dict" argument presumably makes read_file return parsed JSON as a dict.
file_data = s.read_file("https://sciencedata.dk/files/ServiceAccountsKey.json", "dict") # or load it from a local storage: json.load(open("../../ServiceAccountsKey.json", "r"))
# Build service-account credentials from the key data.
credentials = service_account.Credentials.from_service_account_info(file_data)
# Create a gspread client authorized with the spreadsheet-feeds and Drive scopes.
gc = gspread.Client(auth=credentials.with_scopes(['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive']))
# Open the target spreadsheet by URL; later cells add worksheets to `mops_data`.
mops_data = gc.open_by_url("https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing")

In [None]:
# Load the JSTOR metadata table from a feather file (path is relative to the notebook's working directory).
# Later cells read its "id", "publicationYear", "wordCount", "docType", "isPartOf" and "title" columns.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [None]:
# Load the per-document unigram counts (looked up by document id in later cells).
# The context manager closes the file handle as soon as loading finishes.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open("../data/large_files/unigramCount_cleaned_dict.pickle", "rb") as pickle_file:
    unigramCount_dict = pickle.load(pickle_file)

In [None]:
# Peek at the value stored for the first key, without copying every item of the dict into a list.
next(iter(unigramCount_dict.values()))

In [None]:
# manual test: assemble the row-filter expression as text, evaluate it,
# and list the ids of the matching rows
col = "publicationYear"
matchstring = "==1951"
filter_expr = 'jstor_df[jstor_df["{0}"]{1}]'.format(col, matchstring)
eval(filter_expr)["id"].tolist()

In [None]:
def ids_from_colvals(df_name, col, matchstring):
    """Return the "id" values of the rows whose `col` satisfies `matchstring`.

    Parameters
    ----------
    df_name : str or pandas.DataFrame
        Either a Python expression naming the DataFrame (e.g. "jstor_df"),
        which is evaluated to obtain the frame, or the DataFrame itself.
    col : str
        Name of the column the condition is applied to. It is used as a
        plain key, so names containing quotes or backslashes work.
    matchstring : str
        Python source appended to the column Series to build a boolean
        mask, e.g. "==1951" or ".between(1920, 1929)".

    Returns
    -------
    list
        Values of the "id" column for the matching rows.
    """
    # SECURITY: eval executes arbitrary Python. `df_name` (when a string) and
    # `matchstring` must come from trusted notebook code, never from user input.
    _frame = eval(df_name) if isinstance(df_name, str) else df_name
    # Only the condition is evaluated as code; the column is looked up by key
    # instead of being pasted into the generated source.
    _column = _frame[col]
    _mask = eval("_column{0}".format(matchstring))
    return _frame[_mask]["id"].tolist()

In [None]:
# test with function: count the ids returned for publicationYear matching ".between(1920, 1929)"
len(ids_from_colvals("jstor_df", "publicationYear", ".between(1920, 1929)"))

In [None]:
def merge_data_from_ids(ids, datadict):
    """Add up the count mappings stored in `datadict` for every id in `ids`.

    Each id is looked up with `datadict[...]`, so an id that is missing
    from `datadict` raises KeyError. Returns a `Counter` holding the
    summed counts (empty when `ids` is empty).
    """
    merged = Counter()
    for doc_id in ids:
        merged.update(datadict[doc_id])
    return merged

In [None]:
def get_tops(df_name, col, matchstring, n=10):
    """Return the `n` most frequent unigrams among the rows matching a condition.

    The rows are selected with `ids_from_colvals(df_name, col, matchstring)`,
    and their counts are merged from the module-level `unigramCount_dict`.
    The empty-string token is left out. The result is a list of
    `(token, count)` tuples sorted by descending count.
    """
    matching_ids = ids_from_colvals(df_name, col, matchstring)
    merged_counts = merge_data_from_ids(matching_ids, unigramCount_dict)
    ranked = sorted(
        (pair for pair in merged_counts.items() if pair[0] != ""),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

In [None]:
# Decade ranges written as "start,end" strings, ready to be inserted into ".between({0})".
decades = ["19{0}0,19{0}9".format(digit) for digit in range(10)]
decades.extend(["2000,2009", "2010,2019"])
decades

In [None]:
def pct_frequency(df_name, col, matchstring, wordlist):
    """Return the percentage frequency of each word within a subset of documents.

    The subset is the rows of the DataFrame named `df_name` whose `col`
    satisfies `matchstring` (see `ids_from_colvals`). For every word in
    `wordlist`, the value is its merged count from `unigramCount_dict`
    divided by the summed "wordCount" of the subset, times 100.

    Returns a dict mapping each word to that percentage.
    """
    # The local name `ids` must stay as is: the evaluated expression below refers to it.
    ids = ids_from_colvals(df_name, col, matchstring)
    subset_expr = '{0}[{0}["id"].isin(ids)]'.format(df_name)
    matching_rows = eval(subset_expr)
    total_word_count = matching_rows["wordCount"].sum()
    merged_counts = merge_data_from_ids(ids, unigramCount_dict)
    return {
        word: (merged_counts[word] / total_word_count) * 100
        for word in wordlist
    }

In [None]:
# Percentage frequency of a few key terms, one record per decade.
data = []
# The loop variable has to be `decade`, because the body reads `decade`
# (it used to be `decade_n`, which left `decade` undefined or stale).
for decade in decades:
    decade_data = {"decade" : decade}
    decade_data.update(pct_frequency("jstor_df", "publicationYear", ".between({0})".format(decade), ["God", "Paul", "Jesus", "Christ"]))
    data.append(decade_data)

In [None]:
# Show the per-decade records collected in `data` as a table (one row per record).
pd.DataFrame(data)

In [None]:
# Terms whose percentage frequency is tracked decade by decade across the whole corpus.
words = ["social", "gender", "Sanders"]
data = []
for decade in decades:
    year_filter = ".between({0})".format(decade)
    decade_data = {
        "decade": decade,
        **pct_frequency("jstor_df", "publicationYear", year_filter, words),
    }
    data.append(decade_data)
data_df = pd.DataFrame(data)
data_df


In [None]:
# Line plot of the frequency columns, with one x tick per row labelled by its decade.
decade_labels = data_df["decade"].tolist()
ax = data_df.plot()
ax.set_xticks(range(len(decade_labels)))
ax.set_xticklabels(decade_labels, rotation=90)

In [None]:
# Track "authenticity" vocabulary across the decades of the whole corpus.
words = ["authenticity", "authentic"]
data = []
for decade in decades:
    year_filter = ".between({0})".format(decade)
    decade_data = {"decade": decade}
    decade_data.update(pct_frequency("jstor_df", "publicationYear", year_filter, words))
    data.append(decade_data)
data_df = pd.DataFrame(data)
data_df

# Plot the frequencies and label every x tick with its decade.
decade_labels = data_df["decade"].tolist()
ax = data_df.plot()
ax.set_xticks(range(len(decade_labels)))
ax.set_xticklabels(decade_labels, rotation=90)

In [None]:
# Keep only the rows whose docType is "article", then report how many there are.
is_article = jstor_df["docType"] == "article"
jstor_articles = jstor_df[is_article]
len(jstor_articles)

In [None]:
# Vocabulary of the sciences and of interdisciplinarity, measured on articles only.
words = ["scientific", "sociology", "anthropology", "psychology", "postmodern", "interdisciplinary"]
data = []
for decade in decades:
    year_filter = ".between({0})".format(decade)
    decade_data = {
        "decade": decade,
        **pct_frequency("jstor_articles", "publicationYear", year_filter, words),
    }
    data.append(decade_data)
data_df = pd.DataFrame(data)
data_df

# Plot the frequencies and label every x tick with its decade.
decade_labels = data_df["decade"].tolist()
ax = data_df.plot()
ax.set_xticks(range(len(decade_labels)))
ax.set_xticklabels(decade_labels, rotation=90)

In [None]:
# Mentions of major theorists, measured on articles only.
# NOTE(review): this loop builds only the ten 1900-1999 decades itself instead of
# iterating over `decades`, so 2000-2019 is not included — confirm this is intended.
words = ["Durkheim", "Freud", "Jung", "Marx", "Darwin", "Spencer"]
data = []
for decade_n in range(10):
    decade = "19{0}0,19{0}9".format(decade_n)
    year_filter = ".between({0})".format(decade)
    decade_data = {"decade": decade}
    decade_data.update(pct_frequency("jstor_articles", "publicationYear", year_filter, words))
    data.append(decade_data)
data_df = pd.DataFrame(data)
data_df

# Plot the frequencies and label every x tick with its decade.
decade_labels = data_df["decade"].tolist()
ax = data_df.plot()
ax.set_xticks(range(len(decade_labels)))
ax.set_xticklabels(decade_labels, rotation=90)

# Exploring interest in science in the 1920s

In [None]:
# test with function: collect the ids matched for publicationYear ".between(1920, 1929)",
# keep the jstor_df rows carrying those ids, and preview the first 20 of them
ids = ids_from_colvals("jstor_df", "publicationYear", ".between(1920, 1929)")
jstor_1920 = jstor_df[jstor_df["id"].isin(ids)]
jstor_1920.head(20)

In [None]:
# Add a worksheet titled "jstor_1920" to the spreadsheet and write the jstor_1920 DataFrame into it.
# The two 1s are presumably the initial row and column counts of the new sheet — confirm against gspread.
# NOTE(review): re-running this cell presumably fails once a worksheet with this title exists — confirm.
set_with_dataframe(mops_data.add_worksheet("jstor_1920", 1,1), jstor_1920)

In [None]:
# Rows of jstor_df whose id is in `ids`.
# The trailing empty subscript `[]` was a syntax error and has been removed.
jstor_df[jstor_df["id"].isin(ids)]


In [None]:
# Make sure the two 21st-century ranges are present in `decades`, adding only the
# ones that are missing so that re-running this cell never creates duplicate decades.
decades += [span for span in ["2000,2009", "2010,2019"] if span not in decades]

In [None]:
# Doctrinal and theological vocabulary, measured on articles only.
# NOTE(review): this loop builds only the ten 1900-1999 decades itself instead of
# iterating over `decades`, so 2000-2019 is not included — confirm this is intended.
words = ["doctrine", "doctrinal", "dogma", "dogmatic", "theology", "theological"]
data = []
for decade_n in range(10):
    decade = "19{0}0,19{0}9".format(decade_n)
    year_filter = ".between({0})".format(decade)
    decade_data = {
        "decade": decade,
        **pct_frequency("jstor_articles", "publicationYear", year_filter, words),
    }
    data.append(decade_data)
data_df = pd.DataFrame(data)
data_df

# Plot the frequencies and label every x tick with its decade.
decade_labels = data_df["decade"].tolist()
ax = data_df.plot()
ax.set_xticks(range(len(decade_labels)))
ax.set_xticklabels(decade_labels, rotation=90)

In [None]:
# test with function: collect the ids matched for publicationYear ".between(1930, 1939)",
# keep the jstor_df rows carrying those ids, and preview the first 20 of them
ids = ids_from_colvals("jstor_df", "publicationYear", ".between(1930, 1939)")
jstor_1930 = jstor_df[jstor_df["id"].isin(ids)]
jstor_1930.head(20)


In [None]:
# Add a worksheet titled "jstor_1930" to the spreadsheet and write the jstor_1930 DataFrame into it.
# The two 1s are presumably the initial row and column counts of the new sheet — confirm against gspread.
# NOTE(review): re-running this cell presumably fails once a worksheet with this title exists — confirm.
set_with_dataframe(mops_data.add_worksheet("jstor_1930", 1,1), jstor_1930)

In [None]:
# Titles of the rows whose isPartOf value is "The Muslim World".
jstor_df.loc[jstor_df["isPartOf"] == "The Muslim World", "title"]

In [None]:
# Theory and method vocabulary, measured on articles only, for every entry of `decades`.
words = ["theory", "theoretical", "explanation", "interpretation", "method", "methodology", "methodological"]
data = []
for decade in decades:
    year_filter = ".between({0})".format(decade)
    decade_data = {"decade": decade}
    decade_data.update(pct_frequency("jstor_articles", "publicationYear", year_filter, words))
    data.append(decade_data)
data_df = pd.DataFrame(data)
data_df

# Plot the frequencies and label every x tick with its decade.
decade_labels = data_df["decade"].tolist()
ax = data_df.plot()
ax.set_xticks(range(len(decade_labels)))
ax.set_xticklabels(decade_labels, rotation=90)