In [20]:
import pickle
import numpy as np
import pandas as pd
from gensim.matutils import corpus2csc
from gensim.matutils import corpus2dense
from gensim.corpora import Dictionary

In [3]:
# Article metadata table; later cells filter it on "publicationYear" and read its "id" column.
jstor_df = pd.read_feather("../data/large_files/jstor_df_v1.feather")

In [4]:
# Lookup table used by corpus_from_ids to turn an article id into the short id
# that names the article's pickle file.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../data/article_ids_dict.pickle", "rb") as ids_file:
    article_ids_dict = pickle.load(ids_file)

In [5]:
# Main vocabulary object; it is later passed as id2word to Dictionary.from_corpus.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("../data/dictionary_main.pickle", "rb") as dictionary_file:
    dictionary = pickle.load(dictionary_file)
len(dictionary)

45412

In [6]:
# One "first_year,last_year" string per decade, 1900-1909 through 2010-2019.
# Each string is later spliced into a ".between(...)" filter expression.
decades = ["19{0}0,19{0}9".format(digit) for digit in map(str, range(10))]
decades.extend(["2000,2009", "2010,2019"])
decades

['1900,1909',
 '1910,1919',
 '1920,1929',
 '1930,1939',
 '1940,1949',
 '1950,1959',
 '1960,1969',
 '1970,1979',
 '1980,1989',
 '1990,1999',
 '2000,2009',
 '2010,2019']

In [7]:
def ids_from_colvals(df_name, col, matchstring):
    """Return the ``id`` values of the rows whose ``col`` satisfies a filter.

    Parameters
    ----------
    df_name : str or pandas.DataFrame
        Name (or Python expression) of a DataFrame reachable from the global
        namespace, e.g. ``"jstor_df"``. A DataFrame object is accepted too.
    col : str
        Label of the column whose values are tested.
    matchstring : str
        Python source appended to the column to build a boolean mask, e.g.
        ``".between(1920,1929)"`` or ``"==1925"``.

    Returns
    -------
    list
        Values of the ``"id"`` column for the matching rows.

    Raises
    ------
    NameError
        If ``df_name`` is a string that does not resolve to a global name.
    KeyError
        If ``col`` or ``"id"`` is not a column of the frame.
    """
    # SECURITY NOTE: `df_name` (when a string) and `matchstring` are evaluated
    # as Python code. Never pass text that comes from an untrusted source.
    frame = eval(df_name, globals()) if isinstance(df_name, str) else df_name

    # Index the column directly instead of interpolating its label into source
    # text, so labels containing quotes or backslashes work.
    column_values = frame[col]
    mask = eval(
        "_column_values" + matchstring,
        globals(),
        {"_column_values": column_values},
    )
    return frame[mask]["id"].tolist()

In [8]:
# Work on a single decade: index 2 of `decades` is the string '1920,1929'.
decade = decades[2]
year_filter = ".between({})".format(decade)
ids = ids_from_colvals("jstor_df", "publicationYear", year_filter)
len(ids)

132

In [9]:
def corpus_from_ids(ids, docs_dir="../data/large_files/article_docs"):
    """Load and concatenate the pickled documents of the given articles.

    Parameters
    ----------
    ids : iterable
        Article ids; each must be a key of the global ``article_ids_dict``.
    docs_dir : str, optional
        Directory holding one ``<short_id>.pickle`` file per article.
        Defaults to the location used throughout this notebook.

    Returns
    -------
    list
        All items unpickled from the articles' files, in the order of ``ids``
        (one flat list; an empty list when ``ids`` is empty).

    Raises
    ------
    KeyError
        If an id is missing from ``article_ids_dict``.
    FileNotFoundError
        If an article's pickle file does not exist.
    """
    corpus = []
    for article_id in ids:  # avoid shadowing the builtin `id`
        short_id_str = str(article_ids_dict[article_id])
        doc_path = "{}/{}.pickle".format(docs_dir, short_id_str)
        # Context manager closes each file promptly; the previous bare open()
        # left one handle open per article.
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(doc_path, "rb") as doc_file:
            corpus.extend(pickle.load(doc_file))
    return corpus

In [10]:
# Flat list of every document belonging to the articles selected above
# (corpus_from_ids concatenates the per-article pickles).
corpus = corpus_from_ids(ids)

In [12]:
# NOTE(review): a cell displays only its last expression, so the value of
# len(corpus) is discarded here; only the first ten documents are shown.
len(corpus)
corpus[:10]

[[(2766, 1), (20683, 1), (38516, 1)],
 [(35789, 1), (41829, 1)],
 [(17677, 1), (20366, 1)],
 [(4781, 1), (29109, 1)],
 [(18186, 1), (28298, 1)],
 [(209, 1), (42320, 1)],
 [(7872, 1), (10870, 1)],
 [(16696, 1), (17677, 1)],
 [(35640, 1), (41829, 1)],
 [(19439, 1), (29775, 1), (32952, 1)]]

In [13]:
# Build a gensim Dictionary from the bag-of-words corpus, passing the main
# vocabulary as id2word. Presumably done to get token2id lookups and corpus
# statistics (e.g. num_docs) for this decade's documents — confirm.
dct_corpus = Dictionary.from_corpus(corpus, id2word=dictionary)

In [14]:
len(dct_corpus)

45412

In [15]:
dct_corpus.num_docs

494101

In [48]:
# Multi-word dictionary entries that contain the substring "Paul".
[
    token
    for _, token in dct_corpus.items()
    if len(token.split()) > 1 and "Paul" in token
]

['Apollos Paul',
 'Deutero Pauline',
 'Deutero Paulines',
 'Jean Paul',
 'Jesus Paul',
 'Paul Saul',
 'Paul Stephen',
 'Paul jerusalempillars',
 'Paul s',
 'Pauline deutero',
 'Pauline pseudo',
 'Pauline un',
 'Pauly Wissowa']

In [29]:
dct_corpus.token2id["Christ"]

3026

In [30]:
dct_corpus.token2id["Jesus"]

7872

In [39]:
# Sanity check: number of documents that contain both "Christ" and "Jesus".
# The ids are looked up from the dictionary instead of being hard-coded, so the
# check stays correct if the vocabulary (and hence the id assignment) changes.
christ_id = dct_corpus.token2id["Christ"]
jesus_id = dct_corpus.token2id["Jesus"]
sum(
    1
    for doc in corpus
    if {christ_id, jesus_id} <= {term_id for term_id, _ in doc}
)

364

In [17]:
# Sparse term-document matrix: one row per term, one column per document
# (see the shape printed below). num_terms is set to the dictionary size so
# the row count matches the vocabulary rather than being inferred from the corpus.
term_doc_mat = corpus2csc(corpus, num_terms=len(dct_corpus))

In [18]:
type(term_doc_mat)

scipy.sparse._csc.csc_matrix

In [19]:
term_doc_mat.shape

(45412, 494101)

In [21]:
# Term-by-term co-occurrence matrix: entry (i, j) is the sum over all documents
# of count(term i) * count(term j).
# The sparse-aware `@` operator is used because NumPy's np.dot does not
# understand scipy.sparse objects (SciPy's documentation advises against it).
cooc_mat = term_doc_mat @ term_doc_mat.T

In [25]:
# WARNING: this materialises a dense (n_terms x n_terms) array; with the
# 45412-term vocabulary that is roughly 16 GB if the values are float64.
# Single entries can be read from the sparse matrix directly (cooc_mat[i, j]).
# .toarray() returns a plain ndarray instead of the discouraged np.matrix that
# .todense() produces; element indexing with [i, j] works the same way.
cooc_mat_dense = cooc_mat.toarray()

In [40]:
# Diagonal entry of the co-occurrence matrix for the target term "Christ".
# NOTE(review): with cooc_mat = X . X^T this is the sum of squared per-document
# counts; it equals the term's frequency only if every count is 0 or 1 — confirm.
cooc_mat_dense[dct_corpus.token2id["Christ"],dct_corpus.token2id["Christ"]]

1765.0

In [38]:
# Co-occurrence of "Christ" with "Jesus": an off-diagonal entry of the
# co-occurrence matrix (row "Christ", column "Jesus"). It should agree with the
# direct document count computed over `corpus` in an earlier cell.
cooc_mat_dense[dct_corpus.token2id["Christ"],dct_corpus.token2id["Jesus"]]

364.0

In [33]:

# NOTE(review): densifying the whole term-document matrix allocates
# n_terms x n_docs values, which may not fit in memory for this corpus.
# NOTE(review): the recorded output shows 45287 rows while term_doc_mat.shape
# reports 45412 — the output looks stale (produced by an earlier version of
# term_doc_mat); re-run after Restart & Run All to confirm.
term_doc_mat_dense = term_doc_mat.todense()
term_doc_mat_dense.shape

(45287, 494101)

In [None]:
# Dense term-document array built directly from the bag-of-words corpus, with
# the number of terms taken from the dictionary size.
term_doc_mat_dense = corpus2dense(corpus, len(dct_corpus))
# `shape` is an attribute (a tuple), not a method; calling it raises TypeError.
term_doc_mat_dense.shape

In [None]:
corpus2dense

In [32]:
term_doc_mat.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
# NOTE(review): this call fails — the sparse matrix has no `.index()` method
# (see the AttributeError recorded below). If the positions of the stored
# entries were wanted, `.nonzero()` may be what was intended — confirm or delete.
term_doc_mat.index()

AttributeError: index not found

In [36]:
# NOTE(review): this call fails — the dense 'matrix' object has no `.index()`
# method (see the AttributeError recorded below). Leftover experiment; confirm
# the intent or delete the cell.
term_doc_mat_dense.index()

AttributeError: 'matrix' object has no attribute 'index'

In [1]:
# NOTE(review): the recorded NameError and the execution count of 1 indicate
# this cell was run on a fresh kernel, before term_doc_mat_dense was defined.
# It only works after the cells that build term_doc_mat_dense have been executed.
term_doc_mat_dense.nonzero()

NameError: name 'term_doc_mat_dense' is not defined

In [None]:
# Reference snippet (this cell has no execution count): build a sparse
# term-document matrix starting from a fresh Dictionary.
# NOTE(review): both imports duplicate the import cell at the top of the notebook.
from gensim.matutils import corpus2csc
from gensim.corpora import Dictionary

# somehow create your corpus
# NOTE(review): `corpus` here presumably means tokenised documents (lists of
# string tokens), not the bag-of-words `corpus` built earlier in this notebook,
# since it is fed to both Dictionary(...) and doc2bow(...) — confirm before running.

dct = Dictionary(corpus)
bow_corpus = [dct.doc2bow(line) for line in corpus]
# NOTE(review): running this rebinds term_doc_mat, replacing the matrix computed
# earlier, and omits num_terms so the row count is inferred from bow_corpus.
term_doc_mat = corpus2csc(bow_corpus)