In [1]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
metadata_df = pd.read_json("../data/metadata_rich_df.json")


In [3]:
sourcepath = "../data/large_files/trigrams_lemmatized/"
with open(sourcepath + "trigrams_lemmata_id_0.txt".format(str(id)), "r") as f:
    trigrams_list = f.readlines()

In [4]:
[el.strip() for el in trigrams_list][:10]

['religious matter',
 'divine hope',
 'divine hope',
 'leave unresolved',
 'leave unresolved',
 'racial equality',
 'racial equality',
 'joint action',
 'joint action',
 'joint action']

# Load vocabulary

In [5]:
bidecades_vocabs_counts_df = pickle.load(open("../data/bidecades_vocabs_counts_df.pickle", "rb"))
bidecades_vocabs_counts_df.head(5)

Unnamed: 0,1900-1919,1920-1939,1940-1959,1960-1979,1980-1999,2000-2019
man,11218.0,8342.0,24108.0,39199.0,48435.0,100670.0
new,10183.0,10192.0,23469.0,42092.0,97421.0,293457.0
Jesus,9650.0,4911.0,12134.0,23416.0,49765.0,215687.0
et,9325.0,11818.0,20805.0,35969.0,49326.0,73464.0
God,9248.0,6276.0,26328.0,45751.0,99505.0,293240.0


In [6]:
vocabulary = dict([(el[1], el[0]) for el in enumerate(bidecades_vocabs_counts_df.index)])

# Load trigrams

In [7]:
bidecades_strs = sorted([el for el in list(set(metadata_df["bidecade"])) if el != None])
bidecades_strs

['1900-1919', '1920-1939', '1940-1959', '1960-1979', '1980-1999', '2000-2019']

In [8]:
bidecade = "1900-1919"
ids = metadata_df[metadata_df["bidecade"]==bidecade]["id_kase"]
len(ids)

782

In [9]:
with open("../data/large_files/bidecade_trigrams_{}.txt".format(bidecade), "r") as f:
    subcorpus_ngrams = f.readlines()

In [10]:
[el.strip() for el in subcorpus_ngrams[:10]]

['Michigan Ann Arbor',
 'Bonner University',
 'Bonner University',
 'Iowa Iowa',
 'Iowa Iowa',
 'Kansas Lawrence',
 'Ginn Co',
 'Elliott University',
 'Elliott University',
 'Kansas Lawrence Kansas']

In [11]:
vectorizer = TfidfVectorizer(vocabulary=vocabulary, lowercase=False)
X = vectorizer.fit_transform(subcorpus_ngrams)

In [12]:
X.shape

(722798, 4804)

In [15]:
cooc = X.T * X

In [16]:
cooc.shape

(4804, 4804)

In [30]:
print(cooc.todense()[0][:10])

[[4.18600351e+03 1.24084146e+01 3.95763797e+01 ... 0.00000000e+00
  0.00000000e+00 1.33054155e+00]]


In [31]:
cooc_norm = pd.DataFrame((cooc / cooc.max() ).todense())
cooc_norm.columns = vocabulary.keys()
cooc_norm.index = vocabulary.keys()
cooc_norm

Unnamed: 0,man,new,Jesus,et,God,Paul,make,church,life,see,...,angle,substantive,noteworthy,Kevin,locus,Ian,gesture,metaphorical,layer,less
man,0.666853,0.001977,0.006305,0.000000,0.019845,0.001990,0.008782,0.000871,0.008197,0.004872,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000212
new,0.001977,0.373283,0.001257,0.000000,0.001498,0.000113,0.001669,0.002708,0.010922,0.000875,...,0.00013,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000
Jesus,0.006305,0.001257,0.828934,0.000309,0.005996,0.012408,0.006683,0.000945,0.031356,0.004366,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000
et,0.000000,0.000000,0.000309,0.282104,0.000000,0.003248,0.000000,0.000000,0.000000,0.000716,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000000
God,0.019845,0.001498,0.005996,0.000000,0.771731,0.001713,0.010154,0.001658,0.003495,0.005688,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00000,0.000000,0.000490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ian,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.000538,0.000000,0.00000,0.000000,0.000000
gesture,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000112,0.00000,0.000000,0.000000
metaphorical,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00256,0.000000,0.000000
layer,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.00000,0.000797,0.000000


In [33]:
cooc_norm["Paul"].sort_values(ascending=False)

Paul              1.000000
epistle           0.055488
Apostle           0.035643
letter            0.028799
apostle           0.028537
                    ...   
reckon            0.000000
Eliot             0.000000
vase              0.000000
sister            0.000000
responsibility    0.000000
Name: Paul, Length: 4804, dtype: float64

In [35]:
cooc_norm_dict = {}
for bidecade in bidecades_strs:
    with open("../data/large_files/bidecade_trigrams_{}.txt".format(bidecade), "r") as f:
        subcorpus_ngrams = f.readlines()
    vectorizer = TfidfVectorizer(vocabulary=vocabulary, lowercase=False)
    X = vectorizer.fit_transform(subcorpus_ngrams)
    cooc = X.T * X
    cooc_norm = pd.DataFrame((cooc / cooc.max() ).todense())
    cooc_norm.columns = vocabulary.keys()
    cooc_norm.index = vocabulary.keys()
    cooc_norm_dict[bidecade] = cooc_norm

In [40]:
pickle.dump(cooc_norm_dict, open("../data/cooc_norm_dict.pickle", "wb"))

In [36]:
cooc_norm_dict["1920-1939"]["Paul"].sort_values(ascending=False)

Paul       1.000000
Apostle    0.043437
epistle    0.041307
letter     0.033104
saint      0.014529
             ...   
deu        0.000000
xvii       0.000000
Neuen      0.000000
gold       0.000000
less       0.000000
Name: Paul, Length: 4804, dtype: float64

In [39]:
cooc_norm_dict["1980-1999"]["Paul"].sort_values(ascending=False)

Paul        1.000000
letter      0.052684
Apostle     0.025385
use         0.021093
epistle     0.016693
              ...   
opposite    0.000000
Simeon      0.000000
withdraw    0.000000
vote        0.000000
less        0.000000
Name: Paul, Length: 4804, dtype: float64