In [2]:
import pandas as pd
import spacy
import pickle
import random
pd.set_option("display.max_columns", None)
from collections import Counter
import re
import google_conf
import os
import json

from nltk.corpus import stopwords

### Testing simpler stemmers and lemmatizers

In [4]:
from nltk.stem import PorterStemmer, SnowballStemmer
from gensim.parsing.preprocessing import stem_text

# Initialize Porter and Snowball Stemmers
porter = PorterStemmer()
snowball = SnowballStemmer(language="english")

# Words to test
words = ["running", "jumps", "easily", "happiness", "applies", "relational", "generously", "studies"]

# Stem each word with all three methods
for word in words:
    porter_stemmed = porter.stem(word)
    snowball_stemmed = snowball.stem(word)
    gensim_stemmed = stem_text(word)  # Gensim handles entire strings

    print(f"{word:<12} -> Porter: {porter_stemmed:<10} | Snowball: {snowball_stemmed:<10} | Gensim: {gensim_stemmed}")



running      -> Porter: run        | Snowball: run        | Gensim: run
jumps        -> Porter: jump       | Snowball: jump       | Gensim: jump
easily       -> Porter: easili     | Snowball: easili     | Gensim: easili
happiness    -> Porter: happi      | Snowball: happi      | Gensim: happi
applies      -> Porter: appli      | Snowball: appli      | Gensim: appli
relational   -> Porter: relat      | Snowball: relat      | Gensim: relat
generously   -> Porter: gener      | Snowball: generous   | Gensim: gener
studies      -> Porter: studi      | Snowball: studi      | Gensim: studi


In [5]:
###import nltk
###nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# Initialize WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Example trigram as a string
trigram = "Paul's letter writes"

# Split into words, lemmatize, and join back
lemmatized_trigram = " ".join([lemmatizer.lemmatize(word, pos="v") for word in trigram.split()])

print(lemmatized_trigram)

Paul's letter write


In [6]:
metadata_df = pd.read_json("../data/metadata_df.json")

In [7]:
ids = metadata_df["id_kase"].tolist()

In [6]:
mops_data = google_conf.setup(sheet_url="https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing", service_account_path="../../../ServiceAccountsKey.json")

In [7]:
nlp_spacy = spacy.load('en_core_web_lg') # python -m spacy download en_core_web_lg

In [8]:
tags = ["NOUN", "ADJ", "VERB", "PROPN"]

In [9]:
stops = set(stopwords.words('english'))

In [10]:
def clean_filter(string):
    string = [re.sub(r"[,.;:!?@#$~^&*{}}°\"\’\[\]\(\)\_\-\d+]+", "", w) for w in string.split()]
    string = " ".join([w[0] + w[1:].lower() for w in string if ((w not in stops) & (len(w)>1))])
    return string
clean_filter("the; L wouldn't _Word1... \"God's PRESS")

"Word God's Press"

In [11]:
def filter_dict(mydict, min=2):
    mycounter = Counter()
    for item in mydict.items():#, key=lambda kv: kv[1], reverse=True))
        string = clean_filter(item[0])
        string = " ".join([w for w in string.split() if w not in stops])
        if (len(string) >= min) & (len(string.split()) >= min):
            mycounter.update({string : item[1]})
    return mycounter

In [12]:
id = 1203
sourcepath = "../data/large_files/trigrams_raw/"
ngram_dict = json.load(open(sourcepath +  "trigrams_{}.json".format(str(id)), "r"))
len(ngram_dict)

9517

In [13]:
ngram_dict

{'the broad parameters': 1,
 'the polis/colonia. While': 1,
 '‘thoughts’ of Satan': 1,
 'immanent divine mind': 1,
 'or non-citizen, has': 1,
 'government (colonial) and': 1,
 'is the polar': 1,
 'interdependency and mutual': 1,
 'his followers that': 1,
 'Paul’s negative view': 1,
 'raised about the': 2,
 'It is also': 1,
 'they definitely made': 1,
 'of human connectedness.': 1,
 'with Judaism while': 1,
 'root quirky individualists;': 1,
 'this the focal': 1,
 'of Western imperialism': 1,
 'but we give': 1,
 '(\uf645 Cor \uf647.\uf647),': 1,
 'while their insistence': 1,
 'originate in other': 1,
 'of Downing’s Cynics,': 1,
 'to punish evil': 1,
 'interest in shaping': 1,
 'individuals.\uf644\uf64a Post-colonial theorists': 1,
 'he remained true': 1,
 'and neo-capitalism that': 1,
 'identity that justifies': 1,
 '\uf646\uf645 For Paul,': 1,
 'University Press, \uf645\uf643\uf644\uf64c': 1,
 'embraced only Christ-followers': 1,
 'Postcolonial Literature. Different': 1,
 'have guessed

In [14]:
unigram_dict = json.load(open("../data/large_files/unigrams_raw/unigrams_{}.json".format(str(id)), "r"))
unigram_dict

{'\uf644\uf649\uf643': 1,
 'boundary-crossing,': 2,
 '\uf647.\uf644–\uf64a).': 1,
 'B.': 6,
 'abolished': 1,
 '‘Solidarity': 1,
 'hated': 1,
 'Christ': 3,
 'growing': 2,
 'runs': 1,
 '\uf645\uf643\uf644\uf644)': 4,
 'came': 4,
 'Criticism’': 1,
 '\uf64b\uf649–\uf64a.': 1,
 'exposing': 1,
 'wanderer': 1,
 'allowed': 1,
 'teachings': 5,
 'nations;': 1,
 'opposite': 3,
 'field': 1,
 'cosmopolitanism;': 1,
 'Kleingeld,': 1,
 'vision,': 3,
 'Mitsis,': 1,
 'relatively': 2,
 'real-world': 1,
 'humanity': 9,
 'final': 1,
 'proliferation;': 1,
 'trustworthy,': 1,
 'began': 2,
 '\uf644.\uf644\uf646–\uf644\uf647;': 1,
 'embrace': 5,
 'traces': 1,
 'my': 1,
 'parallels': 2,
 'duties': 4,
 'society': 9,
 'did,': 2,
 'treated': 3,
 'Brief': 1,
 '\uf645.\uf644\uf64c': 1,
 'standing': 1,
 'Western': 19,
 'constituted': 1,
 'years,': 1,
 'Strangers’,': 1,
 '\uf644\uf647\uf648': 1,
 'Louis:': 1,
 'cri-': 1,
 'official': 1,
 '\uf646.\uf645\uf64b,': 1,
 'so': 6,
 'Bonaventure': 1,
 'found': 1,
 'non-domin

In [15]:
%%time
ngram_dict_filtered = filter_dict(ngram_dict)
len(ngram_dict_filtered)

CPU times: user 45 ms, sys: 1.7 ms, total: 46.7 ms
Wall time: 45.7 ms


5092

In [16]:
ngram_dict_filtered

Counter({'Paul Cosmopolitan': 12,
         'civic life': 8,
         'Greek Roman': 7,
         'people outside': 7,
         'Delanty Cosmopolitan Imagination': 7,
         'apocalyptic vision': 6,
         'religious group': 6,
         'God Israel': 5,
         'Pauls theology': 5,
         'cosmopolitan thought': 5,
         'Pauls letters': 5,
         '‘the wise': 5,
         'family friends': 5,
         'virtually nothing': 5,
         'Kang Cosmopolitan Theology': 4,
         'take place': 4,
         'apostle Paul': 4,
         'social environment': 4,
         'civic deities': 4,
         'cosmopolitan thinking': 4,
         'inseparably tied': 4,
         'various ways': 4,
         'one another': 4,
         '‘cosmopolitan vision': 4,
         'Cynics Paul': 4,
         'point see': 4,
         'higher loyalty': 4,
         'Jewish sectarian': 4,
         'cosmopolitan vision': 4,
         'give rise': 4,
         'broader society': 4,
         'Mcmahon ‘fear': 4,
        

# Unigrams cleaning test

In [17]:
id = 0
sourcepath = "../data/large_files/unigrams_raw/"
ngram_dict = json.load(open(sourcepath +  "unigrams_{}.json".format(str(id)), "r"))
len(ngram_dict)

2512

In [18]:
ngram_dict_filtered = filter_dict(ngram_dict, min=1)
sorted(ngram_dict_filtered.items(), key=lambda x: x[1], reverse=True)[10:]

[('united', 25),
 ('fellowship', 21),
 ('Christ', 20),
 ('new', 20),
 ('local', 20),
 ('many', 20),
 ('must', 19),
 ('world', 17),
 ('even', 17),
 ('‘the', 16),
 ('faith', 16),
 ('community', 15),
 ('confessional', 14),
 ('mission', 14),
 ('It', 14),
 ('together', 14),
 ('theological', 14),
 ('Christians', 14),
 ('Unity', 13),
 ('New', 13),
 ('In', 13),
 ('Of', 13),
 ('structure', 12),
 ('problems', 12),
 ('As', 12),
 ('Ecumenical', 12),
 ('But', 12),
 ('cit', 12),
 ('common', 12),
 ('place', 11),
 ('Review', 11),
 ('communion', 10),
 ('worship', 10),
 ('op', 10),
 ('question', 10),
 ('Report', 10),
 ('Model', 10),
 ('made', 10),
 ('model', 9),
 ('structures', 9),
 ('Order', 9),
 ('may', 9),
 ('form', 9),
 ('Faith', 9),
 ('level', 9),
 ('particular', 9),
 ('negotiations', 9),
 ('people', 9),
 ('experience', 9),
 ('visible', 9),
 ('would', 9),
 ('full', 8),
 ('witness', 8),
 ('questions', 8),
 ('This', 8),
 ('within', 8),
 ('another', 8),
 ('organization', 8),
 ('World', 8),
 ('organic'

In [19]:
len(ngram_dict_filtered)

1877

# Applying cleaning on all trigrams

In [21]:
destpath = "../data/large_files/trigrams_filtered/"
try:
    os.makedirs(destpath)
except:
    pass

In [22]:
with open(destpath + "id_{}.json".format(str(id)), "w") as f:
    json.dump(ngram_dict_filtered, f)

In [23]:
json.load(open(destpath + "id_0.json"))

{'Moede': 4,
 'identities': 1,
 'compelling': 1,
 'Scm': 2,
 'sug': 1,
 'mated': 1,
 'fundamentals': 1,
 '‘education': 1,
 'opportunities': 2,
 'full': 8,
 'operation': 1,
 'negotiating': 7,
 'funds': 1,
 'women': 1,
 'Reports': 1,
 'Christ': 20,
 'fellowship': 21,
 'continually': 1,
 'confessional': 14,
 'foundered': 1,
 'case': 1,
 'Incarnation': 1,
 'welding': 1,
 'secular': 1,
 'cerning': 1,
 'yet”': 1,
 'biblical': 1,
 'sacramental': 3,
 'ideas': 1,
 '‘the': 16,
 'fulfilled': 1,
 'Edinburgh': 3,
 'attempted': 1,
 'Role': 1,
 'structure': 12,
 'model': 9,
 'recogni': 1,
 'decisively': 1,
 'congregational': 1,
 'divorced': 1,
 'structures': 9,
 'fruitful': 1,
 'confined': 1,
 'possi': 1,
 'nonessentials': 1,
 'becoming': 1,
 'behind': 3,
 'international': 2,
 'traditions': 3,
 'defence': 1,
 'potential': 1,
 'long': 3,
 'Indeed': 2,
 'physical': 1,
 'lypse': 1,
 'Gods': 2,
 'injunction': 1,
 'differ': 1,
 'compel': 1,
 'something': 1,
 'mission': 14,
 'constitution': 1,
 'begun': 3,

In [24]:
%%time
data_ngrams = Counter()
sourcepath = "../data/large_files/trigrams_raw/"
for id in ids:
    ngram_dict = json.load(open(sourcepath +  "trigrams_{}.json".format(str(id)), "r"))
    d_cleaned = filter_dict(ngram_dict)
    with open(destpath + "filtered_id_{}.json".format(str(id)), "w") as f:
        json.dump(d_cleaned, f)
    data_ngrams.update(d_cleaned)

CPU times: user 12min 48s, sys: 5.03 s, total: 12min 53s
Wall time: 12min 54s


In [18]:
len(data_ngrams)

45296539

In [19]:
%%time
data_trigrams_sorted = sorted(data_ngrams.items(), key=lambda kv: kv[1], reverse=True)
len([el for el in data_trigrams_sorted if el[1] > 10])

CPU times: user 6.15 s, sys: 2.57 s, total: 8.72 s
Wall time: 9.93 s


1325945

In [20]:
to_lemmatize = [el for el in data_trigrams_sorted if el[1] > 5]
len(to_lemmatize)

2330940

In [36]:
%%time
for sent in ["This is my favorite application"] * 10:
    stem_text(sent)
    #" ".join([t.lemma_ for t in nlp_spacy(sent) if t.pos_ in tags])

CPU times: user 86 μs, sys: 5 μs, total: 91 μs
Wall time: 93.2 μs


In [36]:
# use spacy to generate "Doc" object for ngram
# (use carefully, takes some time)
tags = ["NOUN", "ADJ", "VERB", "PROPN"]

def get_lemma_filtered(string):
    #lemmata = " ".join([t.lemma for t in nlp_stanza(string).sentences[0].words if t.pos in tags])
    lemmata = " ".join([t.lemma_ for t in nlp_spacy(string) if t.pos_ in tags])
    return lemmata

In [25]:
%%time
monitor = [n for n in range(0, len(to_lemmatize), 2000)]
trigrams_lemmata_dict= {}
for n, tup in enumerate(to_lemmatize):
    lemmata = get_lemma_filtered(tup[0])
    if n in monitor:
        print(n)
    if len(lemmata.split()) > 1:
        trigrams_lemmata_dict[tup[0]] = {"lemmata" : lemmata, "count" : tup[1]}

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000
160000
162000
164000
166000
168000
170000
172000
174000
176000
178000
180000
182000
184000
186000
188000
190000
192000
194000
196000
198000
200000
202000
204000
206000
208000
210000
212000
214000
216000
218000
220000
222000
224000
226000
228000
230000
232000
234000
236000
238000
240000
242000
244000
246000
248000
250000
252000
254000
256000
258000
260000
262000
264000
266000
268000
270000
272000
274000
276000
278000
280000
282000
284000
286000
288000
290000
292000
294000
296000
298000
300000
3

In [26]:
with open("../data/large_files/trigrams_lemmata_dict.pickle", "wb") as f:
    pickle.dump(trigrams_lemmata_dict, f)

In [13]:
trigrams_lemmata_dict = pickle.load(open("../data/large_files/trigrams_lemmata_dict.pickle", "rb"))
len(trigrams_lemmata_dict)

1768166

In [14]:
%%time
# developing application to individual documents
sourcepath = "../data/large_files/trigrams_filtered/"
id = 0
with open(sourcepath + "filtered_id_{}.json".format(str(id)), "r") as f:
    d_cleaned = json.load(f)
counter = Counter()
for item in d_cleaned.items():
    try:
        counter.update({trigrams_lemmata_dict[item[0]]["lemmata"] : item[1]})
    except:
        pass

CPU times: user 4.66 ms, sys: 2.88 ms, total: 7.54 ms
Wall time: 7.39 ms


In [15]:
trigrams_list = []
for tup in counter.items():
    trigrams_list.extend([tup[0]]*tup[1])

In [16]:
trigrams_list[:100]

['religious matter',
 'divine hope',
 'divine hope',
 'leave unresolved',
 'leave unresolved',
 'racial equality',
 'racial equality',
 'joint action',
 'joint action',
 'joint action',
 'joint action',
 'joint action',
 'christian fellowship',
 'biblical injunction',
 'biblical injunction',
 'important community',
 'degree order',
 'worldwide fellowship',
 'worldwide fellowship',
 'community establish',
 'new appreciation',
 'new appreciation',
 'Council church',
 'Council church',
 'local fellowship',
 'local fellowship',
 'establish maintain',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'op cit',
 'Wcc Commission',
 'Wcc Commission',
 'use several',
 'nego tiating',
 'nego tiating',
 'church provide',
 'church provide',
 'church provide',
 'Association Press',
 'Association Press',
 'loss identity',
 'particular concept',
 'con versation',
 'form life',
 'doctrinal agreement',
 'genuine renewal',
 'genuine renewal',


In [18]:
trigrams_list = ["{}\n".format(i) for i in trigrams_list]

In [19]:
with open("test.txt", "w") as f:
    f.writelines(trigrams_list)

In [24]:
sourcepath = destpath
destpath = "../data/large_files/trigrams_lemmatized/"
try:
    os.makedirs(destpath)
except:
    pass

In [25]:
%%time
for id in ids:
    with open(sourcepath + "filtered_id_{}.json".format(str(id)), "r") as f:
        d_cleaned = json.load(f)
    counter = Counter()
    for item in d_cleaned.items():
        try:
            counter.update({trigrams_lemmata_dict[item[0]]["lemmata"] : item[1]})
        except:
            pass
    trigrams_list = []
    for tup in counter.items():
        trigrams_list.extend([tup[0]]*tup[1])
    trigrams_list = ["{}\n".format(i) for i in trigrams_list]
    with open(destpath + "trigrams_lemmata_id_{}.txt".format(str(id)), "w") as f:
        f.writelines(trigrams_list)

CPU times: user 1min 49s, sys: 6.9 s, total: 1min 56s
Wall time: 2min 6s


# Aplying cleaning to all unigrams

In [30]:
destpath = "../data/large_files/unigrams_filtered/"
os.makedirs(destpath)

In [31]:
%%time
data_ngrams = Counter()
sourcepath = "../data/large_files/unigrams_raw/"
for id in ids:
    ngram_dict = json.load(open(sourcepath + "unigrams_{}.json".format(str(id)), "r"))
    d_cleaned = filter_dict(ngram_dict, min=1)
    with open(destpath + "filtered_id_{}.json".format(str(id)), "w") as f:
        json.dump(d_cleaned, f)
    data_ngrams.update(d_cleaned)
len(data_ngrams)

CPU times: user 3min 41s, sys: 13.2 s, total: 3min 54s
Wall time: 4min 21s


2769088

In [32]:
%%time
data_unigrams_sorted = sorted(data_ngrams.items(), key=lambda kv: kv[1], reverse=True)
len([el for el in data_unigrams_sorted if el[1] > 10])

CPU times: user 526 ms, sys: 92.2 ms, total: 618 ms
Wall time: 651 ms


272768

In [34]:
to_lemmatize = [el for el in data_unigrams_sorted if el[1] > 5]
len(to_lemmatize)

410708

In [37]:
%%time
monitor = [n for n in range(0, len(to_lemmatize), 2000)]
unigrams_lemmata_dict = {}
for n, tup in enumerate(to_lemmatize):
    lemmata = get_lemma_filtered(tup[0])
    if n in monitor:
        print(n)
    if len(lemmata.split()) > 0:
        unigrams_lemmata_dict[tup[0]] = {"lemmata": lemmata, "count": tup[1]}

0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000
36000
38000
40000
42000
44000
46000
48000
50000
52000
54000
56000
58000
60000
62000
64000
66000
68000
70000
72000
74000
76000
78000
80000
82000
84000
86000
88000
90000
92000
94000
96000
98000
100000
102000
104000
106000
108000
110000
112000
114000
116000
118000
120000
122000
124000
126000
128000
130000
132000
134000
136000
138000
140000
142000
144000
146000
148000
150000
152000
154000
156000
158000
160000
162000
164000
166000
168000
170000
172000
174000
176000
178000
180000
182000
184000
186000
188000
190000
192000
194000
196000
198000
200000
202000
204000
206000
208000
210000
212000
214000
216000
218000
220000
222000
224000
226000
228000
230000
232000
234000
236000
238000
240000
242000
244000
246000
248000
250000
252000
254000
256000
258000
260000
262000
264000
266000
268000
270000
272000
274000
276000
278000
280000
282000
284000
286000
288000
290000
292000
294000
296000
298000
300000
3

In [38]:
with open("../data/large_files/unigrams_lemmata_dict.pickle", "wb") as f:
    pickle.dump(unigrams_lemmata_dict, f)
#unigrams_lemmata_dict = pickle.load(open("../data/large_files/trigrams_lemmata_dict.pickle", "rb"))
len(unigrams_lemmata_dict)

388171

In [40]:
sorted(unigrams_lemmata_dict.items(), key=lambda x: x[1]["count"], reverse=True)[:10]

[('God', {'lemmata': 'God', 'count': 434622}),
 ('Christian', {'lemmata': 'christian', 'count': 336049}),
 ('Paul', {'lemmata': 'Paul', 'count': 335617}),
 ('Jesus', {'lemmata': 'Jesus', 'count': 300420}),
 ('book', {'lemmata': 'book', 'count': 299610}),
 ('New', {'lemmata': 'new', 'count': 291510}),
 ('University', {'lemmata': 'university', 'count': 268405}),
 ('work', {'lemmata': 'work', 'count': 250950}),
 ('life', {'lemmata': 'life', 'count': 212251}),
 ('Christ', {'lemmata': 'Christ', 'count': 210690})]

In [41]:
%%time
# developing application to individual documents
sourcepath = "../data/large_files/unigrams_filtered/"

CPU times: user 2 µs, sys: 10 µs, total: 12 µs
Wall time: 16.9 µs


In [42]:
destpath = "../data/large_files/unigrams_lemmatized/"
try:
    os.makedirs(destpath)
except:
    pass

In [43]:
%%time
for id in ids:
    with open(sourcepath + "filtered_id_{}.json".format(str(id)), "r") as f:
        d_cleaned = json.load(f)
    counter = Counter()
    for item in d_cleaned.items():
        try:
            counter.update({unigrams_lemmata_dict[item[0]]["lemmata"]: item[1]})
        except:
            pass
    unigrams_list = []
    for tup in counter.items():
        unigrams_list.extend([tup[0]] * tup[1])
    unigrams_list = ["{}\n".format(i) for i in unigrams_list]
    with open(destpath + "unigrams_lemmata_id_{}.txt".format(str(id)), "w") as f:
        f.writelines(unigrams_list)

CPU times: user 1min 41s, sys: 9.58 s, total: 1min 50s
Wall time: 2min 13s


# Backup

# Most frequent unigrams to nlp docs

In [108]:
# load spacy english nlp model & english stopwords...
#stop_words = nlp_spacy.Defaults.stop_words

In [109]:
# check how raw unigrams look like
data_unigrams_sorted = sorted(data_unigrams.items(), key=lambda kv: kv[1], reverse=True)
data_unigrams_sorted[:1000]

[('the', 64204),
 ('of', 43727),
 ('and', 29112),
 ('to', 24051),
 ('in', 23227),
 ('is', 11614),
 ('that', 11283),
 ('as', 8204),
 ('The', 6860),
 ('for', 6850),
 ('his', 6283),
 ('with', 6121),
 ('was', 5995),
 ('he', 5314),
 ('by', 5164),
 ('not', 5095),
 ('on', 4746),
 ('from', 4525),
 ('it', 4500),
 ('this', 4388),
 ('be', 4355),
 ('or', 3657),
 ('which', 3568),
 ('are', 3401),
 ('an', 3185),
 ('their', 3085),
 ('who', 3080),
 ('God', 3032),
 ('have', 2951),
 ('but', 2774),
 ('at', 2649),
 ('Paul', 2601),
 ('were', 2543),
 ('they', 2531),
 ('In', 2524),
 ('had', 2435),
 ('one', 2336),
 ('all', 2258),
 ('has', 2093),
 ('him', 1966),
 ('we', 1966),
 ('Christ', 1886),
 ('also', 1854),
 ('its', 1685),
 ('them', 1600),
 ('Christian', 1594),
 ('more', 1579),
 ('Jesus', 1546),
 ('would', 1496),
 ('what', 1489),
 ('you', 1474),
 ('This', 1472),
 ('been', 1441),
 ('will', 1439),
 ('other', 1423),
 ('into', 1356),
 ('only', 1342),
 ('It', 1324),
 ('so', 1322),
 ('no', 1299),
 ('about', 1293

In [110]:
data_unigrams_sorted[:1000]

[('the', 64204),
 ('of', 43727),
 ('and', 29112),
 ('to', 24051),
 ('in', 23227),
 ('is', 11614),
 ('that', 11283),
 ('as', 8204),
 ('The', 6860),
 ('for', 6850),
 ('his', 6283),
 ('with', 6121),
 ('was', 5995),
 ('he', 5314),
 ('by', 5164),
 ('not', 5095),
 ('on', 4746),
 ('from', 4525),
 ('it', 4500),
 ('this', 4388),
 ('be', 4355),
 ('or', 3657),
 ('which', 3568),
 ('are', 3401),
 ('an', 3185),
 ('their', 3085),
 ('who', 3080),
 ('God', 3032),
 ('have', 2951),
 ('but', 2774),
 ('at', 2649),
 ('Paul', 2601),
 ('were', 2543),
 ('they', 2531),
 ('In', 2524),
 ('had', 2435),
 ('one', 2336),
 ('all', 2258),
 ('has', 2093),
 ('him', 1966),
 ('we', 1966),
 ('Christ', 1886),
 ('also', 1854),
 ('its', 1685),
 ('them', 1600),
 ('Christian', 1594),
 ('more', 1579),
 ('Jesus', 1546),
 ('would', 1496),
 ('what', 1489),
 ('you', 1474),
 ('This', 1472),
 ('been', 1441),
 ('will', 1439),
 ('other', 1423),
 ('into', 1356),
 ('only', 1342),
 ('It', 1324),
 ('so', 1322),
 ('no', 1299),
 ('about', 1293

In [111]:
# how many of the 100 most frequent unigrams are actually not stopwords?
#len([el for el in data_unigrams_sorted[:100] if el[0].lower() not in stop_words])

In [113]:
# what is the number of unigrams appearing more than 10 times?
len([el for el in data_unigrams_sorted if el[1] > 10])

7810

In [124]:

# use spacy to generate "Doc" object for ngram
# (use carefully, takes some time)
tags = ["NOUN", "ADJ", "VERB", "PROPN"]
def get_lemma_filtered(string):
    lemmata = " ".join([t.lemma for t in nlp_stanza(string).sentences[0].words if t.pos in tags])
    return lemmata

In [131]:
data_unigrams_lemmata = Counter()
for el in data_unigrams_sorted:
    if el[1] > 10:
        lemmata_str = get_lemma_filtered(el[0])
        if len(lemmata_str) > 1:
            data_unigrams_lemmata.update({lemmata_str : el[1]})

In [None]:
data_unigrams = merge_data_from_ids(all, unigramCount_dict)
len(data_unigrams)

In [None]:
# check how raw unigrams look like
data_unigrams_sorted = sorted(data_unigrams.items(), key=lambda kv: kv[1], reverse=True)
data_unigrams_sorted[:1000]


In [None]:
[(el[0], {"doc" : get_lemma_filtered(el[0]), "count" : el[1]}) for el in data_unigrams_sorted if el[1] > 1000]

In [34]:

data_unigrams_sorted_nlp = [(el[0], {"doc" : get_lemma_filtered(el[0]), "count" : el[1]}) for el in data_unigrams_sorted if el[1] > 1000]

#data_unigrams_sorted_nlp = [(el[0], {"doc" : nlp_stanza(el[0]), "count" : el[1]}) for el in data_unigrams_sorted if el[1] > 1000]

CPU times: user 1min 12s, sys: 7.5 s, total: 1min 20s
Wall time: 1min 21s


In [38]:
data_unigrams_sorted_nlp[501]

('G.',
 {'doc': [
    [
      {
        "id": 1,
        "text": "G.",
        "lemma": "G.",
        "upos": "X",
        "xpos": "ADD",
        "head": 0,
        "deprel": "root",
        "start_char": 0,
        "end_char": 2,
        "ner": "O",
        "multi_ner": [
          "O"
        ]
      }
    ]
  ],
  'count': 1856})

In [15]:
data_unigrams_sorted_nlp[:5]

[('the', {'doc': the, 'count': 3333595}),
 ('of', {'doc': of, 'count': 2261372}),
 ('and', {'doc': and, 'count': 1459413}),
 ('to', {'doc': to, 'count': 1228717}),
 ('in', {'doc': in, 'count': 1151878})]

In [16]:
data_unigrams_nlp_dict = dict(data_unigrams_sorted_nlp)

In [17]:
# save for future usage
with open("../data/large_files/data_unigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_unigrams_nlp_dict, f)

In [18]:
# remove middle-step data to save CPU...
del data_unigrams
del data_unigrams_sorted
del data_unigrams_sorted_nlp

# Trigrams to nlp docs

In [None]:
trigramCount_dict = pickle.load(open("../data/large_files/trigramCount_dict.pickle", "rb"))

In [None]:
#data_unigrams = merge_data_from_ids(ids, unigramCount_dict)
#data_bigrams = merge_data_from_ids(ids, bigramCount_dict)
data_trigrams = merge_data_from_ids(ids, trigramCount_dict)

In [None]:
ids2 = [k for k in list(trigramCount_dict.keys()) if k not in ids]
len(ids2)

In [None]:
data_trigrams2 = merge_data_from_ids(ids2, trigramCount_dict)

In [None]:
del trigramCount_dict

In [11]:
data_trigrams_sorted = sorted(data_trigrams.items(), key=lambda kv: kv[1], reverse=True)
len([el for el in data_trigrams_sorted if el[1] > 10])

458444

In [26]:
%%time
data_trigrams_sorted_nlp = [(el[0], {"doc" : nlp(el[0]), "count" : el[1]}) for el in data_trigrams_sorted if el[1] > 10]

CPU times: user 21min 37s, sys: 33.8 s, total: 22min 11s
Wall time: 23min 14s


In [16]:
del data_trigrams_sorted

In [27]:
data_trigrams_nlp_dict = dict(data_trigrams_sorted_nlp)

In [28]:
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams_nlp_dict, f)

# Second round of merged trigrams nlp preprocessing

In [12]:
len(data_trigrams2)

46627339

In [42]:
data_trigrams2 = dict(data_trigrams2)

In [13]:
keys1 = [el[0] for el in data_trigrams.items() if el[1] > 10]
len(keys1)

458444

In [22]:
del data_trigrams

In [14]:
keys2 = [el[0] for el in data_trigrams2.items() if el[1] > 10]
len(keys2) #  [el[0] for el in data_trigrams2_filtered]

774574

In [15]:
keys2_filtered = list((set(keys2) ^ set(keys1)) & set(keys2))
len(keys2_filtered)

342693

In [31]:
del nlp

In [32]:
print("hello")

hello


In [39]:
pickle.dump(keys2_filtered, open("../data/large_files/keys2_filtered.pickle", "wb"))
pickle.dump(data_trigrams2, open("../data/large_files/data_trigrams2.pickle", "wb"))

In [40]:
len(data_trigrams2)

46627339

In [43]:
%%time
data_trigrams2_filtered = []
n = 0
for el in keys2_filtered:
    if n in range(0, len(keys2_filtered), 5000):
        print(n)
    n += 1
    data_trigrams2_filtered.append((el, data_trigrams2[el]))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
CPU times: user 285 ms, sys: 286 ms, total: 572 ms
Wall time: 839 ms


In [44]:
len(data_trigrams2_filtered)

342693

In [46]:
pickle.dump(data_trigrams2_filtered, open("../data/large_files/data_trigrams2_filtered.pickle", "wb"))

In [47]:
nlp = spacy.load('en_core_web_lg')

In [48]:
%%time
data_trigrams2_nlp_tups = []
n = 0
for el in data_trigrams2_filtered:
    if n in range(0, len(data_trigrams2_filtered), 5000):
        print(n)
    n += 1
    data_trigrams2_nlp_tups.append((el[0], {"doc" : nlp(el[0]), "count" : el[1]}))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
CPU times: user 14min 25s, sys: 14.3 s, total: 14min 39s
Wall time: 14min 44s


In [49]:
pickle.dump(dict(data_trigrams2_nlp_tups), open("../data/large_files/data_trigrams2_nlp_dict.pickle", "wb"))