In [34]:
import pandas as pd
import pickle
import random
pd.set_option("display.max_columns", None)
from collections import Counter
import google_conf

In [2]:
# Set up access to the Google Sheet through the project-local `google_conf` helper
# (what `setup` returns is not visible from this notebook).
mops_data = google_conf.setup(
    sheet_url="https://docs.google.com/spreadsheets/d/1VbCIAJssHKV9hlRTwzVFfm40CGnHesq53KXjv2qy4OM/edit?usp=sharing",
    service_account_path="../../ServiceAccountsKey.json",
)

# Compare nltk, spacy & stanza lemmatizers

In [107]:
import stanza
import spacy
import nltk

In [None]:
#stanza.download('en') # download English model

#nltk.download("wordnet")
#nltk.download("stopwords")

In [73]:
# Load spaCy's large English pipeline; the trailing comment is the one-time download command.
nlp_spacy = spacy.load('en_core_web_lg') # python -m spacy download en_core_web_lg

In [47]:
# Default English Stanza pipeline. The log below shows it loads tokenize, pos, lemma,
# depparse, sentiment, constituency and ner.
# NOTE(review): the visible cells only read lemmas and POS tags — a narrower
# `processors=` selection might load and run faster; confirm before changing.
nlp_stanza = stanza.Pipeline('en') # initialize English neural pipeline

2023-02-17 10:51:29 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-02-17 10:51:31 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2023-02-17 10:51:31 INFO: Use device: cpu
2023-02-17 10:51:31 INFO: Loading: tokenize
2023-02-17 10:51:31 INFO: Loading: pos
2023-02-17 10:51:31 INFO: Loading: lemma
2023-02-17 10:51:31 INFO: Loading: depparse
2023-02-17 10:51:31 INFO: Loading: sentiment
2023-02-17 10:51:32 INFO: Loading: constituency
2023-02-17 10:51:32 INFO: Loading: ner
2023-02-17 10:51:32 INFO: Done loading processors!


In [44]:
from nltk.stem import WordNetLemmatizer
nltk_lemmatizer = WordNetLemmatizer()

In [79]:
# Probe NLTK's WordNet lemmatizer on a punctuated past-tense form.
# The output shows the input coming back unchanged ('did.').
nltk_lemmatizer.lemmatize("did.")

'did.'

In [89]:
# Show every word object Stanza produces for the first sentence of "Did.".
list(nlp_stanza("Did.").sentences[0].words)

[{
   "id": 1,
   "text": "Did",
   "lemma": "do",
   "upos": "VERB",
   "xpos": "VBD",
   "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
   "head": 0,
   "deprel": "root",
   "start_char": 0,
   "end_char": 3
 },
 {
   "id": 2,
   "text": ".",
   "lemma": ".",
   "upos": "PUNCT",
   "xpos": ".",
   "head": 1,
   "deprel": "punct",
   "start_char": 3,
   "end_char": 4
 }]

In [87]:
# Keep only the lemmas of nouns and verbs; the punctuation token is filtered out.
[word.lemma for word in nlp_stanza("Did.").sentences[0].words if word.pos in ("NOUN", "VERB")]

['do']

In [102]:
# spaCy returns 'rightwe' for 'rightwing' (see output) — an evidently wrong lemma.
[t.lemma_ for t in nlp_spacy("rightwing")]

['rightwe']

In [103]:
# Stanza leaves 'rightwing' unchanged (see output).
[t.lemma for t in nlp_stanza("rightwing").sentences[0].words]

['rightwing']

In [110]:
%%time
# Time spaCy on a single token; the recorded run took about 8 ms wall time.
[t.lemma_ for t in nlp_spacy("Wrote")]

CPU times: user 8 ms, sys: 1.75 ms, total: 9.75 ms
Wall time: 8.01 ms


['write']

In [111]:
%%time
# Time Stanza on the same single token; the recorded run took about 93 ms wall time.
[t.lemma for t in nlp_stanza("Wrote").sentences[0].words]

CPU times: user 85.4 ms, sys: 9.75 ms, total: 95.2 ms
Wall time: 92.8 ms


['write']

In [112]:
# Load the id -> unigram-count mapping that the sampling and merge cells below consume.
# The context manager closes the file handle as soon as the read finishes; the previous
# `pickle.load(open(...))` form left it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this project.
with open("../data/large_files/unigramCount_dict.pickle", "rb") as f:
    unigramCount_dict = pickle.load(f)

In [None]:
#bigramCount_dict = pickle.load(open("../data/large_files/bigramCount_dict.pickle", "rb"))

In [None]:
# trigramCount_dict = pickle.load(open("../data/large_files/trigramCount_dict.pickle", "rb"))

In [113]:
def merge_data_from_ids(ids, datadict):
    """Sum the count data stored in ``datadict`` under each key listed in ``ids``.

    Parameters
    ----------
    ids : iterable
        Keys to look up in ``datadict``. A key that is absent raises ``KeyError``.
    datadict : mapping
        Maps each key to a value accepted by ``Counter.update``.

    Returns
    -------
    collections.Counter
        The accumulated counts over all selected entries.
    """
    merged = Counter()
    for per_key_counts in (datadict[key] for key in ids):
        merged.update(per_key_counts)
    return merged

In [117]:
# Draw a fixed, reproducible sample of 1000 ids; the most frequent ngrams are
# preprocessed from this sample only.
random.seed(0)
ids = random.sample(list(unigramCount_dict), 1000)
len(ids)

1000

In [118]:
data_unigrams = merge_data_from_ids(ids, unigramCount_dict)

In [119]:
data_unigrams

Counter({'himself;': 35,
         'ihre': 28,
         'cognize': 7,
         'saw': 1646,
         'possibility': 882,
         'notion': 1079,
         'history,': 739,
         'old': 1907,
         'creation': 1686,
         '14': 908,
         'points': 1655,
         'semester': 24,
         'Karl': 370,
         'reform': 526,
         'Scheel,': 1,
         'non-earthly': 1,
         'carries': 252,
         'fulfillment': 319,
         'age,': 284,
         'regard,': 100,
         'changes': 626,
         'terrible': 165,
         'Tbc': 1,
         'Salvation': 189,
         '26': 644,
         '1549,': 28,
         'attitudes': 348,
         'structures': 387,
         'discuss': 481,
         'Lutheranism': 26,
         'asked,': 88,
         'sint': 26,
         'representative': 342,
         'monks,': 113,
         'well': 6427,
         'it,': 2323,
         '"that': 153,
         'karma': 13,
         'theology.': 569,
         'case,': 855,
         'undemonstrable':

# Generating most frequent ngrams for preprocessing

In [6]:
#data_unigrams = merge_data_from_ids(ids, unigramCount_dict)
#data_bigrams = merge_data_from_ids(ids, bigramCount_dict)
# Merge the trigram counts of the sampled ids into one Counter.
# NOTE(review): the cell that loads `trigramCount_dict` is commented out earlier in this
# notebook, so on a fresh kernel this line raises NameError — confirm and restore the load.
data_trigrams = merge_data_from_ids(ids, trigramCount_dict)

In [7]:
# Complement of the sample: every key of the trigram dict that was not drawn into `ids`.
# Membership is checked against a set, so each lookup is O(1) instead of a linear scan
# over the `ids` list. The dict's key order is preserved in `ids2`.
sampled_ids = set(ids)
ids2 = [k for k in trigramCount_dict if k not in sampled_ids]
len(ids2)

9103

In [9]:
data_trigrams2 = merge_data_from_ids(ids2, trigramCount_dict)

In [17]:
del trigramCount_dict

# Most frequent unigrams to nlp docs

In [121]:
# Take the English stop-word collection from the already-loaded spaCy model's language defaults
# (no model is loaded here; `nlp_spacy` comes from an earlier cell).
stop_words = nlp_spacy.Defaults.stop_words

In [120]:
# Inspect what the raw unigrams look like, most frequent first.
# With no argument, Counter.most_common() returns every (ngram, count) pair
# ordered by descending count.
data_unigrams_sorted = data_unigrams.most_common()
data_unigrams_sorted[:100]

[('the', 685375),
 ('of', 469302),
 ('and', 301990),
 ('to', 255776),
 ('in', 236293),
 ('a', 162034),
 ('is', 130518),
 ('that', 128544),
 ('as', 87930),
 ('for', 71446),
 ('The', 65300),
 ('with', 61132),
 ('his', 59222),
 ('by', 56837),
 ('was', 56632),
 ('not', 56560),
 ('be', 51834),
 ('on', 51290),
 ('this', 48676),
 ('he', 46988),
 ('it', 46724),
 ('from', 45904),
 ('are', 39980),
 ('which', 38964),
 ('or', 34802),
 ('have', 32562),
 ('an', 31941),
 ('but', 30530),
 ('their', 30193),
 ('who', 30147),
 ('at', 28609),
 ('they', 26319),
 ('I', 25802),
 ('one', 24324),
 ('In', 24115),
 ('were', 24076),
 ('.', 23377),
 ('has', 23247),
 ('had', 22137),
 ('we', 21984),
 ('all', 21824),
 ('God', 20702),
 ('also', 19900),
 ('its', 18719),
 ('more', 17119),
 ('will', 16627),
 ('been', 16608),
 ('Christian', 16249),
 ('would', 15640),
 ('Paul', 15371),
 ('This', 15191),
 ('these', 15057),
 ('what', 14940),
 ('other', 14579),
 ('only', 14382),
 ('into', 13888),
 ('He', 13273),
 ('so', 13237

In [11]:
# Count the entries among the 100 most frequent unigrams that are not stop words
# (compared on the lower-cased token).
sum(1 for token, _count in data_unigrams_sorted[:100] if token.lower() not in stop_words)

9

In [12]:
# Number of distinct unigrams that occur more than 10 times.
sum(1 for _token, freq in data_unigrams_sorted if freq > 10)

158342

In [14]:
%%time
# Parse every unigram seen more than 10 times into a spaCy "Doc" and keep it with its count.
# (use carefully, takes some time — about 8 minutes in the recorded run)
# The model is loaded above as `nlp_spacy`. No earlier cell defines `nlp`, so the previous
# reference to `nlp` raised NameError on a fresh kernel.
data_unigrams_sorted_nlp = [
    (ngram, {"doc": nlp_spacy(ngram), "count": freq})
    for ngram, freq in data_unigrams_sorted
    if freq > 10
]

CPU times: user 7min 42s, sys: 8.41 s, total: 7min 50s
Wall time: 8min 3s


In [15]:
data_unigrams_sorted_nlp[:5]

[('the', {'doc': the, 'count': 3333595}),
 ('of', {'doc': of, 'count': 2261372}),
 ('and', {'doc': and, 'count': 1459413}),
 ('to', {'doc': to, 'count': 1228717}),
 ('in', {'doc': in, 'count': 1151878})]

In [16]:
data_unigrams_nlp_dict = dict(data_unigrams_sorted_nlp)

In [17]:
# save for future usage
# NOTE(review): every value in this dict holds a spaCy Doc. Pickling many Docs this way
# can be large and slow; spaCy's DocBin may be a better fit — confirm before changing
# the on-disk format, since later notebooks may read this pickle.
with open("../data/large_files/data_unigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_unigrams_nlp_dict, f)

In [18]:
# Drop the intermediate objects to free memory (RAM); the result was pickled in the previous cell.
del data_unigrams
del data_unigrams_sorted
del data_unigrams_sorted_nlp

# Trigrams to nlp docs

In [11]:
# Order the sampled trigrams by descending count, then count those seen more than 10 times.
data_trigrams_sorted = data_trigrams.most_common()
sum(1 for _ngram, freq in data_trigrams_sorted if freq > 10)

458444

In [26]:
%%time
# Parse every trigram seen more than 10 times into a spaCy Doc and keep it with its count
# (slow: about 23 minutes in the recorded run).
# Uses `nlp_spacy`, the model loaded at the top of the notebook. No earlier cell defines
# `nlp`, so the previous reference to `nlp` raised NameError on a fresh kernel.
data_trigrams_sorted_nlp = [
    (ngram, {"doc": nlp_spacy(ngram), "count": freq})
    for ngram, freq in data_trigrams_sorted
    if freq > 10
]

CPU times: user 21min 37s, sys: 33.8 s, total: 22min 11s
Wall time: 23min 14s


In [16]:
del data_trigrams_sorted

In [27]:
data_trigrams_nlp_dict = dict(data_trigrams_sorted_nlp)

In [28]:
with open("../data/large_files/data_trigrams_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams_nlp_dict, f)

# Second round of merged trigrams nlp preprocessing

In [12]:
len(data_trigrams2)

46627339

In [42]:
data_trigrams2 = dict(data_trigrams2)

In [13]:
# Trigrams from the first (sampled) merge that occur more than 10 times.
keys1 = [ngram for ngram, freq in data_trigrams.items() if freq > 10]
len(keys1)

458444

In [22]:
del data_trigrams

In [14]:
# Trigrams from the second (non-sampled) merge that occur more than 10 times.
keys2 = [ngram for ngram, freq in data_trigrams2.items() if freq > 10]
len(keys2)

774574

In [15]:
# Frequent trigrams that are new in the second round: present in keys2 but not in keys1.
# (A ^ B) & A is exactly the set difference A - B. Writing it directly avoids building
# set(keys2) twice and the intermediate symmetric-difference set.
# The order of the resulting list is arbitrary, as it was before.
keys2_filtered = list(set(keys2) - set(keys1))
len(keys2_filtered)

342693

In [31]:
# Unbind the pipeline object named `nlp`.
# NOTE(review): no visible cell above assigns `nlp` (the model is loaded as `nlp_spacy`),
# so this presumably relies on kernel state from an earlier session — confirm which
# name should be deleted here.
del nlp

In [32]:
print("hello")

hello


In [39]:
# Checkpoint the second-round keys and the merged trigram counts to disk.
# Context managers make sure each file is flushed and closed even if pickling fails;
# the previous `pickle.dump(obj, open(...))` form never closed the handles explicitly.
with open("../data/large_files/keys2_filtered.pickle", "wb") as f:
    pickle.dump(keys2_filtered, f)
with open("../data/large_files/data_trigrams2.pickle", "wb") as f:
    pickle.dump(data_trigrams2, f)

In [40]:
len(data_trigrams2)

46627339

In [43]:
%%time
# Pair every second-round key with its count, printing the index every 5000 keys.
# enumerate() replaces the hand-maintained counter, and `n % 5000 == 0` replaces the
# `n in range(...)` test that rebuilt a range object on every iteration. The printed
# progress values are the same.
data_trigrams2_filtered = []
for n, key in enumerate(keys2_filtered):
    if n % 5000 == 0:
        print(n)
    data_trigrams2_filtered.append((key, data_trigrams2[key]))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
CPU times: user 285 ms, sys: 286 ms, total: 572 ms
Wall time: 839 ms


In [44]:
len(data_trigrams2_filtered)

342693

In [46]:
# Persist the filtered (trigram, count) pairs. The context manager closes the file
# deterministically instead of leaving the handle to the garbage collector.
with open("../data/large_files/data_trigrams2_filtered.pickle", "wb") as f:
    pickle.dump(data_trigrams2_filtered, f)

In [47]:
nlp = spacy.load('en_core_web_lg')

In [48]:
%%time
# Parse each second-round trigram into a spaCy Doc and keep it with its count,
# printing the index every 5000 items (about 15 minutes in the recorded run).
# enumerate() replaces the hand-maintained counter, and `n % 5000 == 0` replaces the
# `n in range(...)` test that rebuilt a range object on every iteration.
data_trigrams2_nlp_tups = []
for n, (ngram, freq) in enumerate(data_trigrams2_filtered):
    if n % 5000 == 0:
        print(n)
    data_trigrams2_nlp_tups.append((ngram, {"doc": nlp(ngram), "count": freq}))

0
5000
10000
15000
20000
25000
30000
35000
40000
45000
50000
55000
60000
65000
70000
75000
80000
85000
90000
95000
100000
105000
110000
115000
120000
125000
130000
135000
140000
145000
150000
155000
160000
165000
170000
175000
180000
185000
190000
195000
200000
205000
210000
215000
220000
225000
230000
235000
240000
245000
250000
255000
260000
265000
270000
275000
280000
285000
290000
295000
300000
305000
310000
315000
320000
325000
330000
335000
340000
CPU times: user 14min 25s, sys: 14.3 s, total: 14min 39s
Wall time: 14min 44s


In [49]:
# Convert the (trigram, payload) tuples to a dict and persist it.
# The dict is built before the file is opened, so a failure during conversion does not
# leave an empty file behind. The context manager closes the handle deterministically.
data_trigrams2_nlp_dict = dict(data_trigrams2_nlp_tups)
with open("../data/large_files/data_trigrams2_nlp_dict.pickle", "wb") as f:
    pickle.dump(data_trigrams2_nlp_dict, f)