In [25]:
import pandas as pd
import pickle
import random
pd.set_option("display.max_columns", None)
from collections import Counter
import spacy
import re
from concurrent.futures import ThreadPoolExecutor


In [8]:
# spaCy pipeline shared by the whole notebook; ngram_to_lemmata calls it for
# n-grams that have no usable pre-processed document.
nlp = spacy.load('en_core_web_lg')

In [36]:
# Part-of-speech tags to keep: doc_to_lemmata compares each token's `pos_`
# against this list and drops every token whose tag is not in it.
postags = ["PROPN", "NOUN", "VERB", "ADJ"]

def clean_token(token):
    """Strip non-word characters from ``token`` and normalise its casing.

    Every character that is not a word character (letters, digits,
    underscore) is removed. The first remaining character keeps its case;
    all following characters are lowercased.

    Args:
        token: The string to clean.

    Returns:
        The cleaned string. An empty string is returned when nothing is left
        after stripping (for example for "--" or ""), instead of failing on
        the missing first character.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    cleaned = re.sub(r"\W", "", token)
    # Slicing (rather than indexing) keeps this safe for an empty result.
    return cleaned[:1] + cleaned[1:].lower()

def doc_to_lemmata(doc, postags):
    """Return the cleaned lemmata of ``doc`` as one sorted, space-joined string.

    Args:
        doc: Iterable of tokens exposing ``lemma_`` and ``pos_`` attributes.
        postags: Collection of part-of-speech tags; tokens whose ``pos_`` is
            not in it are ignored.

    Returns:
        The cleaned lemmata of the retained tokens, sorted and joined by a
        single space. Empty string when no token qualifies.
    """
    lemmata = (t.lemma_ for t in doc if t.pos_ in postags)
    # Keep only lemmata with at least one word character. clean_token strips
    # all non-word characters, so anything else (empty or punctuation-only)
    # would leave nothing to keep and must not reach the join.
    cleaned = [clean_token(lemma) for lemma in lemmata if re.search(r"\w", lemma)]
    return " ".join(sorted(cleaned))

def ngram_to_lemmata(ngram, ngram_nlp_dict):
    """Return the sorted lemmata string for ``ngram`` (best effort).

    The document stored under ``ngram_nlp_dict[ngram]["doc"]`` is used when
    available. If that lookup or its conversion fails, the n-gram is run
    through the notebook-level ``nlp`` pipeline instead.

    Args:
        ngram: The n-gram string to convert.
        ngram_nlp_dict: Mapping from n-gram string to a dict holding a
            pre-processed document under the key "doc".

    Returns:
        The string produced by ``doc_to_lemmata`` with the notebook-level
        ``postags``, or "" when both attempts fail.
    """
    try:
        # Fast path: reuse the pre-processed document.
        return doc_to_lemmata(ngram_nlp_dict[ngram]["doc"], postags)
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt
        # and SystemExit still stop a long-running cell.
        pass
    try:
        # Slow path: process the n-gram now.
        return doc_to_lemmata(nlp(ngram), postags)
    except Exception:
        # Deliberate best effort: an unprocessable n-gram maps to "".
        return ""

def article_data_to_lemmata(ngrams_dict, ngram_nlp_dict):
    """Re-key one article's n-gram counts by their lemmata string.

    Each n-gram in ``ngrams_dict`` is converted with ``ngram_to_lemmata``;
    counts of n-grams that map to the same lemmata string are summed.
    Keys are not filtered, so the empty string and single-word keys are
    counted as well.

    Args:
        ngrams_dict: Mapping from n-gram string to its count.
        ngram_nlp_dict: Lookup passed through to ``ngram_to_lemmata``.

    Returns:
        A ``Counter`` mapping lemmata string to summed count.
    """
    totals = Counter()
    for ngram, freq in ngrams_dict.items():
        lemmata_key = ngram_to_lemmata(ngram, ngram_nlp_dict)
        totals.update({lemmata_key: freq})
    return totals

In [5]:
ngram_type = "unigram"

# NOTE(security): pickle.load can execute arbitrary code while loading; only
# use these files if they come from a trusted source.
# `with` closes each file handle as soon as its pickle has been read.

# Per-article n-gram counts: article id -> {n-gram: count}.
with open("../data/large_files/{0}Count_dict.pickle".format(ngram_type), "rb") as f:
    ngramCount_dict = pickle.load(f)

# Pre-processed n-grams: n-gram -> dict with the parsed document under "doc".
with open("../data/large_files/data_{0}s_nlp_dict.pickle".format(ngram_type), "rb") as f:
    ngram_nlp_dict = pickle.load(f)

In [38]:
%%time
test_ngrams_dict = (list(ngramCount_dict.items())[7000][1])
test_output = article_data_to_lemmata(test_ngrams_dict, ngram_nlp_dict)
sorted(test_output.items(), key=lambda pair: pair[1], reverse=True)[:100]

CPU times: user 520 ms, sys: 7.57 ms, total: 527 ms
Wall time: 551 ms


[('', 1460),
 ('have', 32),
 ('water', 26),
 ('cold', 21),
 ('hot', 20),
 ('city', 11),
 ('Paul', 10),
 ('church', 9),
 ('interpretation', 9),
 ('be', 9),
 ('word', 9),
 ('use', 9),
 ('lukewarm', 8),
 ('local', 8),
 ('Laodicea', 8),
 ('apply', 7),
 ('man', 7),
 ('mean', 7),
 ('spring', 7),
 ('allusion', 7),
 ('other', 6),
 ('certainty', 6),
 ('Christ', 6),
 ('Ramsay', 5),
 ('zesto', 5),
 ('mile', 5),
 ('letter', 5),
 ('fact', 5),
 ('see', 5),
 ('laodicean', 5),
 ('source', 5),
 ('person', 5),
 ('example', 5),
 ('do', 4),
 ('most', 4),
 ('christian', 4),
 ('passage', 4),
 ('supply', 4),
 ('heal', 4),
 ('M', 4),
 ('mineral', 4),
 ('certain', 4),
 ('usage', 4),
 ('purpose', 4),
 ('great', 4),
 ('circumstance', 3),
 ('know', 3),
 ('natural', 3),
 ('Ma', 3),
 ('course', 3),
 ('lukewarmness', 3),
 ('call', 3),
 ('hierapoli', 3),
 ('such', 3),
 ('spiritual', 3),
 ('stone', 3),
 ('sense', 3),
 ('normal', 3),
 ('form', 3),
 ('become', 3),
 ('text', 3),
 ('difficult', 3),
 ('common', 3),
 ('take

In [None]:
# comparing iteration vs comprehension

In [23]:
%%time
n = 100
# small test...
cleanedNgrams_tups = [(k, article_data_to_lemmata(v, ngram_nlp_dict)) for k, v in list(ngramCount_dict.items())[:n]]

CPU times: user 2min 14s, sys: 2.95 s, total: 2min 17s
Wall time: 2min 24s


In [33]:
%%time
# small test...
cleanedNgrams_tups = [(k, article_data_to_lemmata(v, ngram_nlp_dict)) for k, v in list(ngramCount_dict.items())[:100]]

CPU times: user 2min 9s, sys: 1.12 s, total: 2min 10s
Wall time: 2min 13s


In [40]:
# using parallel computing...

In [27]:
def data_from_article_id(article_id):
    """Build the ``(article_id, lemmata counts)`` pair for one article.

    Reads the notebook-level ``ngramCount_dict`` and ``ngram_nlp_dict``; the
    single-argument signature makes it usable with ``pool.map``.
    """
    ngram_counts = ngramCount_dict[article_id]
    return article_id, article_data_to_lemmata(ngram_counts, ngram_nlp_dict)

In [26]:
# Article ids are the keys of the per-article count dictionary; show the first one.
article_ids = list(ngramCount_dict)
article_ids[0]

'ark://27927/phx66812gq6'

In [39]:
# Number of article ids to process.
len(article_ids)

14103

In [34]:
%%time
step=20
cleanedNgrams_tups = []
for num in range(0, 100, step):
    actual_ids = article_ids[num:num+step]
    with ThreadPoolExecutor(max_workers=step*1.5) as pool:
        currently_parsed = list(pool.map(data_from_article_id,actual_ids))
    cleanedNgrams_tups.extend(currently_parsed)

CPU times: user 2min 28s, sys: 1min, total: 3min 28s
Wall time: 1min 43s


# Main application: unigrams

In [None]:
%%time

ngram_type = "unigram"
ngramCount_dict = pickle.load(open("../data/large_files/{0}Count_dict.pickle".format(ngram_type), "rb"))
ngram_nlp_dict = pickle.load(open("../data/large_files/data_{0}s_nlp_dict.pickle".format(ngram_type), "rb"))

step=50
cleanedNgrams_tups = []
for num in range(0, len(article_ids), step):
    actual_ids = article_ids[num:num+step]
    with ThreadPoolExecutor(max_workers=step*1.5) as pool:
        currently_parsed = list(pool.map(data_from_article_id,actual_ids))
    cleanedNgrams_tups.extend(currently_parsed)

In [None]:
# Persist the results for the current ngram_type: the (article id, counts)
# tuples are turned into a dict keyed by article id before pickling.
with open("../data/large_files/{0}Count_cleaned_dict.pickle".format(ngram_type), "wb") as f:
    pickle.dump(dict(cleanedNgrams_tups), f)

# Main application: trigrams

In [None]:
%%time

ngram_type = "trigram"
ngramCount_dict = pickle.load(open("../data/large_files/{0}Count_dict.pickle".format(ngram_type), "rb"))
ngram_nlp_dict = pickle.load(open("../data/large_files/data_{0}s_nlp_dict.pickle".format(ngram_type), "rb"))

step=50
cleanedNgrams_tups = []
for num in range(0, len(article_ids), step):
    if num in range(0, len(article_ids), 1000):
        print(num)
    actual_ids = article_ids[num:num+step]
    with ThreadPoolExecutor(max_workers=step*1.5) as pool:
        currently_parsed = list(pool.map(data_from_article_id,actual_ids))
    cleanedNgrams_tups.extend(currently_parsed)

In [None]:
# Persist the results for the current ngram_type: the (article id, counts)
# tuples are turned into a dict keyed by article id before pickling.
with open("../data/large_files/{0}Count_cleaned_dict.pickle".format(ngram_type), "wb") as f:
    pickle.dump(dict(cleanedNgrams_tups), f)

# Main application: bigrams

In [None]:
%%time

ngram_type = "bigram"
ngramCount_dict = pickle.load(open("../data/large_files/{0}Count_dict.pickle".format(ngram_type), "rb"))
ngram_nlp_dict = pickle.load(open("../data/large_files/data_{0}s_nlp_dict.pickle".format(ngram_type), "rb"))

step=50
cleanedNgrams_tups = []
for num in range(0, len(article_ids), step):
    actual_ids = article_ids[num:num+step]
    with ThreadPoolExecutor(max_workers=step*1.5) as pool:
        currently_parsed = list(pool.map(data_from_article_id,actual_ids))
    cleanedNgrams_tups.extend(currently_parsed)

In [None]:
# Persist the results for the current ngram_type: the (article id, counts)
# tuples are turned into a dict keyed by article id before pickling.
with open("../data/large_files/{0}Count_cleaned_dict.pickle".format(ngram_type), "wb") as f:
    pickle.dump(dict(cleanedNgrams_tups), f)